Commit c86b80a9, authored by Eray Özkural

* implement first version of search API

  - we can add, remove docs and query docs, that's all we need
  - supports exact search, relies on smart preprocessing to be added later
  - simple tokenization
* add a test for basic search function
parent 33d977b7
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Project SYSTEM "Project-3.8.dtd">
<!-- Project file for project pisi -->
<!-- Saved: 2005-11-19, 21:48:29 -->
<!-- Saved: 2005-12-06, 17:06:16 -->
<!-- Copyright (C) 2005 PiSi Development Team, -->
<Project version="3.8">
<ProgLanguage mixed="0">Python</ProgLanguage>
......@@ -438,6 +438,26 @@
<Dir>pisi</Dir>
<Name>file.py</Name>
</Source>
<Source>
<Dir>pisi</Dir>
<Dir>search</Dir>
<Name>invertedindex.py</Name>
</Source>
<Source>
<Dir>pisi</Dir>
<Dir>search</Dir>
<Name>tokenize.py</Name>
</Source>
<Source>
<Dir>pisi</Dir>
<Dir>search</Dir>
<Name>__init__.py</Name>
</Source>
<Source>
<Dir>pisi</Dir>
<Dir>search</Dir>
<Name>preprocess.py</Name>
</Source>
</Sources>
<Forms>
</Forms>
......@@ -480,9 +500,9 @@
</Vcs>
<FiletypeAssociations>
<FiletypeAssociation pattern="*.ui.h" type="FORMS" />
<FiletypeAssociation pattern="*.ui" type="FORMS" />
<FiletypeAssociation pattern="*.ptl" type="SOURCES" />
<FiletypeAssociation pattern="*.idl" type="INTERFACES" />
<FiletypeAssociation pattern="*.ui" type="FORMS" />
<FiletypeAssociation pattern="*.py" type="SOURCES" />
<FiletypeAssociation pattern="*.ptl" type="SOURCES" />
</FiletypeAssociations>
</Project>
......@@ -16,6 +16,7 @@ import bsddb.dbshelve as shelve
import bsddb.db as db
import os
import fcntl
import types
import gettext
__trans = gettext.translation('pisi', fallback=True)
......@@ -77,7 +78,6 @@ class LockedDBShelf(shelve.DBShelf):
except IOError:
raise Error(_("Another instance of PISI is running. Try later!"))
def close(self):
if self.closed:
return
......@@ -89,3 +89,13 @@ class LockedDBShelf(shelve.DBShelf):
def unlock(self):
    """Release the lock: close the lock file, then delete '<filename>.lock'."""
    self.lockfile.close()
    lock_path = self.filename + '.lock'
    os.unlink(lock_path)
@staticmethod
def encodekey(key):
    '''Utility method for dbs that must store unicodes in keys.

    A unicode key is returned encoded as UTF-8; a byte string is returned
    unchanged. Any other type raises Error.
    '''
    # isinstance instead of an exact type comparison, so that subclasses
    # of the two string types are accepted as well.
    if isinstance(key, types.UnicodeType):
        return key.encode('utf-8')
    if isinstance(key, types.StringType):
        return key
    raise Error('Key must be either string or unicode')
......@@ -13,6 +13,7 @@
# Author: Eray Ozkural <eray@uludag.org.tr>
import pisi
import pisi.context as ctx
class Error(pisi.Error):
    """Error type of the search package; derives from pisi.Error."""
......@@ -20,13 +21,16 @@ class Error(pisi.Error):
class Exception(pisi.Exception):
    """Exception type of the search package; derives from pisi.Exception."""
# API
# API
from invertedindex import InvertedIndex
from preprocess import preprocess
def init(ids, langs):
"initialize databases"
import pisi.context as ctx
assert type(ids)==type([])
assert type(langs)==type([])
ctx.invidx = {}
for id in ids:
......@@ -44,10 +48,11 @@ def finalize():
ctx.invidx = {}
def add_doc(id, lang, docid, str):
    """Index the text str under document id docid.

    The text is turned into terms by preprocess() for language lang and
    the terms are added to the inverted index ctx.invidx[id][lang].
    """
    tokens = preprocess(lang, str)
    index = ctx.invidx[id][lang]
    index.add_doc(docid, tokens)
def remove_doc(id, lang, docid, str):
    """Remove document docid from the inverted index ctx.invidx[id][lang].

    str is the text the document was indexed with; it is preprocessed
    for language lang to recover the terms the document is filed under.
    """
    # InvertedIndex.remove_doc(doc, terms) needs the terms to know which
    # posting sets to update; passing only docid raises TypeError.
    terms = preprocess(lang, str)
    ctx.invidx[id][lang].remove_doc(docid, terms)
def query_terms(id, lang, terms):
pass
def query(id, lang, terms):
    """Run terms against the inverted index ctx.invidx[id][lang].

    Returns whatever that index's query() method returns.
    """
    index = ctx.invidx[id][lang]
    return index.query(terms)
......@@ -11,6 +11,11 @@
#
# Author: Eray Ozkural <eray@uludag.org.tr>
import types
import pisi.lockeddbshelve as shelve
class InvertedIndex(object):
"""a database of term -> set of documents"""
......@@ -21,14 +26,19 @@ class InvertedIndex(object):
self.d.close()
def has_term(self, term):
    """Return whether term has an entry in the underlying database.

    The term is converted with LockedDBShelf.encodekey, so unicode terms
    are looked up under their UTF-8 encoding.
    """
    # The former str(term) lookup ran first and made this unreachable;
    # it also raised UnicodeEncodeError on non-ASCII unicode terms.
    return self.d.has_key(shelve.LockedDBShelf.encodekey(term))
def get_term(self, term):
    """get set of doc ids given term

    The term is converted with LockedDBShelf.encodekey before the lookup.
    A term that is not in the database yet is stored with an empty set,
    so looking a term up may add an entry as a side effect.
    """
    # No str(term) here: it fails on non-ASCII unicode before encodekey
    # gets the chance to encode the term as UTF-8.
    term = shelve.LockedDBShelf.encodekey(term)
    if not self.has_term(term):
        self.d[term] = set()
    return self.d[term]
def query(self, terms):
    """Return the set of doc ids filed under at least one of terms.

    The result is the union of the per-term sets from get_term(). An
    empty terms sequence yields an empty set.
    """
    # Accumulate into a fresh set: this gives a defined result for empty
    # input (reduce() without an initial value raises TypeError there).
    matches = set()
    for term in terms:
        matches.update(self.get_term(term))
    return matches
def list_terms(self):
list = []
for term in self.d.iterkeys():
......@@ -37,12 +47,14 @@ class InvertedIndex(object):
def add_doc(self, doc, terms):
    """File doc under every term in terms.

    Each term is converted with LockedDBShelf.encodekey, its set of doc
    ids is fetched with get_term(), doc is added and the set is stored
    back under the encoded term.
    """
    for raw_term in terms:
        key = shelve.LockedDBShelf.encodekey(raw_term)
        postings = self.get_term(key)
        postings.add(doc)
        # store the modified set again so the database sees the change
        self.d[key] = postings
def remove_doc(self, doc, terms):
    """Remove doc from the doc id set of every term in terms.

    Each term is converted with LockedDBShelf.encodekey, its set is
    fetched with get_term(), doc is dropped from it and the set is
    stored back under the encoded term.
    """
    for raw_term in terms:
        key = shelve.LockedDBShelf.encodekey(raw_term)
        postings = self.get_term(key)
        # discard, not remove: terms may hold the same term more than
        # once, and the second pass would otherwise raise KeyError and
        # leave the remaining terms untouched.
        postings.discard(doc)
        self.d[key] = postings  # store the modified set again
# -*- coding: utf-8 -*-
#
# Copyright (C) 2005, TUBITAK/UEKAE
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.
#
def preprocess(str, lang):
if
import tokenize
def preprocess(lang, str):
    """Turn the text str into terms for language lang.

    Currently this only delegates to tokenize.tokenize().
    """
    terms = tokenize.tokenize(lang, str)
    return terms
......@@ -11,7 +11,21 @@
#
# Author: Eray Ozkural <eray@uludag.org.tr>
#
# rev 1: very little tokenization, for testing
# rev 1: very simple tokenization, for testing
def tokenize(lang, string):
pass
import string
def tokenize(lang, str):
    """Split str into a list of unicode tokens.

    Tokens are the maximal runs of characters that are not in
    string.whitespace. Input that is not unicode is converted with the
    unicode constructor first. lang is not used by this implementation.
    """
    # The parameter shadows the builtin str, so take the unicode type
    # from a literal instead of naming it.
    text_type = type(u'')
    if not isinstance(str, text_type):
        str = text_type(str)
    tokens = []
    current = []  # characters of the token being collected
    for ch in str:
        if ch in string.whitespace:
            if current:
                tokens.append(u''.join(current))
                current = []
        else:
            current.append(ch)
    # Flush the last token: text that does not end in whitespace would
    # otherwise lose its final word.
    if current:
        tokens.append(u''.join(current))
    return tokens
# -*- coding: utf-8 -*-
#
# Copyright (C) 2005, TUBITAK/UEKAE
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.
#
import unittest
import os
import pisi.search
import testcase
class SearchTestCase(testcase.TestCase):
    """Basic test of the pisi.search API: index three documents, query them."""

    def setUp(self):
        testcase.TestCase.setUp(self, database = False)

    def testSearch(self):
        doc1 = "A set object is an unordered collection of immutable values."
        doc2 = "Being an unordered collection, sets do not record element position or order of insertion."
        doc3 = "There are currently two builtin set types, set and frozenset"
        pisi.search.init(['test'], ['en'])
        # finalize() must run even when an assertion fails, otherwise the
        # search databases opened by init() are left open.
        try:
            pisi.search.add_doc('test', 'en', 1, doc1)
            pisi.search.add_doc('test', 'en', 2, doc2)
            pisi.search.add_doc('test', 'en', 3, doc3)
            # 'set' appears as a whole token in doc1 and doc3 only
            q1 = pisi.search.query('test', 'en', ['set'])
            self.assertEqual(q1, set([1,3]))
            # a multi-term query matches documents containing any term
            q2 = pisi.search.query('test', 'en', ['an', 'collection'])
            self.assertEqual(q2, set([1,2]))
        finally:
            pisi.search.finalize()
suite = unittest.makeSuite(SearchTestCase)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment