Kaydet (Commit) 4ed2f0a3 authored tarafından Eray Özkural's avatar Eray Özkural

* implement search command

 - just enough improvements in pisi.search to make it work :)
* rearrange pisi TODO list
üst 8bfd9145
......@@ -74,6 +74,7 @@ Legend:
3. Beta
+ build cmd: install build deps
+ improve url support, it's a little dumb at the moment
+ devise a File class that works with URLs
+ add URL support to autoxml
......@@ -169,26 +170,21 @@ Legend:
4. Release
* high priority
- transaction stuff for database (eray)
* medium priority
/ high-level build commands
- install build dependencies
- component improvements (optional)
? if comp a has comp b, then it must have all packages in b
- maintain a list of components each component has
- maintain a list of components each component has
* low priority
- search command: (eray)
- implement an inverted index (if there's a good one, just use it)
- search in either summary, or description
- english stemming
- turkish stemming
- phonetic/syntactic fault tolerance (whichever works!)
- in particular evaluate soundex and metaphone
- research if there are language independent phonetic encodings
- implement as an option (or make it possible to turn it off if it works well :))
- fault tolerant search in package names (somehow)
+ search command: (eray)
+ implement an inverted index
+ search in summary and description
/ overhaul installdb (try to merge its use with packagedb) (eray)
/ verify methods
+ SpecFile
......@@ -203,7 +199,10 @@ Legend:
5. Post Release
- high-level build commands
- build-all (or build-distro) command
- build command enhancements:
- aware of repo sources
- full dependency analysis like install/upgrade etc.
- build-all (or build-distro) command, or as a separate tool
- an emerge command, just for fun
- more support for categories:
- put categories into a database
......@@ -213,10 +212,22 @@ Legend:
- sourceb
- improve url support: consider adding "other" protocols
- removable media: media://
- transaction stuff for database (eray)
- multi-architecture support (baris, caglar)
/ design decisions
/ extend XML specs to support that
- cross-platform building support
- incremental build/fetch for repository index (pisi-index.xml)
- diffsets (caglar)
- a command to check if repos are consistent wrt dep relations
* low priority
- search command: (eray)
- search components
- english stemming
- turkish stemming
- phonetic/syntactic fault tolerance (whichever works!)
- in particular evaluate soundex and metaphone
- research if there are language independent phonetic encodings
- implement as an option (or make it possible to turn it off if it works well :))
- fault tolerant search in package names (somehow)
......@@ -46,6 +46,7 @@ from pisi.build import build, build_until
from pisi.atomicoperations import resurrect_package
from pisi.metadata import MetaData
from pisi.files import Files
import pisi.search
class Error(pisi.Error):
    """Base exception for errors raised by the PISI API module."""
    pass
......@@ -75,6 +76,7 @@ def init(database = True, options = None, ui = None, comar = True):
ctx.componentdb = pisi.component.ComponentDB()
packagedb.init_db()
pisi.sourcedb.init()
pisi.search.init(['summary', 'description'], ['en', 'tr'])
else:
ctx.repodb = None
ctx.installdb = None
......@@ -93,6 +95,7 @@ def finalize():
ctx.componentdb.close()
packagedb.finalize_db()
pisi.sourcedb.finalize()
pisi.search.finalize()
ctx.ui.debug('PISI API finalized')
ctx.ui.close()
ctx.initialized = False
......
......@@ -928,18 +928,56 @@ class ListPending(Command):
list = ctx.installdb.list_pending()
for p in list.keys():
print p
self.finalize()
class Search(Info):
    """Search packages

Usage: search <term1> <term2> ... <termn>

Finds a package in repository containing specified search terms
"""

    # autocommand registers this class with the command-line dispatcher
    __metaclass__ = autocommand

    def __init__(self):
        super(Search, self).__init__()

    # (command name, short alias) used by the command dispatcher
    name = ("search", "s")

    def options(self):
        """Add search-specific options on top of the inherited Info options."""
        super(Search, self).options()
        self.parser.add_option("-l", "--language", action="store",
                               help=_("set search language"))

    def get_lang(self):
        """Return the search language code.

        Priority: the -l/--language option, then the locale detected by
        LocalText.get_lang(); anything outside the indexed languages
        ('en', 'tr') falls back to 'en'.
        """
        lang = ctx.get_option('language')
        if not lang:
            lang = pisi.pxml.autoxml.LocalText.get_lang()
        if lang not in ['en', 'tr']:
            lang = 'en'
        return lang

    def search(self, id, terms):
        """Query the inverted index *id* ('summary' or 'description')."""
        lang = self.get_lang()
        return pisi.search.query(id, lang, terms)

    def run(self):
        """Search both indexes and print info for every matching package."""
        self.init(True)
        if not self.args:
            self.help()
            return

        r1 = self.search('summary', self.args)
        r2 = self.search('description', self.args)
        # union of the two result sets: a hit in either field counts
        r = r1.union(r2)
        for pkg in r:
            self.printinfo_package(pkg)

        self.finalize()
class SearchFile(Command):
"""Search for a file
......@@ -961,8 +999,8 @@ Finds the installed package which contains the specified file.
self.parser.add_option("-f", "--fuzzy", action="store_true",
default=False, help=_("fuzzy search"))
# and what does exact mean? -- exa
@staticmethod #fuck python 2.3 compatibility I don't care about it
# what does exact mean? -- exa
@staticmethod
def search_exact(path):
files = []
path = path.lstrip('/') #FIXME: this shouldn't be necessary :/
......
......@@ -80,6 +80,13 @@ class PackageDB(object):
self.dr[dep_name] = [ (name, dep) ]
# add component
ctx.componentdb.add_package(package_info.partOf, package_info.name)
# index summary and description
for (lang, doc) in package_info.summary.iteritems():
if lang in ['en', 'tr']:
pisi.search.add_doc('summary', lang, package_info.name, doc)
for (lang, doc) in package_info.description.iteritems():
if lang in ['en', 'tr']:
pisi.search.add_doc('description', lang, package_info.name, doc)
def clear(self):
    """Empty the in-memory package dictionary.

    NOTE(review): only self.d is cleared; the reverse-dependency map
    self.dr (populated on the add path) is left untouched — confirm
    this is intentional.
    """
    self.d.clear()
......
......@@ -106,6 +106,7 @@ class LocalText(dict):
newnode.appendChild(newtext)
node.appendChild(newnode)
#FIXME: maybe more appropriate for pisi.util
@staticmethod
def get_lang():
try:
......
......@@ -55,4 +55,6 @@ def remove_doc(id, lang, docid, str):
ctx.invidx[id][lang].remove_doc(docid)
def query(id, lang, terms):
    """Query the inverted index *id* in language *lang* with *terms*.

    Terms are case-normalized with preprocess.lower before the lookup.
    """
    import preprocess as p
    normalized = [p.lower(lang, term) for term in terms]
    return ctx.invidx[id][lang].query(normalized)
......@@ -12,5 +12,29 @@
import tokenize
def lowly_python(str):
    """Lowercase *str*, special-casing 'I' independently of the locale.

    Python's str.lower() is locale-blind, so 'I' is mapped explicitly
    rather than trusting the process locale.
    """
    def lowly_char(c):
        # BUG FIX: the original computed `lowly` but returned `c`
        # unchanged, so no lowercasing ever happened.
        if c == 'I':
            lowly = 'i'  # because of some fools we can't choose locale in lower
        else:
            lowly = c.lower()
        return lowly
    return "".join(lowly_char(c) for c in str)
def lower(lang, str):
    """Lowercase *str* for language *lang*.

    Turkish text goes through the special-cased lowly_python; every
    other language uses plain str.lower().
    """
    return lowly_python(str) if lang == 'tr' else str.lower()
def preprocess(lang, str):
    """Tokenize *str* for language *lang* and case-normalize each token.

    Returns the list of lowered tokens ready for indexing/querying.
    """
    # The stale `return tokenize.tokenize(lang, str)` line (diff residue
    # from the pre-commit version) made the normalization below
    # unreachable; it has been removed.
    terms = tokenize.tokenize(lang, str)
    # normalize case so the inverted index matches case-insensitively
    terms = [lower(lang, term) for term in terms]
    return terms
......@@ -25,6 +25,8 @@ def tokenize(lang, str):
if len(token) > 0:
tokens.append(token)
token = unicode()
elif x in string.punctuation:
pass # eat punctuation
else:
token += x
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment