index lookup function added

darcs-hash:20050827174813-7ad00-fe84d120801b63aaaf9f8482a66d1ed1181851bd.gz
author: Andreas Gohr <andi@splitbrain.org> 2005-08-27 19:48:13 +0200
committer: Andreas Gohr <andi@splitbrain.org> 2005-08-27 19:48:13 +0200
commit: 488dd6ce70042d607766283ed733c292391cd9ca (patch)
tree: 31b6c4fbc53a090dca64b557d99563cc63dc5e89
parent: 4d5714c43e9dc0f790ceb15942c6b3703cde785e (diff)
download: rpg-488dd6ce70042d607766283ed733c292391cd9ca.tar.gz
rpg-488dd6ce70042d607766283ed733c292391cd9ca.tar.bz2
1 files changed, 114 insertions, 1 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index 7ca870526..fd20a4747 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -92,7 +92,7 @@ function idx_getPageWords($page){
 function idx_addPage($page){
     global $conf;
 
-    // load known words and documents
+    // load known documents
     $page_idx = file($conf['cachedir'].'/page.idx');
 
     // get page id (this is the linenumber in page.idx)
@@ -185,4 +185,117 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
     fwrite($fh,"\n");
 }
 
+/**
+ * Lookup words in index
+ *
+ * Takes an array of words and returns a list of matching
+ * documents for each one.
+ *
+ * The returned array uses the same positions as the input array.
+ * Every entry carries the looked up 'word'. If the word is listed in
+ * word.idx the entry also carries its 'wordid' (the line number in
+ * word.idx) and 'pages' (the list built by idx_parseIndexLine() from
+ * the matching line of index.idx; empty if that line is missing).
+ *
+ * @param  array $words The words to look up
+ * @return array|false  The results as described above, false if one
+ *                      of the index files could not be read
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_lookup($words){
+    global $conf;
+
+    $result = array();
+
+    // load known words and documents
+    $page_idx = file($conf['cachedir'].'/page.idx');
+    $word_idx = file($conf['cachedir'].'/word.idx');
+    if($page_idx === false || $word_idx === false){
+        msg("Failed to open index files",-1);
+        return false;
+    }
+
+    // get word IDs
+    $wids = array();
+    $pos = 0;
+    foreach($words as $word){
+
+        //FIXME words should be cleaned here as in getPageWords
+
+        // strict search: a loose comparison would treat numeric looking
+        // words such as "1e3" and "1000" as the same word
+        $wid = array_search("$word\n",$word_idx,true);
+        if($wid !== false){
+            $wids[] = $wid;
+            $result[$pos]['wordid'] = $wid;
+        }
+        $result[$pos]['word'] = $word;
+        $pos++;
+    }
+
+    // each index line needs to be read only once, in ascending order
+    $wids = array_unique($wids);
+    sort($wids);
+
+    // Open index
+    $idx = fopen($conf['cachedir'].'/index.idx','r');
+    if(!$idx){
+       msg("Failed to open index files",-1);
+       return false;
+    }
+
+    // Walk the index until the lines are found
+    $docs = array();                          // hold docs found
+    $lno  = 0;
+    $line = '';
+    $srch = array_shift($wids);               // which word do we look for?
+    // $srch is null when there is nothing (more) to look up; it has to be
+    // compared strictly because word ID 0 is a valid line number
+    while ($srch !== null && !feof($idx)) {
+        // read full line
+        $chunk = fgets($idx, 4096);
+        if($chunk === false) break;           // nothing left to read
+        $line .= $chunk;
+
+        // keep reading until the line is complete; the last line of
+        // the file may legitimately end without a newline
+        if(substr($line,-1) != "\n" && !feof($idx)) continue;
+
+        // do we want this line?
+        if($lno === $srch){
+            // add docs to list
+            $docs[$srch] = idx_parseIndexLine($page_idx,$line);
+
+            $srch = array_shift($wids);       // next word to look up
+        }
+
+        $line = ''; // reset line buffer
+        $lno++;     // increase linecounter
+    }
+    fclose($idx);
+
+    // merge docs into results
+    $count = count($result);
+    for($i=0; $i<$count; $i++){
+        if(isset($result[$i]['wordid'])){
+            $wid = $result[$i]['wordid'];
+            // the index may be shorter than word.idx or the line empty
+            $result[$i]['pages'] = isset($docs[$wid]) ? $docs[$wid] : array();
+        }
+    }
+
+    return $result;
+}
+
+/**
+ * Returns a list of documents and counts from an index line
+ *
+ * The line is expected to consist of "pagenumber*count" parts
+ * separated by colons. Parts that are malformed, have a count
+ * of 0, point to an unknown page number or to a page that no
+ * longer exists are omitted.
+ *
+ * @param  array  $page_idx The list of known pages
+ * @param  string $line     A line from the main index
+ * @return array            Page names as keys, counts as values
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_parseIndexLine(&$page_idx,$line){
+    $result = array();
+
+    $line = trim($line);
+    if($line == '') return $result;
+
+    $parts = explode(':',$line);
+    foreach($parts as $part){
+        if($part == '') continue;
+
+        // skip parts that do not have the "page*count" form
+        $pair = explode('*',$part);
+        if(count($pair) < 2) continue;
+        $doc = $pair[0];
+        $cnt = $pair[1];
+
+        if(!$cnt) continue;
+
+        // skip page numbers that are not listed in page.idx
+        if(!isset($page_idx[$doc])) continue;
+        $doc = trim($page_idx[$doc]);
+        // compare against the empty string, a page may be named "0"
+        if($doc === '') continue;
+
+        // make sure the document still exists
+        if(!@file_exists(wikiFN($doc))) continue;
+
+        $result[$doc] = $cnt;
+    }
+    return $result;
+}
+
 //Setup VIM: ex: et ts=4 enc=utf-8 :
author	Andreas Gohr <andi@splitbrain.org>	2005-08-27 19:48:13 +0200
committer	Andreas Gohr <andi@splitbrain.org>	2005-08-27 19:48:13 +0200
commit	488dd6ce70042d607766283ed733c292391cd9ca (patch)
tree	31b6c4fbc53a090dca64b557d99563cc63dc5e89
parent	4d5714c43e9dc0f790ceb15942c6b3703cde785e (diff)
download	rpg-488dd6ce70042d607766283ed733c292391cd9ca.tar.gz rpg-488dd6ce70042d607766283ed733c292391cd9ca.tar.bz2