summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Gohr <andi@splitbrain.org>2005-08-27 19:48:13 +0200
committerAndreas Gohr <andi@splitbrain.org>2005-08-27 19:48:13 +0200
commit488dd6ce70042d607766283ed733c292391cd9ca (patch)
tree31b6c4fbc53a090dca64b557d99563cc63dc5e89
parent4d5714c43e9dc0f790ceb15942c6b3703cde785e (diff)
downloadrpg-488dd6ce70042d607766283ed733c292391cd9ca.tar.gz
rpg-488dd6ce70042d607766283ed733c292391cd9ca.tar.bz2
index lookup function added
darcs-hash:20050827174813-7ad00-fe84d120801b63aaaf9f8482a66d1ed1181851bd.gz
-rw-r--r--inc/indexer.php115
1 files changed, 114 insertions, 1 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index 7ca870526..fd20a4747 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -92,7 +92,7 @@ function idx_getPageWords($page){
function idx_addPage($page){
global $conf;
- // load known words and documents
+ // load known documents
$page_idx = file($conf['cachedir'].'/page.idx');
// get page id (this is the linenumber in page.idx)
@@ -185,4 +185,117 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
fwrite($fh,"\n");
}
+/**
+ * Lookup words in index
+ *
+ * Takes an array of word and will return a list of matching
+ * documents for each one.
+ *
+ * It returns an array using the same index as the input
+ * array. Returns false if something went wrong.
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_lookup($words){
+ global $conf;
+
+ $result = array();
+
+ // load known words and documents
+ $page_idx = file($conf['cachedir'].'/page.idx');
+ $word_idx = file($conf['cachedir'].'/word.idx');
+
+ // get word IDs
+ $wids = array();
+ $pos = 0;
+ foreach($words as $word){
+
+ //FIXME words should be cleaned here as in getPageWords
+
+ $wid = array_search("$word\n",$word_idx);
+ if(is_int($wid)){
+ $wids[] = $wid;
+ $result[$pos]['wordid'] = $wid;
+ }
+ $result[$pos]['word'] = $word;
+ $pos++;
+ }
+ sort($wids);
+
+
+ // Open index
+ $idx = fopen($conf['cachedir'].'/index.idx','r');
+ if(!$idx){
+ msg("Failed to open index files",-1);
+ return false;
+ }
+
+ // Walk the index til the lines are found
+ $docs = array(); // hold docs found
+ $lno = 0;
+ $line = '';
+ $srch = array_shift($wids); // which word do we look for?
+ while (!feof($idx)) {
+ // read full line
+ $line .= fgets($idx, 4096);
+ if(substr($line,-1) != "\n") continue;
+ if($lno > $srch) break; // shouldn't happen
+
+
+ // do we want this line?
+ if($lno == $srch){
+ // add docs to list
+ $docs[$srch] = idx_parseIndexLine($page_idx,$line);
+
+ $srch = array_shift($wids); // next word to look up
+ if($srch == null) break; // no more words
+ }
+
+ $line = ''; // reset line buffer
+ $lno++; // increase linecounter
+ }
+ fclose($idx);
+
+ // merge docs into results
+ $count = count($result);
+ for($i=0; $i<$count; $i++){
+ if(isset($result[$i]['wordid'])){
+ $result[$i]['pages'] = $docs[$result[$i]['wordid']];
+ }
+ }
+dbg($result);
+
+}
+
+/**
+ * Returns a list of documents and counts from a index line
+ *
+ * It omits docs with a count of 0 and pages that no longer
+ * exist.
+ *
+ * @param array $page_idx The list of known pages
+ * @param string $line A line from the main index
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_parseIndexLine(&$page_idx,$line){
+ $result = array();
+
+ $line = trim($line);
+ if($line == '') return;
+
+ $parts = explode(':',$line);
+ foreach($parts as $part){
+ if($part == '') continue;
+ list($doc,$cnt) = explode('*',$part);
+ if(!$cnt) continue;
+ $doc = trim($page_idx[$doc]);
+ if(!$doc) continue;
+ // make sure the document still exists
+ if(!@file_exists(wikiFN($doc))) continue;
+
+ $result[$doc] = $cnt;
+ }
+ return $result;
+}
+
//Setup VIM: ex: et ts=4 enc=utf-8 :