summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andreas Gohr <andi@splitbrain.org> 2005-08-13 16:20:47 +0200
committer Andreas Gohr <andi@splitbrain.org> 2005-08-13 16:20:47 +0200
commit 44ca0adf4877b3bb6ac8bf0ff41fb908fed7a827 (patch)
tree d433eb79e89331847e723b03328ce1ac1f3ab36e
parent cf58092346a64b27e741107331dab3e69e7a70a0 (diff)
download rpg-44ca0adf4877b3bb6ac8bf0ff41fb908fed7a827.tar.gz
rpg-44ca0adf4877b3bb6ac8bf0ff41fb908fed7a827.tar.bz2
more indexer functions
darcs-hash:20050813142047-7ad00-8b4417587c577220191d6b6d35574e82b5bf5596.gz
-rw-r--r-- inc/indexer.php 140
1 files changed, 134 insertions, 6 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index 3fb710338..173b7aa3c 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -13,10 +13,20 @@
require_once(DOKU_INC.'inc/parserutils.php');
/**
- * based upon class.search_indexer_phpcms.php::index_entry
+ * Split a page into words
+ *
+ * It is based upon PHPCMS's indexer function index_entry
+ *
+ * Returns an array of word counts, false if an error occurred
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
*/
-function idx_getPageWords($id){
- $body = rawWiki($id);
+function idx_getPageWords($page){
+ global $conf;
+ $word_idx = file($conf['cachedir'].'/word.idx');
+
+ // split page into words
+ $body = rawWiki($page);
$body = utf8_stripspecials($body,' ','._\-:');
$body = utf8_strtolower($body);
$body = trim($body);
@@ -25,11 +35,11 @@ function idx_getPageWords($id){
$index = array(); //resulting index
$old = '';
+ $wid = -1;
$doit = true;
$pos = 0;
//compact wordlist FIXME check for stopwords
-
foreach($words as $word){
if(strlen($word) == 0) continue;
@@ -40,7 +50,7 @@ function idx_getPageWords($id){
continue;
}
// just increase the counter
- $index[$word]++;
+ $index[$wid]++;
continue;
}
@@ -58,13 +68,131 @@ function idx_getPageWords($id){
//FIXME add stopword check
+ // get word ID
+ $wid = array_search("$word\n",$word_idx);
+ if(!is_int($wid)){
+ $word_idx[] = "$word\n";
+ $wid = count($word_idx)-1;
+ }
// add to index
- $index[$word] = 1;
+ $index[$wid] = 1;
+ }
+
+ // save back word index
+ $fh = fopen($conf['cachedir'].'/word.idx','w');
+ if(!$fh){
+ trigger_error("Failed to write word.idx", E_USER_ERROR);
+ return false;
}
+ fwrite($fh,join('',$word_idx));
+ fclose($fh);
return $index;
}
+/**
+ * Adds/updates the search index for the given page
+ *
+ * This is the core function of the indexer which does most
+ * of the work. This function needs to be called with proper
+ * locking!
+ *
+ * The page is looked up in (or appended to) page.idx, its word
+ * counts are fetched through idx_getPageWords() and index.idx is
+ * then rewritten line by line through a temporary file. Line n
+ * of index.idx is handled with the count stored under key n of
+ * the word count array.
+ *
+ * @param  string $page  id of the page to (re)index
+ * @return bool          true on success, false when one of the
+ *                       index files could not be read or written
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_addPage($page){
+    global $conf;
+
+    $page_file  = $conf['cachedir'].'/page.idx';
+    $word_file  = $conf['cachedir'].'/word.idx';
+    $index_file = $conf['cachedir'].'/index.idx';
+    $temp_file  = $conf['cachedir'].'/index.tmp';
+
+    // load known documents - a missing file only means that
+    // nothing has been indexed yet
+    $page_idx = file_exists($page_file) ? file($page_file) : array();
+    if(!is_array($page_idx)) return false;
+
+    // get page id (this is the linenumber in page.idx)
+    // the search is strict: a loose comparison may treat two
+    // different numeric looking page names as the same entry
+    $pid = array_search("$page\n",$page_idx,true);
+    if(!is_int($pid)){
+        $page_idx[] = "$page\n";
+        $pid = count($page_idx)-1;
+        // page was new - write back
+        $fh = fopen($page_file,'w');
+        if(!$fh) return false;
+        fwrite($fh,join('',$page_idx));
+        fclose($fh);
+    }
+
+    // get word usage in page
+    // an empty result is processed as well, otherwise entries of
+    // a page which lost all its words would stay in the index
+    $words = idx_getPageWords($page);
+    if($words === false) return false;
+
+    // open temp file
+    $tmp = fopen($temp_file,'w');
+    if(!$tmp){
+        trigger_error("Failed to open index files", E_USER_ERROR);
+        return false;
+    }
+
+    // copy from index to temp file, modifying where needed
+    // (there is no index file yet on the very first run)
+    $lno = 0;
+    if(file_exists($index_file)){
+        $idx = fopen($index_file,'r');
+        if(!$idx){
+            // do not leak the temp handle on failure
+            fclose($tmp);
+            trigger_error("Failed to open index files", E_USER_ERROR);
+            return false;
+        }
+
+        $line = '';
+        while(!feof($idx)){
+            // read full line (may need more than one chunk)
+            $chunk = fgets($idx, 4096);
+            if($chunk === false) break;
+            $line .= $chunk;
+            if(substr($line,-1) != "\n") continue;
+
+            // write a new line to temp file, words not used in
+            // the page have no entry and count as 0
+            $cnt = isset($words[$lno]) ? $words[$lno] : 0;
+            idx_writeIndexLine($tmp,$line,$pid,$cnt);
+            $line = ''; // reset line buffer
+            $lno++;     // increase linecounter
+        }
+
+        // a last line without trailing newline is still a line
+        if($line !== ''){
+            $cnt = isset($words[$lno]) ? $words[$lno] : 0;
+            idx_writeIndexLine($tmp,$line,$pid,$cnt);
+            $lno++;
+        }
+        fclose($idx);
+    }
+
+    // add missing lines (usually index and word should contain
+    // the same number of lines, however if the page contained
+    // new words the word file has some more lines which need to
+    // be added here)
+    $word_idx = file_exists($word_file) ? file($word_file) : array();
+    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
+    for(; $lno<$wcnt; $lno++){
+        $cnt = isset($words[$lno]) ? $words[$lno] : 0;
+        idx_writeIndexLine($tmp,'',$pid,$cnt);
+    }
+
+    // close the temp file and move it over to be the new one
+    fclose($tmp);
+    return rename($temp_file,$index_file);
+}
+
+/**
+ * Write a new index line to the filehandle
+ *
+ * This function writes a line for the index file to the
+ * given filehandle. It removes the given document from
+ * the given line and re-adds it when $count is >0.
+ *
+ * A line consists of "doc*count" records separated by colons.
+ * Records of other documents are copied unchanged, each one
+ * followed by a colon. Records without a "*count" part are
+ * considered damaged and are dropped.
+ *
+ * @param resource $fh     open, writable file handle
+ * @param string   $line   the existing index line, may be empty
+ * @param int      $pid    id of the document to remove/re-add
+ * @param int      $count  word count for $pid, nothing is added
+ *                         for the document when this is empty
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_writeIndexLine($fh,$line,$pid,$count){
+    $line = trim($line);
+    $out  = '';
+
+    if($line !== ''){
+        // remove doc from given line
+        foreach(explode(':',$line) as $part){
+            if($part === '') continue;
+            $rec = explode('*',$part);
+            // skip damaged records instead of writing "doc*:"
+            if(count($rec) < 2) continue;
+            list($doc,$cnt) = $rec;
+            // compare as strings, ids are stored in decimal notation
+            if((string) $doc !== (string) $pid){
+                $out .= "$doc*$cnt:";
+            }
+        }
+    }
+
+    // add doc
+    if($count){
+        $out .= "$pid*$count";
+    }
+
+    // write the whole line including the newline in one go
+    fwrite($fh,$out."\n");
+}
//Setup VIM: ex: et ts=4 enc=utf-8 :