diff options
-rw-r--r-- | inc/indexer.php | 140 |
1 files changed, 134 insertions, 6 deletions
diff --git a/inc/indexer.php b/inc/indexer.php index 3fb710338..173b7aa3c 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -13,10 +13,20 @@ require_once(DOKU_INC.'inc/parserutils.php'); /** - * based upon class.search_indexer_phpcms.php::index_entry + * Split a page into words + * + * It is based upon PHPCMS's indexer function index_entry + * + * Returns an array of of word counts, false if an error occured + * + * @author Andreas Gohr <andi@splitbrain.org> */ -function idx_getPageWords($id){ - $body = rawWiki($id); +function idx_getPageWords($page){ + global $conf; + $word_idx = file($conf['cachedir'].'/word.idx'); + + // split page into words + $body = rawWiki($page); $body = utf8_stripspecials($body,' ','._\-:'); $body = utf8_strtolower($body); $body = trim($body); @@ -25,11 +35,11 @@ function idx_getPageWords($id){ $index = array(); //resulting index $old = ''; + $wid = -1; $doit = true; $pos = 0; //compact wordlist FIXME check for stopwords - foreach($words as $word){ if(strlen($word) == 0) continue; @@ -40,7 +50,7 @@ function idx_getPageWords($id){ continue; } // just increase the counter - $index[$word]++; + $index[$wid]++; continue; } @@ -58,13 +68,131 @@ function idx_getPageWords($id){ //FIXME add stopword check + // get word ID + $wid = array_search("$word\n",$word_idx); + if(!is_int($wid)){ + $word_idx[] = "$word\n"; + $wid = count($word_idx)-1; + } // add to index - $index[$word] = 1; + $index[$wid] = 1; + } + + // save back word index + $fh = fopen($conf['cachedir'].'/word.idx','w'); + if(!$fh){ + trigger_error("Failed to write word.idx", E_USER_ERROR); + return false; } + fwrite($fh,join('',$word_idx)); + fclose($fh); return $index; } +/** + * Adds/updates the search for the given page + * + * This is the core function of the indexer which does most + * of the work. This function needs to be called with proper + * locking! + * + * @author Andreas Gohr <andi@splitbrain.org> + */ +function idx_addPage($page){ + global $conf; + + // load known words and documents + $page_idx = file($conf['cachedir'].'/page.idx'); + + // get page id (this is the linenumber in page.idx) + $pid = array_search("$page\n",$page_idx); + if(!is_int($pid)){ + $page_idx[] = "$page\n"; + $pid = count($page_idx)-1; + // page was new - write back + $fh = fopen($conf['cachedir'].'/page.idx','w'); + if(!$fh) return false; + fwrite($fh,join('',$page_idx)); + fclose($fh); + } + + // get word usage in page + $words = idx_getPageWords($page); + if($words === false) return false; + if(!count($words)) return true; + + // Open index and temp file + $idx = fopen($conf['cachedir'].'/index.idx','r'); + $tmp = fopen($conf['cachedir'].'/index.tmp','w'); + if(!$idx || !$tmp){ + trigger_error("Failed to open index files", E_USER_ERROR); + return false; + } + + // copy from index to temp file, modifying were needed + $lno = 0; + $line = ''; + while (!feof($idx)) { + // read full line + $line .= fgets($idx, 4096); + if(substr($line,-1) != "\n") continue; + + // write a new Line to temp file + idx_writeIndexLine($tmp,$line,$pid,$words[$lno]); + $line = ''; // reset line buffer + $lno++; // increase linecounter + } + fclose($idx); + + // add missing lines (usually index and word should contain + // the same number of lines, however if the page contained + // new words the word file has some more lines which need to + // be added here + $word_idx = file($conf['cachedir'].'/word.idx'); + $wcnt = count($word_idx); + for($lno; $lno<$wcnt; $lno++){ + idx_writeIndexLine($tmp,'',$pid,$words[$lno]); + } + + // close the temp file and move it over to be the new one + fclose($tmp); + return rename($conf['cachedir'].'/index.tmp', + $conf['cachedir'].'/index.idx'); +} + +/** + * Write a new index line to the filehandle + * + * This function writes an line for the index file to the + * given filehandle. It removes the given document from + * the given line and readds it when $count is >0. + * + * @author Andreas Gohr <andi@splitbrain.org> + */ +function idx_writeIndexLine($fh,$line,$pid,$count){ + $line = trim($line); + + if($line != ''){ + $parts = explode(':',$line); + // remove doc from given line + foreach($parts as $part){ + if($part == '') continue; + list($doc,$cnt) = explode('*',$part); + if($doc != $pid){ + fwrite($fh,"$doc*$cnt:"); + } + } + } + + // add doc + if ($count){ + fwrite($fh,"$pid*$count"); + } + + // add newline + fwrite($fh,"\n"); +} //Setup VIM: ex: et ts=4 enc=utf-8 : |