diff options
author | Andreas Gohr <andi@splitbrain.org> | 2005-08-28 17:28:21 +0200 |
---|---|---|
committer | Andreas Gohr <andi@splitbrain.org> | 2005-08-28 17:28:21 +0200 |
commit | f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694 (patch) | |
tree | abc7dff7a3288ade15406b61d103b6e0efb8a490 /inc/indexer.php | |
parent | 488dd6ce70042d607766283ed733c292391cd9ca (diff) | |
download | rpg-f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694.tar.gz rpg-f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694.tar.bz2 |
new fulltext search function using the index
The new search function was added but is not yet integrated into
DokuWiki's interface.
darcs-hash:20050828152821-7ad00-a6e79a9dc5aaf41c547cf42dccdbc3b5bc8d303e.gz
Diffstat (limited to 'inc/indexer.php')
-rw-r--r-- | inc/indexer.php | 57 |
1 files changed, 39 insertions, 18 deletions
diff --git a/inc/indexer.php b/inc/indexer.php index fd20a4747..65ae126dd 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -191,9 +191,6 @@ function idx_writeIndexLine($fh,$line,$pid,$count){ * Takes an array of word and will return a list of matching * documents for each one. * - * It returns an array using the same index as the input - * array. Returns false if something went wrong. - * * @author Andreas Gohr <andi@splitbrain.org> */ function idx_lookup($words){ @@ -207,21 +204,17 @@ function idx_lookup($words){ // get word IDs $wids = array(); - $pos = 0; foreach($words as $word){ - - //FIXME words should be cleaned here as in getPageWords - $wid = array_search("$word\n",$word_idx); if(is_int($wid)){ $wids[] = $wid; - $result[$pos]['wordid'] = $wid; + $result[$word] = $wid; + }else{ + $result[$word] = array(); } - $result[$pos]['word'] = $word; - $pos++; } sort($wids); - + $wids = array_unique($wids); // Open index $idx = fopen($conf['cachedir'].'/index.idx','r'); @@ -256,15 +249,14 @@ function idx_lookup($words){ } fclose($idx); - // merge docs into results - $count = count($result); - for($i=0; $i<$count; $i++){ - if(isset($result[$i]['wordid'])){ - $result[$i]['pages'] = $docs[$result[$i]['wordid']]; + // merge found pages into result array + foreach(array_keys($result) as $word){ + if(is_int($result[$word])){ + $result[$word] = $docs[$result[$word]]; } } -dbg($result); + return $result; } /** @@ -281,7 +273,7 @@ function idx_parseIndexLine(&$page_idx,$line){ $result = array(); $line = trim($line); - if($line == '') return; + if($line == '') return $result; $parts = explode(':',$line); foreach($parts as $part){ @@ -298,4 +290,33 @@ function idx_parseIndexLine(&$page_idx,$line){ return $result; } +/** + * Tokenizes a string into an array of search words + * + * Uses the same algorithm as idx_getPageWords() + * + * @todo make combined function to use alone or in getPageWords + */ +function idx_tokenizer($string,&$stopwords){ + $words = array(); + + 
if(preg_match('/[^0-9A-Za-z]/u', $string)){ + $arr = explode(' ', utf8_stripspecials($string,' ','._\-:')); + foreach ($arr as $w) { + if (!is_numeric($w) && strlen($w) < 3) continue; + $w = utf8_strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) continue; + $words[] = $w; + } + }else{ + $w = $string; + if (!is_numeric($w) && strlen($w) < 3) return $words; + $w = strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) return $words; + $words[] = $w; + } + + return $words; +} + //Setup VIM: ex: et ts=4 enc=utf-8 : |