summary | refs | log | tree | commit | diff
path: root/inc/indexer.php
diff options
context:
space:
mode:
author: Andreas Gohr <andi@splitbrain.org> 2005-08-28 17:28:21 +0200
committer: Andreas Gohr <andi@splitbrain.org> 2005-08-28 17:28:21 +0200
commit: f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694 (patch)
tree: abc7dff7a3288ade15406b61d103b6e0efb8a490 /inc/indexer.php
parent: 488dd6ce70042d607766283ed733c292391cd9ca (diff)
download: rpg-f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694.tar.gz
download: rpg-f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694.tar.bz2
new fulltext search function using the index
The new search function was added but is not yet integrated into DokuWiki's interface. darcs-hash:20050828152821-7ad00-a6e79a9dc5aaf41c547cf42dccdbc3b5bc8d303e.gz
Diffstat (limited to 'inc/indexer.php')
-rw-r--r-- inc/indexer.php 57
1 files changed, 39 insertions, 18 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index fd20a4747..65ae126dd 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -191,9 +191,6 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
* Takes an array of words and returns a list of matching
* documents for each one.
*
- * It returns an array using the same index as the input
- * array. Returns false if something went wrong.
- *
* @author Andreas Gohr <andi@splitbrain.org>
*/
function idx_lookup($words){
@@ -207,21 +204,17 @@ function idx_lookup($words){
// get word IDs
$wids = array();
- $pos = 0;
foreach($words as $word){
-
- //FIXME words should be cleaned here as in getPageWords
-
$wid = array_search("$word\n",$word_idx);
if(is_int($wid)){
$wids[] = $wid;
- $result[$pos]['wordid'] = $wid;
+ $result[$word] = $wid;
+ }else{
+ $result[$word] = array();
}
- $result[$pos]['word'] = $word;
- $pos++;
}
sort($wids);
-
+ $wids = array_unique($wids);
// Open index
$idx = fopen($conf['cachedir'].'/index.idx','r');
@@ -256,15 +249,14 @@ function idx_lookup($words){
}
fclose($idx);
- // merge docs into results
- $count = count($result);
- for($i=0; $i<$count; $i++){
- if(isset($result[$i]['wordid'])){
- $result[$i]['pages'] = $docs[$result[$i]['wordid']];
+ // merge found pages into result array
+ foreach(array_keys($result) as $word){
+ if(is_int($result[$word])){
+ $result[$word] = $docs[$result[$word]];
}
}
-dbg($result);
+ return $result;
}
/**
@@ -281,7 +273,7 @@ function idx_parseIndexLine(&$page_idx,$line){
$result = array();
$line = trim($line);
- if($line == '') return;
+ if($line == '') return $result;
$parts = explode(':',$line);
foreach($parts as $part){
@@ -298,4 +290,33 @@ function idx_parseIndexLine(&$page_idx,$line){
return $result;
}
+/**
+ * Tokenizes a string into an array of search words
+ *
+ * Uses the same algorithm as idx_getPageWords()
+ *
+ * If the string contains any character other than 0-9, A-Z or a-z it is
+ * passed through utf8_stripspecials() and split on single spaces, and each
+ * piece is checked on its own; otherwise the whole string is treated as a
+ * single candidate word. A candidate is dropped when it is not numeric and
+ * strlen() reports a length below 3 (a byte count, not a character count),
+ * or when, after lowercasing, it is found in the stopword list.
+ *
+ * NOTE(review): utf8_stripspecials() is defined elsewhere; presumably it
+ * replaces special characters with the given space - confirm.
+ *
+ * @todo make combined function to use alone or in getPageWords
+ *
+ * @param string $string     the text to tokenize
+ * @param array  $stopwords  list of stopwords; each entry is compared against
+ *                           the lowercased word plus a trailing "\n", and a
+ *                           hit only counts when it is stored under an
+ *                           integer key. Passed by reference but only read
+ *                           by this function.
+ * @return array the accepted words, lowercased, in the order they occurred
+ */
+function idx_tokenizer($string,&$stopwords){
+ $words = array();
+
+ if(preg_match('/[^0-9A-Za-z]/u', $string)){
+ $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
+ foreach ($arr as $w) {
+ if (!is_numeric($w) && strlen($w) < 3) continue;
+ $w = utf8_strtolower($w);
+ if(is_int(array_search("$w\n",$stopwords))) continue;
+ $words[] = $w;
+ }
+ }else{
+ $w = $string;
+ if (!is_numeric($w) && strlen($w) < 3) return $words;
+ $w = strtolower($w);
+ if(is_int(array_search("$w\n",$stopwords))) return $words;
+ $words[] = $w;
+ }
+
+ return $words;
+}
+
//Setup VIM: ex: et ts=4 enc=utf-8 :