summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--inc/fulltext.php147
-rw-r--r--inc/indexer.php57
2 files changed, 186 insertions, 18 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
new file mode 100644
index 000000000..8549a67c1
--- /dev/null
+++ b/inc/fulltext.php
@@ -0,0 +1,147 @@
+<?php
+/**
+ * DokuWiki full-text search functions using the index
+ *
+ * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+
+ if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
+ require_once(DOKU_INC.'inc/indexer.php');
+
+
+/**
+ * The fulltext search
+ *
+ * Returns a list of matching documents for the given query: every
+ * positive word must be found, no negated word may be found and, if
+ * phrases were given, at least one of them has to occur in the raw
+ * source of the page.
+ *
+ * @param string $query the search query
+ * @return array page => score, highest score first; empty on no match
+ */
+function ft_pageSearch($query){
+    $q = ft_queryParser($query);
+
+    // without positive words there is nothing to select documents from
+    if(!count($q['and'])) return array();
+
+    // lookup all words found in the query (phrase words are in 'and')
+    $result = idx_lookup(array_merge($q['and'],$q['not']));
+    if(!is_array($result)) return array();
+
+    // replace each positive word with the documents found for it
+    $and = array();
+    foreach($q['and'] as $w){
+        $and[] = isset($result[$w]) ? $result[$w] : array();
+    }
+
+    // combine and words
+    $docs = (count($and) > 1) ? ft_resultCombine($and) : $and[0];
+    if(!count($docs)) return array();
+
+    // remove negative matches
+    foreach($q['not'] as $w){
+        if(!isset($result[$w])) continue;
+        foreach(array_keys($result[$w]) as $n){
+            unset($docs[$n]);
+        }
+    }
+    if(!count($docs)) return array();
+
+    // build a regexp from the phrases; they may still be wrapped in the
+    // quotes of the query, which are not part of the text to look for
+    $phrases = array();
+    foreach($q['phrases'] as $phrase){
+        $phrase = trim($phrase,'"');
+        if($phrase === '') continue; // would match every document
+        $phrases[] = preg_quote(utf8_strtolower($phrase),'/');
+    }
+
+    // check the source of all documents for the exact phrases
+    if(count($phrases)){
+        $regex = '/('.join('|',$phrases).')/usi';
+        foreach(array_keys($docs) as $id){
+            $text = utf8_strtolower(rawWiki($id));
+            if(!preg_match($regex,$text)){
+                unset($docs[$id]); // no hit - remove
+            }
+        }
+    }
+    if(!count($docs)) return array();
+
+    // if there are any hits left, sort them by count
+    arsort($docs);
+
+    return $docs;
+}
+
+/**
+ * Combine found documents and sum up their scores
+ *
+ * This function is used to combine searched words with a logical
+ * AND. Only documents available in all arrays are returned.
+ *
+ * based upon PEAR's PHP_Compat function for array_intersect_key()
+ *
+ * @param array $args An array of page arrays (page => score)
+ * @return array page => summed score of the pages found in every array
+ */
+function ft_resultCombine($args){
+    if(!count($args)) return array();
+    $result = array_shift($args); // start with the first list, then narrow
+    foreach($args as $pages){
+        foreach(array_keys($result) as $page){
+            if(array_key_exists($page,$pages)){
+                $result[$page] += $pages[$page];
+            }else{
+                unset($result[$page]); // missing in one list - no AND match
+            }
+        }
+    }
+    return $result;
+}
+
+/**
+ * Builds an array of search words from a query
+ *
+ * Quoted parts are kept as phrases and their words are required as well,
+ * words prefixed with a minus are excluded, all other words are required.
+ *
+ * @todo support OR and parentheses?
+ * @param string $query the raw search query
+ * @return array with the keys 'query', 'phrases', 'and' and 'not'
+ */
+function ft_queryParser($query){
+    global $conf;
+    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
+    $stopwords = @file_exists($swfile) ? file($swfile) : array();
+
+    $q = array();
+    $q['query'] = $query;
+    $q['phrases'] = array();
+    $q['and'] = array();
+    $q['not'] = array();
+
+    // handle phrase searches: remember the phrase without its quotes and
+    // require all of its words, then cut it out of the remaining query
+    while(preg_match('/"(.*?)"/',$query,$match)){
+        if($match[1] !== '') $q['phrases'][] = $match[1];
+        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
+        $query = preg_replace('/"(.*?)"/','',$query,1);
+    }
+
+    // split on any whitespace run, this never yields empty words
+    $words = preg_split('/\s+/',$query,-1,PREG_SPLIT_NO_EMPTY);
+    foreach($words as $w){
+        // a leading minus marks a word that must not occur
+        $type = ($w[0] == '-') ? 'not' : 'and';
+        $token = idx_tokenizer($w,$stopwords);
+        if(count($token)) $q[$type] = array_merge($q[$type],$token);
+    }
+
+    return $q;
+}
+
+
diff --git a/inc/indexer.php b/inc/indexer.php
index fd20a4747..65ae126dd 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -191,9 +191,6 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
* Takes an array of word and will return a list of matching
* documents for each one.
*
- * It returns an array using the same index as the input
- * array. Returns false if something went wrong.
- *
* @author Andreas Gohr <andi@splitbrain.org>
*/
function idx_lookup($words){
@@ -207,21 +204,17 @@ function idx_lookup($words){
// get word IDs
$wids = array();
- $pos = 0;
foreach($words as $word){
-
- //FIXME words should be cleaned here as in getPageWords
-
$wid = array_search("$word\n",$word_idx);
if(is_int($wid)){
$wids[] = $wid;
- $result[$pos]['wordid'] = $wid;
+ $result[$word] = $wid;
+ }else{
+ $result[$word] = array();
}
- $result[$pos]['word'] = $word;
- $pos++;
}
sort($wids);
-
+ $wids = array_unique($wids);
// Open index
$idx = fopen($conf['cachedir'].'/index.idx','r');
@@ -256,15 +249,14 @@ function idx_lookup($words){
}
fclose($idx);
- // merge docs into results
- $count = count($result);
- for($i=0; $i<$count; $i++){
- if(isset($result[$i]['wordid'])){
- $result[$i]['pages'] = $docs[$result[$i]['wordid']];
+ // merge found pages into result array
+ foreach(array_keys($result) as $word){
+ if(is_int($result[$word])){
+ $result[$word] = $docs[$result[$word]];
}
}
-dbg($result);
+ return $result;
}
/**
@@ -281,7 +273,7 @@ function idx_parseIndexLine(&$page_idx,$line){
$result = array();
$line = trim($line);
- if($line == '') return;
+ if($line == '') return $result;
$parts = explode(':',$line);
foreach($parts as $part){
@@ -298,4 +290,33 @@ function idx_parseIndexLine(&$page_idx,$line){
return $result;
}
+/**
+ * Splits a string into lower-cased search words
+ *
+ * Words shorter than three bytes are dropped unless numeric, as are
+ * stopwords. Intended to match what idx_getPageWords() does.
+ *
+ * @todo make combined function to use alone or in getPageWords
+ */
+function idx_tokenizer($string,&$stopwords){
+    // a purely alphanumeric ASCII string is taken as one single word,
+    // everything else is cleaned by utf8_stripspecials() and split at spaces
+    $simple = !preg_match('/[^0-9A-Za-z]/u', $string);
+    if($simple){
+        $parts = array($string);
+    }else{
+        $parts = explode(' ', utf8_stripspecials($string,' ','._\-:'));
+    }
+
+    $tokens = array();
+    foreach($parts as $part){
+        if(strlen($part) < 3 && !is_numeric($part)) continue;
+        $part = $simple ? strtolower($part) : utf8_strtolower($part);
+        // the word is looked up with a trailing newline appended
+        if(is_int(array_search("$part\n",$stopwords))) continue;
+        $tokens[] = $part;
+    }
+    return $tokens;
+}
+
//Setup VIM: ex: et ts=4 enc=utf-8 :