From f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694 Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Sun, 28 Aug 2005 17:28:21 +0200
Subject: new fulltext search function using the index

The new search function was added but is not yet integrated into
DokuWikis interface.

darcs-hash:20050828152821-7ad00-a6e79a9dc5aaf41c547cf42dccdbc3b5bc8d303e.gz
---
 inc/fulltext.php | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 inc/fulltext.php

(limited to 'inc/fulltext.php')

diff --git a/inc/fulltext.php b/inc/fulltext.php
new file mode 100644
index 000000000..8549a67c1
--- /dev/null
+++ b/inc/fulltext.php
@@ -0,0 +1,147 @@
+<?php
+/**
+ * DokuWiki fulltextsearch functions using the index
+ *
+ * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
+ * @author     Andreas Gohr <andi@splitbrain.org>
+ */
+
+  defined('DOKU_INC') or define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
+  require_once DOKU_INC.'inc/indexer.php';
+
+
+/**
+ * The fulltext search
+ *
+ * Returns a list of matching documents for the given query. A page matches
+ * when it contains every positive word, none of the negated (-word) ones
+ * and every quoted phrase.
+ *
+ * @param  string $query the search query
+ * @return array  pages (id => hit count), best hit first; empty if no match
+ */
+function ft_pageSearch($query){
+    $q = ft_queryParser($query);
+
+    // a query without positive words can not match anything
+    if(!count($q['and'])) return array();
+
+    // lookup all words found in the query (the parser already put the
+    // words of the phrases into the and-list)
+    $result = idx_lookup(array_merge($q['and'],$q['not']));
+
+    // replace each and-word by its hit list; a word without any entry in
+    // the lookup result gets an empty list and so rules out every page
+    $and = array();
+    foreach($q['and'] as $w){
+        $and[] = isset($result[$w]) ? $result[$w] : array();
+    }
+
+    // combine and-words
+    $docs = (count($and) > 1) ? ft_resultCombine($and) : $and[0];
+    if(!count($docs)) return array();
+
+    // remove negative matches
+    foreach($q['not'] as $w){
+        if(!isset($result[$w])) continue;
+        foreach(array_keys($result[$w]) as $n){
+            unset($docs[$n]);
+        }
+    }
+    if(!count($docs)) return array();
+
+    // handle phrases
+    if(count($q['phrases'])){
+        // build one regexp per phrase; the parser may hand a phrase over
+        // with its surrounding quotes, which are not part of the page text
+        $regexes = array();
+        foreach($q['phrases'] as $phrase){
+            $phrase    = utf8_strtolower(trim($phrase,'"'));
+            $regexes[] = '/'.preg_quote($phrase,'/').'/usi';
+        }
+
+        // check the source of all documents for the exact phrases
+        foreach(array_keys($docs) as $id){
+            $text = utf8_strtolower(rawWiki($id));
+            foreach($regexes as $regex){
+                if(preg_match($regex,$text)) continue;
+                unset($docs[$id]); // a phrase is missing - remove
+                break;
+            }
+        }
+        if(!count($docs)) return array();
+    }
+
+    // if there are any hits left, sort them by count
+    arsort($docs);
+    return $docs;
+}
+
+/**
+ * Combine found documents and sum up their scores
+ *
+ * This function is used to combine searched words with a logical
+ * AND. Only documents available in all arrays are returned.
+ *
+ * based upon PEAR's PHP_Compat function for array_intersect_key()
+ *
+ * @param  array $args An array of page arrays (page => score)
+ * @return array the pages found in every array => sum of their scores
+ */
+function ft_resultCombine($args){
+    $args = array_values($args);
+    if(!count($args)) return array();
+    $result = array();
+    foreach($args[0] as $key => $score){
+        // a page only counts when all the other arrays know it as well
+        for($i = 1, $max = count($args); $i < $max; $i++){
+            if(!array_key_exists($key,$args[$i])) continue 2;
+            $score += $args[$i][$key];
+        }
+        $result[$key] = $score;
+    }
+    return $result;
+}
+
+/**
+ * Builds an array of search words from a query
+ *
+ * Returns the keys 'query' (the unchanged input), 'phrases' (the quoted
+ * phrases without their quotes), 'and' (tokens a page has to contain,
+ * including those of all phrases) and 'not' (tokens of -prefixed words).
+ *
+ * @todo support OR and parentheses?
+ */
+function ft_queryParser($query){
+    global $conf;
+    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
+    if(@file_exists($swfile)){
+        $stopwords = file($swfile);
+    }else{
+        $stopwords = array();
+    }
+
+    $q = array();
+    $q['query']   = $query;
+    $q['phrases'] = array();
+    $q['and']     = array();
+    $q['not']     = array();
+
+    // handle phrase searches, the words of every phrase are required too
+    while(preg_match('/"(.*?)"/',$query,$match)){
+        $q['phrases'][] = $match[1];
+        $q['and'] = array_merge($q['and'],idx_tokenizer($match[1],$stopwords));
+        $query = preg_replace('/"(.*?)"/','',$query,1);
+    }
+
+    foreach(explode(' ',$query) as $w){
+        if($w === '') continue; // repeated blanks leave empty words
+        $type  = ($w[0] == '-') ? 'not' : 'and';
+        $token = idx_tokenizer($w,$stopwords);
+        if(count($token)) $q[$type] = array_merge($q[$type],$token);
+    }
+
+    return $q;
+}
+
+
-- 
cgit v1.2.3