From 506fa8936561993b7f70aa507d0c39a44a6ebab9 Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Sun, 4 Sep 2005 00:02:29 +0200
Subject: the search now uses the index

darcs-hash:20050903220229-7ad00-5d95f905eaeb3f6b867aa3ee43c2a8bccc533c00.gz
---
 inc/fulltext.php | 74 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 inc/html.php     | 24 ++++++++++--------
 inc/search.php   |  3 +++
 3 files changed, 82 insertions(+), 19 deletions(-)

(limited to 'inc')
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 8549a67c1..6c4e148a2 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -14,15 +14,16 @@
  * The fulltext search
  *
  * Returns a list of matching documents for the given query
+ *
  */
-function ft_pageSearch($query){
+function ft_pageSearch($query,&$poswords){
     $q = ft_queryParser($query);
 
+    // use this for highlighting later:
+    $poswords = join(' ',$q['and']);
+
     // lookup all words found in the query
     $words  = array_merge($q['and'],$q['not']);
-    foreach($q['phrases'] as $phrase){
-        $words  = array_merge($words,$phrase['words']);
-    }
     if(!count($words)) return array();
     $result = idx_lookup($words);
 
@@ -36,8 +37,7 @@ function ft_pageSearch($query){
         $not = array_merge($not,array_keys($result[$w]));
     }
 
-
-    // combine and words
+    // combine and-words
     if(count($q['and']) > 1){
         $docs = ft_resultCombine($q['and']);
     }else{
@@ -52,7 +52,6 @@ function ft_pageSearch($query){
 
     if(!count($docs)) return array();
 
-
     // handle phrases
     if(count($q['phrases'])){
         //build a regexp
@@ -63,7 +62,7 @@ function ft_pageSearch($query){
         // check the source of all documents for the exact phrases
         foreach(array_keys($docs) as $id){
             $text  = utf8_strtolower(rawWiki($id));
-            if(!preg_match_all('/'.$regex.'/usi',$text)){
+            if(!preg_match('/'.$regex.'/usi',$text)){
                 unset($docs[$id]); // no hit - remove
             }
         }
@@ -77,6 +76,63 @@ function ft_pageSearch($query){
     return $docs;
 }
 
+/**
+ * Quicksearch for pagenames
+ *
+ * Looks $id up in the page index ($conf['cachedir'].'/page.idx') and returns
+ * the matches for which the file named by wikiFN() exists.
+ *
+ * @param  string $id       the page ID (or a part of it) to look for
+ * @param  bool   $pageonly match the pagename only and ignore the namespace
+ * @return array  sorted list of matching page IDs (empty without an index)
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function ft_pageLookup($id,$pageonly=true){
+    global $conf;
+    $re    = '/'.preg_quote($id,'/').'/';
+    $index = @file($conf['cachedir'].'/page.idx');
+    if(!is_array($index)) return array(); // index not readable - no hits
+
+    // file() keeps the line endings - strip them to get clean page IDs
+    $pages = array();
+    foreach(preg_grep($re,array_map('trim',$index)) as $page){
+        // a hit in the namespace part only does not count here
+        if($pageonly && !preg_match($re,noNS($page))) continue;
+        // skip index entries whose page file is gone
+        if(!@file_exists(wikiFN($page))) continue;
+        $pages[] = $page;
+    }
+
+    sort($pages);
+    return $pages;
+}
+
+/**
+ * Creates a snippet extract
+ *
+ * Returns HTML for the first three hits of the space separated $poswords in
+ * the text of rawWiki($id), each with up to 50 chars of context per side.
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function ft_snippet($id,$poswords){
+    // an empty word would match everywhere and create snippets without a hit
+    $words = preg_split('/ +/',preg_quote($poswords,'#'),-1,PREG_SPLIT_NO_EMPTY);
+    if(!count($words)) return '';
+
+    //FIXME case-insensitive matching doesn't work with UTF-8!?
+    $re = '#(.{0,50})('.join('|',$words).')(.{0,50})#iu';
+    if(!preg_match_all($re,rawWiki($id),$matches,PREG_SET_ORDER)) return '';
+
+    $snippet = '';
+    foreach(array_slice($matches,0,3) as $match){
+        $snippet .= '...'.htmlspecialchars($match[1]);
+        $snippet .= '<span class="search_hit">'.htmlspecialchars($match[2]).'</span>';
+        $snippet .= htmlspecialchars($match[3]).'... ';
+    }
+    return $snippet;
+}
+
 /**
  * Combine found documents and sum up their scores
  *
@@ -144,4 +200,4 @@ function ft_queryParser($query){
     return $q;
 }
 
-
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/html.php b/inc/html.php
index dcd11feb1..b73eebf8c 100644
--- a/inc/html.php
+++ b/inc/html.php
@@ -295,6 +295,7 @@ function html_hilight($html,$query){
  */
 function html_search(){
   require_once(DOKU_INC.'inc/search.php');
+  require_once(DOKU_INC.'inc/fulltext.php');
   global $conf;
   global $QUERY;
   global $ID;
@@ -312,14 +313,14 @@ function html_search(){
 
   //do quick pagesearch
   $data = array();
-  search($data,$conf['datadir'],'search_pagename',array(query => cleanID($QUERY)));
+  $data = ft_pageLookup(cleanID($QUERY));
   if(count($data)){
     sort($data);
     print '<div class="search_quickresult">';
     print '<b>'.$lang[quickhits].':</b><br />';
-    foreach($data as $row){
+    foreach($data as $id){
       print '<div class="search_quickhits">';
-      print html_wikilink(':'.$row['id'],$conf['useheading']?NULL:$row['id']);
+      print html_wikilink(':'.$id,$conf['useheading']?NULL:$id);
       print '</div> ';
     }
     //clear float (see http://www.complexspiral.com/publications/containing-floats/)
@@ -329,16 +330,19 @@ function html_search(){
   flush();
 
   //do fulltext search
-  $data = array();
-  search($data,$conf['datadir'],'search_fulltext',array(query => utf8_strtolower($QUERY)));
+  $data = ft_pageSearch($QUERY,$poswords);
   if(count($data)){
-    usort($data,'sort_search_fulltext');
-    foreach($data as $row){
+    $num = 1;
+    foreach($data as $id => $cnt){
       print '<div class="search_result">';
-      print html_wikilink(':'.$row['id'],$conf['useheading']?NULL:$row['id'],$row['poswords']);
-      print ': <span class="search_cnt">'.$row['count'].' '.$lang['hits'].'</span><br />';
-      print '<div class="search_snippet">'.$row['snippet'].'</div>';
+      print html_wikilink(':'.$id,$conf['useheading']?NULL:$id,$poswords);
+      print ': <span class="search_cnt">'.$cnt.' '.$lang['hits'].'</span><br />';
+      if($num < 15){ // create snippets for the first number of matches only #FIXME add to conf ?
+        print '<div class="search_snippet">'.ft_snippet($id,$poswords).'</div>';
+      }
       print '</div>';
+      flush();
+      $num++;
     }
   }else{
     print '<div class="nothing">'.$lang['nothingfound'].'</div>';
diff --git a/inc/search.php b/inc/search.php
index 3604db15e..ea20c4f3b 100644
--- a/inc/search.php
+++ b/inc/search.php
@@ -283,6 +283,7 @@ function search_backlinks(&$data,$base,$file,$type,$lvl,$opts){
  * $opts['query'] is the search query
  *
  * @author  Andreas Gohr <andi@splitbrain.org>
+ * @deprecated - fulltext indexer is used instead
  */
 function search_fulltext(&$data,$base,$file,$type,$lvl,$opts){
   //we do nothing with directories
@@ -383,6 +384,8 @@ function search_reference(&$data,$base,$file,$type,$lvl,$opts){
  *
  * @author  Andreas Gohr <andi@splitbrain.org>
  * @author  Matthias Grimm <matthiasgrimm@users.sourceforge.net>
+ *
+ * @deprecated - fulltext indexer is used instead
  */
 function search_regex(&$data,$base,$file,$reg,$words){
 
-- 
cgit v1.2.3