1 file changed, 44 insertions, 58 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 7ace3a724..8155325ee 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){
  * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  */
 function _ft_pageSearch(&$data) {
+    $Indexer = idx_get_indexer();
+
     // parse the given query
-    $q = ft_queryParser($data['query']);
+    $q = ft_queryParser($Indexer, $data['query']);
     $data['highlight'] = $q['highlight'];
 
     if (empty($q['parsed_ary'])) return array();
 
     // lookup all words found in the query
-    $lookup = idx_lookup($q['words']);
+    $lookup = $Indexer->lookup($q['words']);
 
     // get all pages in this dokuwiki site (!: includes nonexistent pages)
     $pages_all = array();
-    foreach (idx_getIndex('page', '') as $id) {
-        $pages_all[trim($id)] = 0; // base: 0 hit
+    foreach ($Indexer->getPages() as $id) {
+        $pages_all[$id] = 0; // base: 0 hit
     }
 
     // process the query
@@ -122,29 +124,12 @@ function _ft_pageSearch(&$data) {
 /**
  * Returns the backlinks for a given page
  *
- * Does a quick lookup with the fulltext index, then
- * evaluates the instructions of the found pages
+ * Uses the metadata index.
  */
 function ft_backlinks($id){
-    global $conf;
-    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
-    $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
     $result = array();
 
-    // quick lookup of the pagename
-    $page    = noNS($id);
-    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
-    $docs    = array_keys(ft_resultCombine(array_values($matches)));
-    $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
-    if(!count($docs)) return $result;
-
-    // check metadata for matching links
-    foreach($docs as $match){
-        // metadata relation reference links are already resolved
-        $links = p_get_metadata($match,'relation references');
-        if (isset($links[$id])) $result[] = $match;
-    }
+    $result = idx_get_indexer()->lookupKey('relation_references', $id);
 
     if(!count($result)) return $result;
 
@@ -168,17 +153,14 @@ function ft_backlinks($id){
  * Aborts after $max found results
  */
 function ft_mediause($id,$max){
-    global $conf;
-    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
-    $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
     if(!$max) $max = 1; // need to find at least one
 
     $result = array();
 
     // quick lookup of the mediafile
+    // FIXME use metadata key lookup
     $media   = noNS($id);
-    $matches = idx_lookup(idx_tokenizer($media,$stopwords));
+    $matches = idx_lookup(idx_tokenizer($media));
     $docs    = array_keys(ft_resultCombine(array_values($matches)));
     if(!count($docs)) return $result;
 
@@ -229,7 +211,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){
 }
 
 function _ft_pageLookup(&$data){
-    global $conf;
     // split out original parameters
     $id = $data['id'];
     if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) {
@@ -239,29 +220,32 @@ function _ft_pageLookup(&$data){
 
     $in_ns    = $data['in_ns'];
     $in_title = $data['in_title'];
+    $cleaned = cleanID($id);
 
-    $pages  = array_map('rtrim', idx_getIndex('page', ''));
-    $titles = array_map('rtrim', idx_getIndex('title', ''));
-    // check for corrupt title index #FS2076
-    if(count($pages) != count($titles)){
-        $titles = array_fill(0,count($pages),'');
-        @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php
-    }
-    $pages = array_combine($pages, $titles);
+    $Indexer = idx_get_indexer();
+    $page_idx = $Indexer->getPages();
 
-    $cleaned = cleanID($id);
+    $pages = array();
     if ($id !== '' && $cleaned !== '') {
-        foreach ($pages as $p_id => $p_title) {
-            if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) &&
-                (!$in_title || (stripos($p_title, $id) === false)) ) {
-                unset($pages[$p_id]);
+        foreach ($page_idx as $p_id) {
+            if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
+                if (!isset($pages[$p_id]))
+                    $pages[$p_id] = p_get_first_heading($p_id, false);
+            }
+        }
+        if ($in_title) {
+            $wildcard_id = "*$id*";
+            foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) {
+                if (!isset($pages[$p_id]))
+                    $pages[$p_id] = p_get_first_heading($p_id, false);
             }
         }
     }
     if (isset($ns)) {
-        foreach (array_keys($pages) as $p_id) {
-            if (strpos($p_id, $ns) !== 0) {
-                unset($pages[$p_id]);
+        foreach ($page_idx as $p_id) {
+            if (strpos($p_id, $ns) === 0) {
+                if (!isset($pages[$p_id]))
+                    $pages[$p_id] = p_get_first_heading($p_id, false);
             }
         }
     }
@@ -304,6 +288,7 @@ function ft_pagesorter($a, $b){
  */
 function ft_snippet($id,$highlight){
     $text = rawWiki($id);
+    $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
     $evdata = array(
             'id'        => $id,
             'text'      => &$text,
@@ -396,6 +381,11 @@ function ft_snippet($id,$highlight){
  * Wraps a search term in regex boundary checks.
  */
 function ft_snippet_re_preprocess($term) {
+    // do not process asian terms where word boundaries are not explicit
+    if(preg_match('/'.IDX_ASIAN.'/u',$term)){
+        return $term;
+    }
+
     if(substr($term,0,2) == '\\*'){
         $term = substr($term,2);
     }else{
@@ -494,11 +484,7 @@ function ft_resultComplement($args) {
  * @author Andreas Gohr <andi@splitbrain.org>
  * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  */
-function ft_queryParser($query){
-    global $conf;
-    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
-    $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
+function ft_queryParser($Indexer, $query){
     /**
      * parse a search query and transform it into intermediate representation
      *
@@ -544,7 +530,7 @@ function ft_queryParser($query){
         if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
             // phrase-include and phrase-exclude
             $not = $matches[1] ? 'NOT' : '';
-            $parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
+            $parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
         } else {
             // fix incomplete phrase
             $term = str_replace('"', ' ', $term);
@@ -591,10 +577,10 @@ function ft_queryParser($query){
                     $parsed .= '(N+:'.$matches[1].')';
                 } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                     // word-exclude
-                    $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
+                    $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
                 } else {
                     // word-include
-                    $parsed .= ft_termParser($token, $stopwords);
+                    $parsed .= ft_termParser($Indexer, $token);
                 }
             }
         }
@@ -728,18 +714,18 @@ function ft_queryParser($query){
  *
  * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  */
-function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
+function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
     $parsed = '';
     if ($consider_asian) {
         // successive asian characters need to be searched as a phrase
         $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
         foreach ($words as $word) {
-            if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true;
-            $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
+            $phrase_mode = $phrase_mode ? true : preg_match('/'.IDX_ASIAN.'/u', $word);
+            $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
         }
     } else {
         $term_noparen = str_replace(array('(', ')'), ' ', $term);
-        $words = idx_tokenizer($term_noparen, $stopwords, true);
+        $words = $Indexer->tokenizer($term_noparen, true);
 
         // W_: no need to highlight
         if (empty($words)) {
@@ -756,4 +742,4 @@ function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode
     return $parsed;
 }
 
-//Setup VIM: ex: et ts=4 enc=utf-8 :
+//Setup VIM: ex: et ts=4 :