diff options
Diffstat (limited to 'inc/fulltext.php')
-rw-r--r-- | inc/fulltext.php | 102 |
1 files changed, 44 insertions, 58 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index 7ace3a724..8155325ee 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){ * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ function _ft_pageSearch(&$data) { + $Indexer = idx_get_indexer(); + // parse the given query - $q = ft_queryParser($data['query']); + $q = ft_queryParser($Indexer, $data['query']); $data['highlight'] = $q['highlight']; if (empty($q['parsed_ary'])) return array(); // lookup all words found in the query - $lookup = idx_lookup($q['words']); + $lookup = $Indexer->lookup($q['words']); // get all pages in this dokuwiki site (!: includes nonexistent pages) $pages_all = array(); - foreach (idx_getIndex('page', '') as $id) { - $pages_all[trim($id)] = 0; // base: 0 hit + foreach ($Indexer->getPages() as $id) { + $pages_all[$id] = 0; // base: 0 hit } // process the query @@ -122,29 +124,12 @@ function _ft_pageSearch(&$data) { /** * Returns the backlinks for a given page * - * Does a quick lookup with the fulltext index, then - * evaluates the instructions of the found pages + * Uses the metadata index. */ function ft_backlinks($id){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - $result = array(); - // quick lookup of the pagename - $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .) - $docs = array_keys(ft_resultCombine(array_values($matches))); - $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages - if(!count($docs)) return $result; - - // check metadata for matching links - foreach($docs as $match){ - // metadata relation reference links are already resolved - $links = p_get_metadata($match,'relation references'); - if (isset($links[$id])) $result[] = $match; - } + $result = idx_get_indexer()->lookupKey('relation_references', $id); if(!count($result)) return $result; @@ -168,17 +153,14 @@ function ft_backlinks($id){ * Aborts after $max found results */ function ft_mediause($id,$max){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - if(!$max) $max = 1; // need to find at least one $result = array(); // quick lookup of the mediafile + // FIXME use metadata key lookup $media = noNS($id); - $matches = idx_lookup(idx_tokenizer($media,$stopwords)); + $matches = idx_lookup(idx_tokenizer($media)); $docs = array_keys(ft_resultCombine(array_values($matches))); if(!count($docs)) return $result; @@ -229,7 +211,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){ } function _ft_pageLookup(&$data){ - global $conf; // split out original parameters $id = $data['id']; if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) { @@ -239,29 +220,32 @@ function _ft_pageLookup(&$data){ $in_ns = $data['in_ns']; $in_title = $data['in_title']; + $cleaned = cleanID($id); - $pages = array_map('rtrim', idx_getIndex('page', '')); - $titles = array_map('rtrim', idx_getIndex('title', '')); - // check for corrupt title index #FS2076 - if(count($pages) != count($titles)){ - $titles = array_fill(0,count($pages),''); - @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php - } - $pages = array_combine($pages, $titles); + $Indexer = idx_get_indexer(); + $page_idx = $Indexer->getPages(); - $cleaned = cleanID($id); + $pages = array(); if ($id !== '' && $cleaned !== '') { - foreach ($pages as $p_id => $p_title) { - if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) && - (!$in_title || (stripos($p_title, $id) === false)) ) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); + } + } + if ($in_title) { + $wildcard_id = "*$id*"; + foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } } if (isset($ns)) { - foreach (array_keys($pages) as $p_id) { - if (strpos($p_id, $ns) !== 0) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if (strpos($p_id, $ns) === 0) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } } @@ -304,6 +288,7 @@ function ft_pagesorter($a, $b){ */ function ft_snippet($id,$highlight){ $text = rawWiki($id); + $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens $evdata = array( 'id' => $id, 'text' => &$text, @@ -396,6 +381,11 @@ function ft_snippet($id,$highlight){ * Wraps a search term in regex boundary checks. */ function ft_snippet_re_preprocess($term) { + // do not process asian terms where word boundaries are not explicit + if(preg_match('/'.IDX_ASIAN.'/u',$term)){ + return $term; + } + if(substr($term,0,2) == '\\*'){ $term = substr($term,2); }else{ @@ -494,11 +484,7 @@ function ft_resultComplement($args) { * @author Andreas Gohr <andi@splitbrain.org> * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ -function ft_queryParser($query){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - +function ft_queryParser($Indexer, $query){ /** * parse a search query and transform it into intermediate representation * @@ -544,7 +530,7 @@ function ft_queryParser($query){ if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { // phrase-include and phrase-exclude $not = $matches[1] ? 'NOT' : ''; - $parsed = $not.ft_termParser($matches[2], $stopwords, false, true); + $parsed = $not.ft_termParser($Indexer, $matches[2], false, true); } else { // fix incomplete phrase $term = str_replace('"', ' ', $term); @@ -591,10 +577,10 @@ function ft_queryParser($query){ $parsed .= '(N+:'.$matches[1].')'; } elseif (preg_match('/^-(.+)$/', $token, $matches)) { // word-exclude - $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')'; + $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')'; } else { // word-include - $parsed .= ft_termParser($token, $stopwords); + $parsed .= ft_termParser($Indexer, $token); } } } @@ -728,18 +714,18 @@ function ft_queryParser($query){ * * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ -function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) { +function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) { $parsed = ''; if ($consider_asian) { // successive asian characters need to be searched as a phrase $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); foreach ($words as $word) { - if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true; - $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode); + $phrase_mode = $phrase_mode ? true : preg_match('/'.IDX_ASIAN.'/u', $word); + $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode); } } else { $term_noparen = str_replace(array('(', ')'), ' ', $term); - $words = idx_tokenizer($term_noparen, $stopwords, true); + $words = $Indexer->tokenizer($term_noparen, true); // W_: no need to highlight if (empty($words)) { @@ -756,4 +742,4 @@ function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode return $parsed; } -//Setup VIM: ex: et ts=4 enc=utf-8 : +//Setup VIM: ex: et ts=4 : |