summary | refs | log | tree | commit | diff
path: root/inc/fulltext.php
diff options
context:
space:
mode:
Diffstat (limited to 'inc/fulltext.php')
-rw-r--r-- inc/fulltext.php 98
1 files changed, 45 insertions, 53 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 943a5d401..8155325ee 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
function _ft_pageSearch(&$data) {
+ $Indexer = idx_get_indexer();
+
// parse the given query
- $q = ft_queryParser($data['query']);
+ $q = ft_queryParser($Indexer, $data['query']);
$data['highlight'] = $q['highlight'];
if (empty($q['parsed_ary'])) return array();
// lookup all words found in the query
- $lookup = idx_lookup($q['words']);
+ $lookup = $Indexer->lookup($q['words']);
// get all pages in this dokuwiki site (!: includes nonexistent pages)
$pages_all = array();
- foreach (idx_getIndex('page', '') as $id) {
- $pages_all[trim($id)] = 0; // base: 0 hit
+ foreach ($Indexer->getPages() as $id) {
+ $pages_all[$id] = 0; // base: 0 hit
}
// process the query
@@ -122,29 +124,12 @@ function _ft_pageSearch(&$data) {
/**
* Returns the backlinks for a given page
*
- * Does a quick lookup with the fulltext index, then
- * evaluates the instructions of the found pages
+ * Uses the metadata index.
*/
function ft_backlinks($id){
- global $conf;
- $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
- $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
$result = array();
- // quick lookup of the pagename
- $page = noNS($id);
- $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .)
- $docs = array_keys(ft_resultCombine(array_values($matches)));
- $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages
- if(!count($docs)) return $result;
-
- // check metadata for matching links
- foreach($docs as $match){
- // metadata relation reference links are already resolved
- $links = p_get_metadata($match,'relation references');
- if (isset($links[$id])) $result[] = $match;
- }
+ $result = idx_get_indexer()->lookupKey('relation_references', $id);
if(!count($result)) return $result;
@@ -168,17 +153,14 @@ function ft_backlinks($id){
* Aborts after $max found results
*/
function ft_mediause($id,$max){
- global $conf;
- $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
- $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
if(!$max) $max = 1; // need to find at least one
$result = array();
// quick lookup of the mediafile
+ // FIXME use metadata key lookup
$media = noNS($id);
- $matches = idx_lookup(idx_tokenizer($media,$stopwords));
+ $matches = idx_lookup(idx_tokenizer($media));
$docs = array_keys(ft_resultCombine(array_values($matches)));
if(!count($docs)) return $result;
@@ -238,24 +220,32 @@ function _ft_pageLookup(&$data){
$in_ns = $data['in_ns'];
$in_title = $data['in_title'];
+ $cleaned = cleanID($id);
- $pages = array_map('rtrim', idx_getIndex('page', ''));
- $titles = array_map('rtrim', idx_getIndex('title', ''));
- $pages = array_combine($pages, $titles);
+ $Indexer = idx_get_indexer();
+ $page_idx = $Indexer->getPages();
- $cleaned = cleanID($id);
+ $pages = array();
if ($id !== '' && $cleaned !== '') {
- foreach ($pages as $p_id => $p_title) {
- if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) &&
- (!$in_title || (stripos($p_title, $id) === false)) ) {
- unset($pages[$p_id]);
+ foreach ($page_idx as $p_id) {
+ if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
+ if (!isset($pages[$p_id]))
+ $pages[$p_id] = p_get_first_heading($p_id, false);
+ }
+ }
+ if ($in_title) {
+ $wildcard_id = "*$id*";
+ foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) {
+ if (!isset($pages[$p_id]))
+ $pages[$p_id] = p_get_first_heading($p_id, false);
}
}
}
if (isset($ns)) {
- foreach (array_keys($pages) as $p_id) {
- if (strpos($p_id, $ns) !== 0) {
- unset($pages[$p_id]);
+ foreach ($page_idx as $p_id) {
+ if (strpos($p_id, $ns) === 0) {
+ if (!isset($pages[$p_id]))
+ $pages[$p_id] = p_get_first_heading($p_id, false);
}
}
}
@@ -270,7 +260,7 @@ function _ft_pageLookup(&$data){
}
}
- uasort($pages,'ft_pagesorter');
+ uksort($pages,'ft_pagesorter');
return $pages;
}
@@ -298,6 +288,7 @@ function ft_pagesorter($a, $b){
*/
function ft_snippet($id,$highlight){
$text = rawWiki($id);
+ $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
$evdata = array(
'id' => $id,
'text' => &$text,
@@ -390,6 +381,11 @@ function ft_snippet($id,$highlight){
* Wraps a search term in regex boundary checks.
*/
function ft_snippet_re_preprocess($term) {
+ // do not process asian terms where word boundaries are not explicit
+ if(preg_match('/'.IDX_ASIAN.'/u',$term)){
+ return $term;
+ }
+
if(substr($term,0,2) == '\\*'){
$term = substr($term,2);
}else{
@@ -488,11 +484,7 @@ function ft_resultComplement($args) {
* @author Andreas Gohr <andi@splitbrain.org>
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
-function ft_queryParser($query){
- global $conf;
- $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
- $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
+function ft_queryParser($Indexer, $query){
/**
* parse a search query and transform it into intermediate representation
*
@@ -538,7 +530,7 @@ function ft_queryParser($query){
if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
// phrase-include and phrase-exclude
$not = $matches[1] ? 'NOT' : '';
- $parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
+ $parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
} else {
// fix incomplete phrase
$term = str_replace('"', ' ', $term);
@@ -585,10 +577,10 @@ function ft_queryParser($query){
$parsed .= '(N+:'.$matches[1].')';
} elseif (preg_match('/^-(.+)$/', $token, $matches)) {
// word-exclude
- $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
+ $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
} else {
// word-include
- $parsed .= ft_termParser($token, $stopwords);
+ $parsed .= ft_termParser($Indexer, $token);
}
}
}
@@ -722,18 +714,18 @@ function ft_queryParser($query){
*
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
-function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
+function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
$parsed = '';
if ($consider_asian) {
// successive asian characters need to be searched as a phrase
$words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
foreach ($words as $word) {
- if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true;
- $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
+ $phrase_mode = $phrase_mode ? true : preg_match('/'.IDX_ASIAN.'/u', $word);
+ $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
}
} else {
$term_noparen = str_replace(array('(', ')'), ' ', $term);
- $words = idx_tokenizer($term_noparen, $stopwords, true);
+ $words = $Indexer->tokenizer($term_noparen, true);
// W_: no need to highlight
if (empty($words)) {
@@ -750,4 +742,4 @@ function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode
return $parsed;
}
-//Setup VIM: ex: et ts=4 enc=utf-8 :
+//Setup VIM: ex: et ts=4 :