summary | refs | log | tree | commit | diff
path: root/inc/fulltext.php
diff options
context:
space:
mode:
Diffstat (limited to 'inc/fulltext.php')
-rw-r--r-- inc/fulltext.php 88
1 files changed, 32 insertions, 56 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index be3938cac..805da2e37 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
function _ft_pageSearch(&$data) {
+ $Indexer = idx_get_indexer();
+
// parse the given query
- $q = ft_queryParser($data['query']);
+ $q = ft_queryParser($Indexer, $data['query']);
$data['highlight'] = $q['highlight'];
if (empty($q['parsed_ary'])) return array();
// lookup all words found in the query
- $lookup = idx_lookup($q['words']);
+ $lookup = $Indexer->lookup($q['words']);
// get all pages in this dokuwiki site (!: includes nonexistent pages)
$pages_all = array();
- foreach (idx_getIndex('page', '') as $id) {
- $pages_all[trim($id)] = 0; // base: 0 hit
+ foreach ($Indexer->getPages() as $id) {
+ $pages_all[$id] = 0; // base: 0 hit
}
// process the query
@@ -122,29 +124,13 @@ function _ft_pageSearch(&$data) {
/**
* Returns the backlinks for a given page
*
- * Does a quick lookup with the fulltext index, then
- * evaluates the instructions of the found pages
+ * Uses the metadata index.
*/
function ft_backlinks($id){
- global $conf;
- $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
- $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
$result = array();
- // quick lookup of the pagename
- $page = noNS($id);
- $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .)
- $docs = array_keys(ft_resultCombine(array_values($matches)));
- $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages
- if(!count($docs)) return $result;
-
- // check metadata for matching links
- foreach($docs as $match){
- // metadata relation reference links are already resolved
- $links = p_get_metadata($match,'relation references');
- if (isset($links[$id])) $result[] = $match;
- }
+ $result = idx_get_indexer()->lookupKey('relation_references', $id);
+ $result = $result[$id];
if(!count($result)) return $result;
@@ -168,17 +154,14 @@ function ft_backlinks($id){
* Aborts after $max found results
*/
function ft_mediause($id,$max){
- global $conf;
- $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
- $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
if(!$max) $max = 1; // need to find at least one
$result = array();
// quick lookup of the mediafile
+ // FIXME use metadata key lookup
$media = noNS($id);
- $matches = idx_lookup(idx_tokenizer($media,$stopwords));
+ $matches = idx_lookup(idx_tokenizer($media));
$docs = array_keys(ft_resultCombine(array_values($matches)));
if(!count($docs)) return $result;
@@ -229,7 +212,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){
}
function _ft_pageLookup(&$data){
- global $conf;
// split out original parameters
$id = $data['id'];
if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) {
@@ -239,29 +221,27 @@ function _ft_pageLookup(&$data){
$in_ns = $data['in_ns'];
$in_title = $data['in_title'];
+ $cleaned = cleanID($id);
- $pages = array_map('rtrim', idx_getIndex('page', ''));
- $titles = array_map('rtrim', idx_getIndex('title', ''));
- // check for corrupt title index #FS2076
- if(count($pages) != count($titles)){
- $titles = array_fill(0,count($pages),'');
- @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php
- }
- $pages = array_combine($pages, $titles);
+ $Indexer = idx_get_indexer();
+ $page_idx = $Indexer->getPages();
- $cleaned = cleanID($id);
+ $pages = array();
if ($id !== '' && $cleaned !== '') {
- foreach ($pages as $p_id => $p_title) {
- if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) &&
- (!$in_title || (stripos($p_title, $id) === false)) ) {
- unset($pages[$p_id]);
+ foreach ($page_idx as $p_id) {
+ if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
+ if (!isset($pages[$p_id]))
+ $pages[$p_id] = p_get_first_heading($p_id, false);
}
}
+ //if ($in_title)
+ // $titles = $Indexer->lookupKey('title', "*$id*");
}
if (isset($ns)) {
- foreach (array_keys($pages) as $p_id) {
- if (strpos($p_id, $ns) !== 0) {
- unset($pages[$p_id]);
+ foreach ($page_idx as $p_id) {
+ if (strpos($p_id, $ns) === 0) {
+ if (!isset($pages[$p_id]))
+ $pages[$p_id] = p_get_first_heading($p_id, false);
}
}
}
@@ -499,11 +479,7 @@ function ft_resultComplement($args) {
* @author Andreas Gohr <andi@splitbrain.org>
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
-function ft_queryParser($query){
- global $conf;
- $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
- $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
+function ft_queryParser($Indexer, $query){
/**
* parse a search query and transform it into intermediate representation
*
@@ -549,7 +525,7 @@ function ft_queryParser($query){
if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
// phrase-include and phrase-exclude
$not = $matches[1] ? 'NOT' : '';
- $parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
+ $parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
} else {
// fix incomplete phrase
$term = str_replace('"', ' ', $term);
@@ -596,10 +572,10 @@ function ft_queryParser($query){
$parsed .= '(N+:'.$matches[1].')';
} elseif (preg_match('/^-(.+)$/', $token, $matches)) {
// word-exclude
- $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
+ $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
} else {
// word-include
- $parsed .= ft_termParser($token, $stopwords);
+ $parsed .= ft_termParser($Indexer, $token);
}
}
}
@@ -733,18 +709,18 @@ function ft_queryParser($query){
*
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
-function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
+function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
$parsed = '';
if ($consider_asian) {
// successive asian characters need to be searched as a phrase
$words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
foreach ($words as $word) {
$phrase_mode = $phrase_mode ? true : preg_match('/'.IDX_ASIAN.'/u', $word);
- $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
+ $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
}
} else {
$term_noparen = str_replace(array('(', ')'), ' ', $term);
- $words = idx_tokenizer($term_noparen, $stopwords, true);
+ $words = $Indexer->tokenizer($term_noparen, true);
// W_: no need to highlight
if (empty($words)) {