From 9b41be2446ea725a496f34b28ac4db84bece57c9 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 29 Dec 2010 03:50:05 -0500 Subject: Indexer v3 Rewrite part two, update uses of indexer --- inc/fulltext.php | 73 ++++++++++++++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 42 deletions(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 7ace3a724..0411b9f99 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){ * @author Kazutaka Miyasaka */ function _ft_pageSearch(&$data) { + $Indexer = idx_get_indexer(); + // parse the given query - $q = ft_queryParser($data['query']); + $q = ft_queryParser($Indexer, $data['query']); $data['highlight'] = $q['highlight']; if (empty($q['parsed_ary'])) return array(); // lookup all words found in the query - $lookup = idx_lookup($q['words']); + $lookup = $Indexer->lookup($q['words']); // get all pages in this dokuwiki site (!: includes nonexistent pages) $pages_all = array(); - foreach (idx_getIndex('page', '') as $id) { - $pages_all[trim($id)] = 0; // base: 0 hit + foreach ($Indexer->getPages() as $id) { + $pages_all[$id] = 0; // base: 0 hit } // process the query @@ -126,15 +128,12 @@ function _ft_pageSearch(&$data) { * evaluates the instructions of the found pages */ function ft_backlinks($id){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - $result = array(); // quick lookup of the pagename + // FIXME use metadata key lookup $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .) + $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) 
$docs = array_keys(ft_resultCombine(array_values($matches))); $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages if(!count($docs)) return $result; @@ -168,17 +167,14 @@ function ft_backlinks($id){ * Aborts after $max found results */ function ft_mediause($id,$max){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - if(!$max) $max = 1; // need to find at least one $result = array(); // quick lookup of the mediafile + // FIXME use metadata key lookup $media = noNS($id); - $matches = idx_lookup(idx_tokenizer($media,$stopwords)); + $matches = idx_lookup(idx_tokenizer($media)); $docs = array_keys(ft_resultCombine(array_values($matches))); if(!count($docs)) return $result; @@ -229,7 +225,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){ } function _ft_pageLookup(&$data){ - global $conf; // split out original parameters $id = $data['id']; if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) { @@ -239,29 +234,27 @@ function _ft_pageLookup(&$data){ $in_ns = $data['in_ns']; $in_title = $data['in_title']; + $cleaned = cleanID($id); - $pages = array_map('rtrim', idx_getIndex('page', '')); - $titles = array_map('rtrim', idx_getIndex('title', '')); - // check for corrupt title index #FS2076 - if(count($pages) != count($titles)){ - $titles = array_fill(0,count($pages),''); - @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php - } - $pages = array_combine($pages, $titles); + $Indexer = idx_get_indexer(); + $page_idx = $Indexer->getPages(); - $cleaned = cleanID($id); + $pages = array(); if ($id !== '' && $cleaned !== '') { - foreach ($pages as $p_id => $p_title) { - if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) && - (!$in_title || (stripos($p_title, $id) === false)) ) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if ((strpos($in_ns ? 
$p_id : noNSorNS($p_id), $cleaned) !== false)) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } + //if ($in_title) + // $titles = $Indexer->lookupKey('title', "*$id*"); } if (isset($ns)) { - foreach (array_keys($pages) as $p_id) { - if (strpos($p_id, $ns) !== 0) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if (strpos($p_id, $ns) === 0) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } } @@ -494,11 +487,7 @@ function ft_resultComplement($args) { * @author Andreas Gohr * @author Kazutaka Miyasaka */ -function ft_queryParser($query){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - +function ft_queryParser($Indexer, $query){ /** * parse a search query and transform it into intermediate representation * @@ -544,7 +533,7 @@ function ft_queryParser($query){ if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { // phrase-include and phrase-exclude $not = $matches[1] ? 
'NOT' : ''; - $parsed = $not.ft_termParser($matches[2], $stopwords, false, true); + $parsed = $not.ft_termParser($Indexer, $matches[2], false, true); } else { // fix incomplete phrase $term = str_replace('"', ' ', $term); @@ -591,10 +580,10 @@ function ft_queryParser($query){ $parsed .= '(N+:'.$matches[1].')'; } elseif (preg_match('/^-(.+)$/', $token, $matches)) { // word-exclude - $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')'; + $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')'; } else { // word-include - $parsed .= ft_termParser($token, $stopwords); + $parsed .= ft_termParser($Indexer, $token); } } } @@ -728,18 +717,18 @@ function ft_queryParser($query){ * * @author Kazutaka Miyasaka */ -function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) { +function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) { $parsed = ''; if ($consider_asian) { // successive asian characters need to be searched as a phrase $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); foreach ($words as $word) { if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true; - $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode); + $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode); } } else { $term_noparen = str_replace(array('(', ')'), ' ', $term); - $words = idx_tokenizer($term_noparen, $stopwords, true); + $words = $Indexer->tokenizer($term_noparen, true); // W_: no need to highlight if (empty($words)) { -- cgit v1.2.3 From 320f489ae6a653f52f9d489b84b9bdd26f4241ac Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sun, 23 Jan 2011 02:00:32 +0100 Subject: Indexer v3 Rewrite: Use the metadata index for backlinks; add INDEXER_METADATA_INDEX event This new event allows plugins to add or modify the metadata that will be indexed. 
Collecting this metadata in an event allows plugins to see if other plugins have already added the metadata they need and leads to just one single indexer call thus fewer files are read and written. Plugins could also replace/prevent the metadata indexer call using this event. --- inc/fulltext.php | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 0411b9f99..35ee4ba34 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -124,26 +124,13 @@ function _ft_pageSearch(&$data) { /** * Returns the backlinks for a given page * - * Does a quick lookup with the fulltext index, then - * evaluates the instructions of the found pages + * Uses the metadata index. */ function ft_backlinks($id){ $result = array(); - // quick lookup of the pagename - // FIXME use metadata key lookup - $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) - $docs = array_keys(ft_resultCombine(array_values($matches))); - $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages - if(!count($docs)) return $result; - - // check metadata for matching links - foreach($docs as $match){ - // metadata relation reference links are already resolved - $links = p_get_metadata($match,'relation references'); - if (isset($links[$id])) $result[] = $match; - } + $result = idx_get_indexer()->lookupKey('relation_references', $id); + $result = $result[$id]; if(!count($result)) return $result; -- cgit v1.2.3 From f078bb0088870b4b68b348d546afa30a80a07e87 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 24 Jan 2011 03:46:11 -0500 Subject: Indexer Rewrite v3: wildcards in lookupKey and automatically unwrap single result --- inc/fulltext.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 35ee4ba34..f477e826e 100644 --- a/inc/fulltext.php +++ 
b/inc/fulltext.php @@ -130,7 +130,6 @@ function ft_backlinks($id){ $result = array(); $result = idx_get_indexer()->lookupKey('relation_references', $id); - $result = $result[$id]; if(!count($result)) return $result; @@ -234,8 +233,12 @@ function _ft_pageLookup(&$data){ $pages[$p_id] = p_get_first_heading($p_id, false); } } - //if ($in_title) - // $titles = $Indexer->lookupKey('title', "*$id*"); + if ($in_title) { + foreach ($Indexer->lookupKey('title', "*$id*") as $p_id) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); + } + } } if (isset($ns)) { foreach ($page_idx as $p_id) { -- cgit v1.2.3 From 52784dd85122f75ca221c53d4fd9dcc98bfd2450 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Thu, 10 Feb 2011 18:51:40 +0100 Subject: do not (re)render metadata in backlinks A page could have possibly hundreds of backlinks; when the cache is outdated they should not all be re-rendered at once --- inc/fulltext.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 0f2414213..bb2647165 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -142,7 +142,7 @@ function ft_backlinks($id){ // check metadata for matching links foreach($docs as $match){ // metadata relation reference links are already resolved - $links = p_get_metadata($match,'relation references'); + $links = p_get_metadata($match,'relation references',false); if (isset($links[$id])) $result[] = $match; } -- cgit v1.2.3 From 7233c152c0a107c0f12dbc09f5493022b264dddb Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Thu, 24 Feb 2011 23:53:51 +0100 Subject: Fix pass by reference error, always return an array in lookupKey() --- inc/fulltext.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 891558f96..8155325ee 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -234,7 +234,8 @@ function 
_ft_pageLookup(&$data){ } } if ($in_title) { - foreach ($Indexer->lookupKey('title', "*$id*") as $p_id) { + $wildcard_id = "*$id*"; + foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) { if (!isset($pages[$p_id])) $pages[$p_id] = p_get_first_heading($p_id, false); } -- cgit v1.2.3