From 4987233dfc97c0b3bdfa620a25fc3ee5775f42a5 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Sun, 24 Oct 2010 10:08:30 +0200 Subject: detect corrupt title indexes FS#2076 --- inc/fulltext.php | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 943a5d401..8cdfac42e 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -241,6 +241,11 @@ function _ft_pageLookup(&$data){ $pages = array_map('rtrim', idx_getIndex('page', '')); $titles = array_map('rtrim', idx_getIndex('title', '')); + // check for corrupt title index #FS2076 + if(count($pages) != count($titles)){ + $titles = array_fill(0,count($pages),''); + @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php + } $pages = array_combine($pages, $titles); $cleaned = cleanID($id); -- cgit v1.2.3 From 7f97a9005507f048c61d44adc6ca954fcb884541 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Sun, 24 Oct 2010 12:27:01 +0200 Subject: fixed missing global --- inc/fulltext.php | 1 + 1 file changed, 1 insertion(+) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 8cdfac42e..0b9798eee 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -229,6 +229,7 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){ } function _ft_pageLookup(&$data){ + global $conf; // split out original parameters $id = $data['id']; if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) { -- cgit v1.2.3 From 3d2017d9ca9f1e056a33fea3dd482747e901409f Mon Sep 17 00:00:00 2001 From: Adrian Lang Date: Sun, 24 Oct 2010 20:31:23 +0200 Subject: Fix sorting of quicksearch results --- inc/fulltext.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 0b9798eee..7ace3a724 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -276,7 +276,7 @@ function _ft_pageLookup(&$data){ } } - uasort($pages,'ft_pagesorter'); + 
uksort($pages,'ft_pagesorter'); return $pages; } -- cgit v1.2.3 From e3776c06c37cc197709dac60892604dfea894ac2 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Mon, 29 Nov 2010 01:34:36 +0100 Subject: Remove enc=utf-8 in VIM modeline as it is not allowed in VIM 7.3 As of VIM 7.3 it is no longer possible to specify the encoding in the modeline. This gives an error message whenever such a file is opened, thus this commit removes the enc setting from the modeline. --- inc/fulltext.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 7ace3a724..59a9c1d96 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -756,4 +756,4 @@ function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode return $parsed; } -//Setup VIM: ex: et ts=4 enc=utf-8 : +//Setup VIM: ex: et ts=4 : -- cgit v1.2.3 From 35594613ea01a868e958663dc211bce021df23f4 Mon Sep 17 00:00:00 2001 From: Kazutaka Miyasaka Date: Tue, 14 Dec 2010 01:52:20 +0900 Subject: Fixed Asian search term handling in ft_snippet_re_preprocess() --- inc/fulltext.php | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 59a9c1d96..457ad1baf 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -396,6 +396,11 @@ function ft_snippet($id,$highlight){ * Wraps a search term in regex boundary checks. 
*/ function ft_snippet_re_preprocess($term) { + // do not process asian terms where word boundaries are not explicit + if(preg_match('/'.IDX_ASIAN.'/u',$term)){ + return $term; + } + if(substr($term,0,2) == '\\*'){ $term = substr($term,2); }else{ -- cgit v1.2.3 From 6ac2077a96f206f26714699c001c72f914bf7970 Mon Sep 17 00:00:00 2001 From: Kazutaka Miyasaka Date: Tue, 14 Dec 2010 01:38:54 +0900 Subject: Removed unnecessary regexp execution in ft_termParser() --- inc/fulltext.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 457ad1baf..be3938cac 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -739,7 +739,7 @@ function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode // successive asian characters need to be searched as a phrase $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); foreach ($words as $word) { - if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true; + $phrase_mode = $phrase_mode ? 
true : preg_match('/'.IDX_ASIAN.'/u', $word); $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode); } } else { -- cgit v1.2.3 From 9b41be2446ea725a496f34b28ac4db84bece57c9 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 29 Dec 2010 03:50:05 -0500 Subject: Indexer v3 Rewrite part two, update uses of indexer --- inc/fulltext.php | 73 ++++++++++++++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 42 deletions(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 7ace3a724..0411b9f99 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){ * @author Kazutaka Miyasaka */ function _ft_pageSearch(&$data) { + $Indexer = idx_get_indexer(); + // parse the given query - $q = ft_queryParser($data['query']); + $q = ft_queryParser($Indexer, $data['query']); $data['highlight'] = $q['highlight']; if (empty($q['parsed_ary'])) return array(); // lookup all words found in the query - $lookup = idx_lookup($q['words']); + $lookup = $Indexer->lookup($q['words']); // get all pages in this dokuwiki site (!: includes nonexistent pages) $pages_all = array(); - foreach (idx_getIndex('page', '') as $id) { - $pages_all[trim($id)] = 0; // base: 0 hit + foreach ($Indexer->getPages() as $id) { + $pages_all[$id] = 0; // base: 0 hit } // process the query @@ -126,15 +128,12 @@ function _ft_pageSearch(&$data) { * evaluates the instructions of the found pages */ function ft_backlinks($id){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - $result = array(); // quick lookup of the pagename + // FIXME use metadata key lookup $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .) + $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) 
$docs = array_keys(ft_resultCombine(array_values($matches))); $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages if(!count($docs)) return $result; @@ -168,17 +167,14 @@ function ft_backlinks($id){ * Aborts after $max found results */ function ft_mediause($id,$max){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - if(!$max) $max = 1; // need to find at least one $result = array(); // quick lookup of the mediafile + // FIXME use metadata key lookup $media = noNS($id); - $matches = idx_lookup(idx_tokenizer($media,$stopwords)); + $matches = idx_lookup(idx_tokenizer($media)); $docs = array_keys(ft_resultCombine(array_values($matches))); if(!count($docs)) return $result; @@ -229,7 +225,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){ } function _ft_pageLookup(&$data){ - global $conf; // split out original parameters $id = $data['id']; if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) { @@ -239,29 +234,27 @@ function _ft_pageLookup(&$data){ $in_ns = $data['in_ns']; $in_title = $data['in_title']; + $cleaned = cleanID($id); - $pages = array_map('rtrim', idx_getIndex('page', '')); - $titles = array_map('rtrim', idx_getIndex('title', '')); - // check for corrupt title index #FS2076 - if(count($pages) != count($titles)){ - $titles = array_fill(0,count($pages),''); - @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php - } - $pages = array_combine($pages, $titles); + $Indexer = idx_get_indexer(); + $page_idx = $Indexer->getPages(); - $cleaned = cleanID($id); + $pages = array(); if ($id !== '' && $cleaned !== '') { - foreach ($pages as $p_id => $p_title) { - if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) && - (!$in_title || (stripos($p_title, $id) === false)) ) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if ((strpos($in_ns ? 
$p_id : noNSorNS($p_id), $cleaned) !== false)) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } + //if ($in_title) + // $titles = $Indexer->lookupKey('title', "*$id*"); } if (isset($ns)) { - foreach (array_keys($pages) as $p_id) { - if (strpos($p_id, $ns) !== 0) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if (strpos($p_id, $ns) === 0) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } } @@ -494,11 +487,7 @@ function ft_resultComplement($args) { * @author Andreas Gohr * @author Kazutaka Miyasaka */ -function ft_queryParser($query){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - +function ft_queryParser($Indexer, $query){ /** * parse a search query and transform it into intermediate representation * @@ -544,7 +533,7 @@ function ft_queryParser($query){ if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { // phrase-include and phrase-exclude $not = $matches[1] ? 
'NOT' : ''; - $parsed = $not.ft_termParser($matches[2], $stopwords, false, true); + $parsed = $not.ft_termParser($Indexer, $matches[2], false, true); } else { // fix incomplete phrase $term = str_replace('"', ' ', $term); @@ -591,10 +580,10 @@ function ft_queryParser($query){ $parsed .= '(N+:'.$matches[1].')'; } elseif (preg_match('/^-(.+)$/', $token, $matches)) { // word-exclude - $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')'; + $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')'; } else { // word-include - $parsed .= ft_termParser($token, $stopwords); + $parsed .= ft_termParser($Indexer, $token); } } } @@ -728,18 +717,18 @@ function ft_queryParser($query){ * * @author Kazutaka Miyasaka */ -function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) { +function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) { $parsed = ''; if ($consider_asian) { // successive asian characters need to be searched as a phrase $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); foreach ($words as $word) { if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true; - $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode); + $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode); } } else { $term_noparen = str_replace(array('(', ')'), ' ', $term); - $words = idx_tokenizer($term_noparen, $stopwords, true); + $words = $Indexer->tokenizer($term_noparen, true); // W_: no need to highlight if (empty($words)) { -- cgit v1.2.3 From 320f489ae6a653f52f9d489b84b9bdd26f4241ac Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sun, 23 Jan 2011 02:00:32 +0100 Subject: Indexer v3 Rewrite: Use the metadata index for backlinks; add INDEXER_METADATA_INDEX event This new event allows plugins to add or modify the metadata that will be indexed. 
Collecting this metadata in an event allows plugins to see if other plugins have already added the metadata they need and leads to just one single indexer call thus fewer files are read and written. Plugins could also replace/prevent the metadata indexer call using this event. --- inc/fulltext.php | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 0411b9f99..35ee4ba34 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -124,26 +124,13 @@ function _ft_pageSearch(&$data) { /** * Returns the backlinks for a given page * - * Does a quick lookup with the fulltext index, then - * evaluates the instructions of the found pages + * Uses the metadata index. */ function ft_backlinks($id){ $result = array(); - // quick lookup of the pagename - // FIXME use metadata key lookup - $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) - $docs = array_keys(ft_resultCombine(array_values($matches))); - $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages - if(!count($docs)) return $result; - - // check metadata for matching links - foreach($docs as $match){ - // metadata relation reference links are already resolved - $links = p_get_metadata($match,'relation references'); - if (isset($links[$id])) $result[] = $match; - } + $result = idx_get_indexer()->lookupKey('relation_references', $id); + $result = $result[$id]; if(!count($result)) return $result; -- cgit v1.2.3 From f078bb0088870b4b68b348d546afa30a80a07e87 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 24 Jan 2011 03:46:11 -0500 Subject: Indexer Rewrite v3: wildcards in lookupKey and automatically unwrap single result --- inc/fulltext.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 35ee4ba34..f477e826e 100644 --- a/inc/fulltext.php +++ 
b/inc/fulltext.php @@ -130,7 +130,6 @@ function ft_backlinks($id){ $result = array(); $result = idx_get_indexer()->lookupKey('relation_references', $id); - $result = $result[$id]; if(!count($result)) return $result; @@ -234,8 +233,12 @@ function _ft_pageLookup(&$data){ $pages[$p_id] = p_get_first_heading($p_id, false); } } - //if ($in_title) - // $titles = $Indexer->lookupKey('title', "*$id*"); + if ($in_title) { + foreach ($Indexer->lookupKey('title', "*$id*") as $p_id) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); + } + } } if (isset($ns)) { foreach ($page_idx as $p_id) { -- cgit v1.2.3 From 4f0030dd466f56b3dc0c864656fb1bf0e76d2932 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Sun, 6 Feb 2011 19:07:31 +0100 Subject: ignore soft-hyphens for search FS#2049 This makes it possible to find words that include soft-hyphens. However, search highlighting will not work and I have no idea how to make it work. --- inc/fulltext.php | 1 + 1 file changed, 1 insertion(+) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index be3938cac..0f2414213 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -304,6 +304,7 @@ function ft_pagesorter($a, $b){ */ function ft_snippet($id,$highlight){ $text = rawWiki($id); + $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens $evdata = array( 'id' => $id, 'text' => &$text, -- cgit v1.2.3 From 52784dd85122f75ca221c53d4fd9dcc98bfd2450 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Thu, 10 Feb 2011 18:51:40 +0100 Subject: do not (re)render metadata in backlinks A page could have possibly hundreds of backlinks, when the cache is outdated they should not be rerendered at once --- inc/fulltext.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 0f2414213..bb2647165 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -142,7 +142,7 @@ function ft_backlinks($id){ // 
check metadata for matching links foreach($docs as $match){ // metadata relation reference links are already resolved - $links = p_get_metadata($match,'relation references'); + $links = p_get_metadata($match,'relation references',false); if (isset($links[$id])) $result[] = $match; } -- cgit v1.2.3 From 7233c152c0a107c0f12dbc09f5493022b264dddb Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Thu, 24 Feb 2011 23:53:51 +0100 Subject: Fix pass by reference error, always return an array in lookupKey() --- inc/fulltext.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 891558f96..8155325ee 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -234,7 +234,8 @@ function _ft_pageLookup(&$data){ } } if ($in_title) { - foreach ($Indexer->lookupKey('title', "*$id*") as $p_id) { + $wildcard_id = "*$id*"; + foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) { if (!isset($pages[$p_id])) $pages[$p_id] = p_get_first_heading($p_id, false); } -- cgit v1.2.3