<?php
/**
 * Fulltext search functions using the index
 *
 * Origin: commit f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694
 * ("new fulltext search function using the index", Sun, 28 Aug 2005
 * 17:28:21 +0200), which created inc/fulltext.php. At that point the new
 * search function was added but not yet integrated into DokuWiki's
 * interface.
 *
 * darcs-hash:20050828152821-7ad00-a6e79a9dc5aaf41c547cf42dccdbc3b5bc8d303e.gz
 *
 * @author Andreas Gohr
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_INC.'inc/indexer.php');


/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The query is parsed by ft_queryParser(). All positive words have to be
 * found in a document (logical AND), documents containing a negated word
 * are dropped, and - if the query contained quoted phrases - only documents
 * whose raw text contains at least one of the phrases are kept.
 *
 * NOTE(review): idx_lookup() is defined in inc/indexer.php; this code
 * assumes it returns an array keyed by word, each entry being an array of
 * document id => hit count - confirm against the indexer.
 *
 * @param  string $query the search query as entered by the user
 * @return array  document id => summed score, highest score first; empty
 *                array when nothing matches
 */
function ft_pageSearch($query){
    $q = ft_queryParser($query);

    // Without at least one positive word there is no candidate set the
    // negated words or phrases could be applied to.
    if(!count($q['and'])) return array();

    // lookup all words found in the query (the tokens of quoted phrases
    // were already added to $q['and'] by the parser)
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // merge search results with query: one result list per positive word
    $and = array();
    foreach($q['and'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $and[] = $result[$w];
        }else{
            $and[] = array(); // unknown word - nothing can match the AND
        }
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and words
    $docs = ft_resultCombine($and);
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }
    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp; '/' is passed to preg_quote() because it is the
        // delimiter used below and is not escaped by default
        $quoted = array();
        foreach($q['phrases'] as $phrase){
            $quoted[] = preg_quote(utf8_strtolower($phrase),'/');
        }
        $regex = '/('.join('|',$quoted).')/usi';

        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            if(!preg_match($regex,$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned; the score
 * of a returned document is the sum of its scores in every array.
 *
 * A single array is returned unchanged, an empty list of arrays yields
 * an empty result.
 *
 * @param  array $args An array of page arrays (document id => score)
 * @return array document id => summed score
 */
function ft_resultCombine($args){
    $args        = array_values($args); // make sure the lists are indexed 0..n-1
    $array_count = count($args);
    if($array_count == 0) return array();

    $result = array();
    foreach($args[0] as $key => $value){
        $score = $value;
        for($i = 1; $i < $array_count; $i++){
            if(!array_key_exists($key,$args[$i])){
                continue 2; // missing in one list - not an AND match
            }
            $score += $args[$i][$key];
        }
        $result[$key] = $score;
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * Quoted parts of the query ("like this") are treated as phrases, words
 * prefixed with a minus sign are negated, everything else is a positive
 * word. Words are run through idx_tokenizer() together with the stopword
 * list of the configured language (if such a list exists).
 *
 * The returned array has the following keys:
 *   'query'   - the query as passed in
 *   'phrases' - the quoted phrases, without the surrounding quotes
 *   'and'     - tokens that have to be found (includes the phrase tokens)
 *   'not'     - tokens that must not be found
 *
 * @todo   support OR and parentheses?
 * @param  string $query the search query
 * @return array  the parsed query as described above
 */
function ft_queryParser($query){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array(); // file could not be read
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // keep the phrase text only - the surrounding quotes are query
        // syntax and must not be searched for; an empty phrase would
        // match every document and is ignored
        if($match[1] !== '') $q['phrases'][] = $match[1];
        // append (not replace) so words of earlier phrases are kept
        $token = idx_tokenizer($match[0],$stopwords);
        if(count($token)) $q['and'] = array_merge($q['and'],$token);
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        if($w === '') continue; // caused by repeated, leading or trailing spaces

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if($w[0] == '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}