From 506fa8936561993b7f70aa507d0c39a44a6ebab9 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Sun, 4 Sep 2005 00:02:29 +0200 Subject: the search now uses the index darcs-hash:20050903220229-7ad00-5d95f905eaeb3f6b867aa3ee43c2a8bccc533c00.gz --- inc/fulltext.php | 74 +++++++++++++++++++++++++++++++++++++++++++++++++------- inc/html.php | 24 ++++++++++-------- inc/search.php | 3 +++ 3 files changed, 82 insertions(+), 19 deletions(-) (limited to 'inc') diff --git a/inc/fulltext.php b/inc/fulltext.php index 8549a67c1..6c4e148a2 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -14,15 +14,16 @@ * The fulltext search * * Returns a list of matching documents for the given query + * */ -function ft_pageSearch($query){ +function ft_pageSearch($query,&$poswords){ $q = ft_queryParser($query); + // use this for higlighting later: + $poswords = join(' ',$q['and']); + // lookup all words found in the query $words = array_merge($q['and'],$q['not']); - foreach($q['phrases'] as $phrase){ - $words = array_merge($words,$phrase['words']); - } if(!count($words)) return array(); $result = idx_lookup($words); @@ -36,8 +37,7 @@ function ft_pageSearch($query){ $not = array_merge($not,array_keys($result[$w])); } - - // combine and words + // combine and-words if(count($q['and']) > 1){ $docs = ft_resultCombine($q['and']); }else{ @@ -52,7 +52,6 @@ function ft_pageSearch($query){ if(!count($docs)) return array(); - // handle phrases if(count($q['phrases'])){ //build a regexp @@ -63,7 +62,7 @@ function ft_pageSearch($query){ // check the source of all documents for the exact phrases foreach(array_keys($docs) as $id){ $text = utf8_strtolower(rawWiki($id)); - if(!preg_match_all('/'.$regex.'/usi',$text)){ + if(!preg_match('/'.$regex.'/usi',$text)){ unset($docs[$id]); // no hit - remove } } @@ -77,6 +76,63 @@ function ft_pageSearch($query){ return $docs; } +/** + * Quicksearch for pagenames + * + * By default it only matches the pagename and ignores the + * namespace. This can be changed with the second parameter + * + * @author Andreas Gohr + */ +function ft_pageLookup($id,$pageonly=true){ + global $conf; + $id = preg_quote($id,'/'); + $pages = file($conf['cachedir'].'/page.idx'); + $pages = array_values(preg_grep('/'.$id.'/',$pages)); + + $cnt = count($pages); + for($i=0; $i<$cnt; $i++){ + if($pageonly){ + if(!preg_match('/'.$id.'/',noNS($pages[$i]))){ + unset($pages[$i]); + continue; + } + } + if(!@file_exists(wikiFN($pages[$i]))){ + unset($pages[$i]); + continue; + } + } + sort($pages); + return $pages; +} + +/** + * Creates a snippet extract + * + * @author Andreas Gohr + */ +function ft_snippet($id,$poswords){ + $poswords = preg_quote($poswords,'#'); + $re = '('.str_replace(' ','|',$poswords).')'; + $text = rawWiki($id); + //FIXME caseinsensitive matching doesn't work with UTF-8!? + preg_match_all('#(.{0,50})'.$re.'(.{0,50})#iu',$text,$matches,PREG_SET_ORDER); + + $cnt = 0; + $snippet = ''; + foreach($matches as $match){ + $snippet .= '...'.htmlspecialchars($match[1]); + $snippet .= ''; + $snippet .= htmlspecialchars($match[2]); + $snippet .= ''; + $snippet .= htmlspecialchars($match[3]).'... '; + if($cnt++ == 2) break; + } + + return $snippet; +} + /** * Combine found documents and sum up their scores * @@ -144,4 +200,4 @@ function ft_queryParser($query){ return $q; } - +//Setup VIM: ex: et ts=4 enc=utf-8 : diff --git a/inc/html.php b/inc/html.php index dcd11feb1..b73eebf8c 100644 --- a/inc/html.php +++ b/inc/html.php @@ -295,6 +295,7 @@ function html_hilight($html,$query){ */ function html_search(){ require_once(DOKU_INC.'inc/search.php'); + require_once(DOKU_INC.'inc/fulltext.php'); global $conf; global $QUERY; global $ID; @@ -312,14 +313,14 @@ function html_search(){ //do quick pagesearch $data = array(); - search($data,$conf['datadir'],'search_pagename',array(query => cleanID($QUERY))); + $data = ft_pageLookup(cleanID($QUERY)); if(count($data)){ sort($data); print '
'; print ''.$lang[quickhits].':
'; - foreach($data as $row){ + foreach($data as $id){ print '
'; - print html_wikilink(':'.$row['id'],$conf['useheading']?NULL:$row['id']); + print html_wikilink(':'.$id,$conf['useheading']?NULL:$id); print '
'; } //clear float (see http://www.complexspiral.com/publications/containing-floats/) @@ -329,16 +330,19 @@ function html_search(){ flush(); //do fulltext search - $data = array(); - search($data,$conf['datadir'],'search_fulltext',array(query => utf8_strtolower($QUERY))); + $data = ft_pageSearch($QUERY,$poswords); if(count($data)){ - usort($data,'sort_search_fulltext'); - foreach($data as $row){ + $num = 1; + foreach($data as $id => $cnt){ print '
'; - print html_wikilink(':'.$row['id'],$conf['useheading']?NULL:$row['id'],$row['poswords']); - print ': '.$row['count'].' '.$lang['hits'].'
'; - print '
'.$row['snippet'].'
'; + print html_wikilink(':'.$id,$conf['useheading']?NULL:$id,$poswords); + print ': '.$cnt.' '.$lang['hits'].'
'; + if($num < 15){ // create snippets for the first number of matches only #FIXME add to conf ? + print '
'.ft_snippet($id,$poswords).'
'; + } print '
'; + flush(); + $num++; } }else{ print '
'.$lang['nothingfound'].'
'; diff --git a/inc/search.php b/inc/search.php index 3604db15e..ea20c4f3b 100644 --- a/inc/search.php +++ b/inc/search.php @@ -283,6 +283,7 @@ function search_backlinks(&$data,$base,$file,$type,$lvl,$opts){ * $opts['query'] is the search query * * @author Andreas Gohr + * @deprecated - fulltext indexer is used instead */ function search_fulltext(&$data,$base,$file,$type,$lvl,$opts){ //we do nothing with directories @@ -383,6 +384,8 @@ function search_reference(&$data,$base,$file,$type,$lvl,$opts){ * * @author Andreas Gohr * @author Matthias Grimm + * + * @deprecated - fulltext indexer is used instead */ function search_regex(&$data,$base,$file,$reg,$words){ -- cgit v1.2.3