From 60c15d7deb9c53bcb1cf7881f441744bb29a6b63 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Fri, 15 Feb 2008 18:46:53 +0100 Subject: better highlighting for phrase searches FS#1193 This patch makes the highlighting of phrases in search snippets and on the pages itself much better. Now a regexp gets passed to the ?s darcs-hash:20080215174653-7ad00-cd2d6f7d408db7b7dd3cb9974c3eb27f3a9baeac.gz --- inc/fulltext.php | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'inc/fulltext.php') diff --git a/inc/fulltext.php b/inc/fulltext.php index 3131b7433..b10cbde8e 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -23,10 +23,11 @@ function ft_preg_quote_cb($string){ * Returns a list of matching documents for the given query * */ -function ft_pageSearch($query,&$poswords){ +function ft_pageSearch($query,&$regex){ $q = ft_queryParser($query); - // use this for higlighting later: - $poswords = str_replace('*','',join(' ',$q['and'])); + + // remember for hilighting later + $regex = str_replace('*','',join('|',$q['words'])); // lookup all words found in the query $words = array_merge($q['and'],$q['not']); @@ -78,6 +79,9 @@ function ft_pageSearch($query,&$poswords){ //build a regexp $q['phrases'] = array_map('utf8_strtolower',$q['phrases']); $q['phrases'] = array_map('ft_preg_quote_cb',$q['phrases']); + // use this for higlighting later: + if($regex !== '') $regex .= '|'; + $regex .= join('|',$q['phrases']); // check the source of all documents for the exact phrases foreach(array_keys($docs) as $id){ $text = utf8_strtolower(rawWiki($id)); @@ -196,18 +200,15 @@ function ft_pageLookup($id,$pageonly=true){ * * @author Andreas Gohr */ -function ft_snippet($id,$poswords){ - $poswords = preg_quote($poswords,'#'); - $re = '('.str_replace(' ','|',$poswords).')'; +function ft_snippet($id,$re){ $text = rawWiki($id); - $match = array(); $snippets = array(); $utf8_offset = $offset = $end = 0; $len = utf8_strlen($text); for ($cnt=3; $cnt--;) { - if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break; + if (!preg_match('#('.$re.')#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break; list($str,$idx) = $match[0]; @@ -258,7 +259,7 @@ function ft_snippet($id,$poswords){ } $m = "\1"; - $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets); + $snippets = preg_replace('#('.$re.')#iu',$m.'$1'.$m,$snippets); $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','$1',hsc(join('... ',$snippets))); return $snippet; @@ -314,6 +315,7 @@ function ft_queryParser($query){ $q['query'] = $query; $q['ns'] = array(); $q['phrases'] = array(); + $q['words'] = array(); $q['and'] = array(); $q['not'] = array(); @@ -337,12 +339,15 @@ function ft_queryParser($query){ if(count($token)) $q['not'] = array_merge($q['not'],$token); }else{ // asian "words" need to be searched as phrases - if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){ + if(@preg_match_all('/(('.IDX_ASIAN.')+)/u',$w,$matches)){ $q['phrases'] = array_merge($q['phrases'],$matches[1]); } $token = idx_tokenizer($w,$stopwords,true); - if(count($token)) $q['and'] = array_merge($q['and'],$token); + if(count($token)){ + $q['and'] = array_merge($q['and'],$token); + $q['words'] = array_merge($q['words'],$token); + } } } -- cgit v1.2.3