summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--inc/fulltext.php68
1 files changed, 65 insertions, 3 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 6ab22a5c2..b9450c172 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -242,8 +242,8 @@ switch ($algorithm) {
if ($idx < $end) continue;
$pre = min($idx,50);
- $start = $idx - $pre;
- $end = min($idx+100+strlen($str)-$pre,$len);
+ $start = utf8_correctIdx($text, $idx - $pre);
+ $end = utf8_correctIdx($text, min($idx+100+strlen($str)-$pre,$len));
$snippets[] = substr($text,$start,$end-$start);
if (!($cnt--)) break;
}
@@ -255,7 +255,7 @@ switch ($algorithm) {
break;
case 'opt2' :
- default :
+ default :
// option 2 ... CS 2006-08-25
// above + reduce amount of the file searched
$match = array();
@@ -280,6 +280,11 @@ switch ($algorithm) {
$pre = min($pre,100-$post);
} else if ($post>50) {
$post = min($post, 100-$pre);
+ } else {
+ // neither pre nor post exceeds 50, which means the context is the whole string
+ // make it so and break out of this loop - there is no need for the complex snippet calculations
+ $snippets = array($text);
+ break;
}
// establish context start and end points, try to append to previous context if possible
@@ -304,6 +309,63 @@ switch ($algorithm) {
$snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
break;
+
+ case 'utf8':
+ $match = array();
+ $snippets = array();
+ $utf8_offset = $offset = 0;
+ $len = utf8_strlen($text);
+ for ($cnt=3; $cnt--;) {
+ if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
+
+ list($str,$idx) = $match[0];
+
+ // convert $idx (a byte offset) into a utf8 character offset
+ $utf8_idx = utf8_strlen(substr($text,0,$idx));
+ $utf8_len = utf8_strlen($str);
+
+ // establish context, 100 characters surrounding the match string
+ // first look to see if we can go 100 either side,
+ // then drop to 50, adding any excess if the other side can't go to 50
+ // NOTE: $pre and $post are utf8 character counts here, not byte counts
+ $pre = min($utf8_idx-$utf8_offset,100);
+ $post = min($len-$utf8_idx-$utf8_len,100);
+
+ if ($pre>50 && $post>50) {
+ $pre = $post = 50;
+ } else if ($pre>50) {
+ $pre = min($pre,100-$post);
+ } else if ($post>50) {
+ $post = min($post, 100-$pre);
+ } else {
+ // neither pre nor post exceeds 50, which means the context is the whole string
+ // make it so and break out of this loop - there is no need for the complex snippet calculations
+ $snippets = array($text);
+ break;
+ }
+
+ // establish context start and end points, try to append to previous context if possible
+ $start = $idx - $pre;
+ $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
+ $end = $idx + $utf8_len + $post; // now set it to the end of this context
+
+ if ($append) {
+ $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+ } else {
+ $snippets[] = utf8_substr($text,$start,$end-$start);
+ }
+
+ // set $offset for next match attempt
+ // subtract the match length to avoid splitting a potential search success; this is an approximation, as the
+ // search pattern may match strings of varying length, and it will fail if the context snippet
+ // boundary breaks a matching string longer than the current match
+ $utf8_offset = $end - $utf8_len;
+ $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset)));
+ }
+ $m = "\1";
+ $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
+ $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
+ break;
}
return $snippet;