1 files changed, 65 insertions, 3 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 6ab22a5c2..b9450c172 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -242,8 +242,8 @@ switch ($algorithm) {
       if ($idx < $end) continue;
 
       $pre = min($idx,50);
-      $start = $idx - $pre;
-      $end = min($idx+100+strlen($str)-$pre,$len);
+      $start = utf8_correctIdx($text, $idx - $pre);
+      $end = utf8_correctIdx($text, min($idx+100+strlen($str)-$pre,$len));
       $snippets[] = substr($text,$start,$end-$start);
       if (!($cnt--)) break;
     }
@@ -255,7 +255,7 @@ switch ($algorithm) {
   break;
 
   case 'opt2' :
-	default :
+  default :
 // option 2 ... CS 2006-08-25
 // above + reduce amount of the file searched
     $match = array();
@@ -280,6 +280,11 @@ switch ($algorithm) {
         $pre = min($pre,100-$post);
       } else if ($post>50) {
         $post = min($post, 100-$pre);
+      } else {
+        // means both pre & post are less than 50, the context is the whole string
+        // make it so and break out of this loop - there is no need for the complex snippet calculations
+        $snippets = array($text);
+        break;
       }
 
       // establish context start and end points, try to append to previous context if possible
@@ -304,6 +309,63 @@ switch ($algorithm) {
     $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
 
   break;
+  
+  case 'utf8':
+    $match = array();
+    $snippets = array();
+    $utf8_offset = $offset = 0;
+    $len = utf8_strlen($text);
+    for ($cnt=3; $cnt--;) {
+      if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
+
+      list($str,$idx) = $match[0];
+      
+      // convert $idx (a byte offset) into a utf8 character offset
+      $utf8_idx = utf8_strlen(substr($text,0,$idx));
+      $utf8_len = utf8_strlen($str);
+
+      // establish context, 100 bytes surrounding the match string
+      // first look to see if we can go 100 either side,
+      // then drop to 50 adding any excess if the other side can't go to 50,
+      // NOTE: these are byte adjustments and will have to be corrected for utf-8
+      $pre = min($utf8_idx-$utf8_offset,100);
+      $post = min($len-$utf8_idx-$utf8_len,100);
+
+      if ($pre>50 && $post>50) {
+        $pre = $post = 50;
+      } else if ($pre>50) {
+        $pre = min($pre,100-$post);
+      } else if ($post>50) {
+        $post = min($post, 100-$pre);
+      } else {
+        // both are less than 50, means the context is the whole string
+        // make it so and break out of this loop - there is no need for the complex snippet calculations
+        $snippets = array($text);
+        break;
+      }
+
+      // establish context start and end points, try to append to previous context if possible
+      $start = $idx - $pre;
+      $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
+      $end = $idx + $utf8_len + $post;                // now set it to the end of this context
+
+      if ($append) {
+        $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+      } else {
+        $snippets[] = utf8_substr($text,$start,$end-$start);
+      }
+
+      // set $offset for next match attempt
+      //   substract strlen to avoid splitting a potential search success, this is an approximation as the
+      //   search pattern may match strings of varying length and it will fail if the context snippet
+      //   boundary breaks a matching string longer than the current match
+      $utf8_offset = $end - $utf8_len;
+      $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset)));
+    }
+    $m = "\1";
+    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
+    $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
+  break;
 }
 
     return $snippet;