ft_snippet() update, fix utf8 problems

darcs-hash:20060826095311-9b6ab-9a6f272cc7c7532eb2bad8f7b4404c5a16b71109.gz
author: chris <chris@jalakai.co.uk> 2006-08-26 11:53:11 +0200
committer: chris <chris@jalakai.co.uk> 2006-08-26 11:53:11 +0200
commit: 5953e88907368380d326c187b3d1071f575c7daf (patch)
tree: 97342e4fb4614370b7dca614196296e16e1c8c82 /inc
parent: 0eac1afbfcac819df65253478138374667f9b8df (diff)
download: rpg-5953e88907368380d326c187b3d1071f575c7daf.tar.gz
rpg-5953e88907368380d326c187b3d1071f575c7daf.tar.bz2
2 files changed, 32 insertions, 6 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index de1a4217b..6ab22a5c2 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -267,9 +267,10 @@ switch ($algorithm) {
 
       list($str,$idx) = $match[0];
 
-      // establish context, 100 characters surrounding the match string
+      // establish context, 100 bytes surrounding the match string
       // first look to see if we can go 100 either side,
-      // then drop to 50 adding any excess if the other side can't go to 50.
+      // then drop to 50 adding any excess if the other side can't go to 50,
+      // NOTE: these are byte adjustments and will have to be corrected for utf-8
       $pre = min($idx-$offset,100);
       $post = min($len-$idx-strlen($str),100);
 
@@ -282,9 +283,9 @@ switch ($algorithm) {
       }
 
       // establish context start and end points, try to append to previous context if possible
-      $start = $idx - $pre;
-      $append = ($start < $end) ? $end : false;   // still the end of the previous context snippet
-      $end = $idx + strlen($str) + $post;         // now set it to the end of this context
+      $start = utf8_correctIdx($text,$idx - $pre);
+      $append = ($start < $end) ? $end : false;                     // still the end of the previous context snippet
+      $end = utf8_correctIdx($text, $idx + strlen($str) + $post);   // now set it to the end of this context
 
       if ($append) {
         $snippets[count($snippets)-1] .= substr($text,$append,$end-$append);
@@ -305,7 +306,7 @@ switch ($algorithm) {
   break;
 }
 
-    return utf8_bad_replace($snippet);
+    return $snippet;
 }
 
 /**
diff --git a/inc/utf8.php b/inc/utf8.php
index 16722ab2e..0323bed4b 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -762,6 +762,31 @@ function utf8_bad_replace($str, $replace = '') {
     return $result;
 }
 
+/**
+ * adjust a byte index into a utf8 string to a utf8 character boundary
+ *
+ * @param $str   string   utf8 character string
+ * @param $i     int      byte index into $str
+ * @param $next  bool     direction to search for boundary,
+ *                           false = backwards (to the start of the current character)
+ *                           true = forwards (to the start of the next character)
+ *
+ * @return int            byte index into $str now pointing to a utf8 character boundary
+ *
+ * @author       chris smith <chris@jalakai.co.uk>
+ */
+function utf8_correctIdx(&$str,$i,$next=false) {
+	
+  if ($next) {
+	  $limit = strlen($str);
+	  while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
+	} else {
+	  while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
+	}
+	
+	return $i;
+}
+
 // only needed if no mb_string available
 if(!UTF8_MBSTRING){
author	chris <chris@jalakai.co.uk>	2006-08-26 11:53:11 +0200
committer	chris <chris@jalakai.co.uk>	2006-08-26 11:53:11 +0200
commit	5953e88907368380d326c187b3d1071f575c7daf (patch)
tree	97342e4fb4614370b7dca614196296e16e1c8c82 /inc
parent	0eac1afbfcac819df65253478138374667f9b8df (diff)
download	rpg-5953e88907368380d326c187b3d1071f575c7daf.tar.gz rpg-5953e88907368380d326c187b3d1071f575c7daf.tar.bz2