summaryrefslogtreecommitdiff
path: root/inc
diff options
context:
space:
mode:
author chris <chris@jalakai.co.uk> 2006-08-26 11:53:11 +0200
committer chris <chris@jalakai.co.uk> 2006-08-26 11:53:11 +0200
commit 5953e88907368380d326c187b3d1071f575c7daf (patch)
tree 97342e4fb4614370b7dca614196296e16e1c8c82 /inc
parent 0eac1afbfcac819df65253478138374667f9b8df (diff)
download rpg-5953e88907368380d326c187b3d1071f575c7daf.tar.gz
download rpg-5953e88907368380d326c187b3d1071f575c7daf.tar.bz2
ft_snippet() update, fix utf8 problems
darcs-hash:20060826095311-9b6ab-9a6f272cc7c7532eb2bad8f7b4404c5a16b71109.gz
Diffstat (limited to 'inc')
-rw-r--r-- inc/fulltext.php 13
-rw-r--r-- inc/utf8.php 25
2 files changed, 32 insertions, 6 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index de1a4217b..6ab22a5c2 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -267,9 +267,10 @@ switch ($algorithm) {
list($str,$idx) = $match[0];
- // establish context, 100 characters surrounding the match string
+ // establish context, 100 bytes surrounding the match string
// first look to see if we can go 100 either side,
- // then drop to 50 adding any excess if the other side can't go to 50.
+ // then drop to 50 adding any excess if the other side can't go to 50,
+ // NOTE: these are byte adjustments and will have to be corrected for utf-8
$pre = min($idx-$offset,100);
$post = min($len-$idx-strlen($str),100);
@@ -282,9 +283,9 @@ switch ($algorithm) {
}
// establish context start and end points, try to append to previous context if possible
- $start = $idx - $pre;
- $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
- $end = $idx + strlen($str) + $post; // now set it to the end of this context
+ $start = utf8_correctIdx($text,$idx - $pre);
+ $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
+ $end = utf8_correctIdx($text, $idx + strlen($str) + $post); // now set it to the end of this context
if ($append) {
$snippets[count($snippets)-1] .= substr($text,$append,$end-$append);
@@ -305,7 +306,7 @@ switch ($algorithm) {
break;
}
- return utf8_bad_replace($snippet);
+ return $snippet;
}
/**
diff --git a/inc/utf8.php b/inc/utf8.php
index 16722ab2e..0323bed4b 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -762,6 +762,31 @@ function utf8_bad_replace($str, $replace = '') {
return $result;
}
+/**
+ * Adjust a byte index into a UTF-8 string so that it points at a character
+ * boundary.
+ *
+ * A byte counts as a continuation byte (i.e. not a boundary) when its top two
+ * bits are 10, that is (ord($byte) & 0xC0) == 0x80. Starting at $i, the index
+ * is moved in the requested direction until it no longer points at such a byte.
+ *
+ * Out-of-range indexes are clamped instead of being used to read the string:
+ * any $i <= 0 yields 0 and any $i >= strlen($str) yields strlen($str). This
+ * keeps $str[$i] from being read past the end of the string when a caller
+ * passes an index equal to the string length.
+ *
+ * @param $str  string utf8 character string (taken by reference, not modified)
+ * @param $i    int    byte index into $str
+ * @param $next bool   direction to search for boundary,
+ *                       false = backwards (start of the current character)
+ *                       true  = forwards (start of the next character)
+ *
+ * @return int byte index into $str now pointing to a utf8 character boundary
+ *
+ * @author chris smith <chris@jalakai.co.uk>
+ */
+function utf8_correctIdx(&$str,$i,$next=false) {
+
+    // the start of the string is always a boundary
+    if ($i <= 0) return 0;
+
+    // so is the end; returning here also guards the $str[$i] reads below
+    $limit = strlen($str);
+    if ($i >= $limit) return $limit;
+
+    if ($next) {
+        // skip forward over continuation bytes to the next lead byte (or the end)
+        while (($i < $limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
+    } else {
+        // step back over continuation bytes to the lead byte of this character
+        while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
+    }
+
+    return $i;
+}
+
// only needed if no mb_string available
if(!UTF8_MBSTRING){