summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorchris <chris@jalakai.co.uk>2006-09-27 05:37:13 +0200
committerchris <chris@jalakai.co.uk>2006-09-27 05:37:13 +0200
commit5e613a5c5e3fb29292e23a5fd83f17f25567a747 (patch)
tree2922639b3de2fcbc4e3dbc709f6f8f7483103582
parent8fcc3410e3603bb823483a739753a3c4a12e6d7c (diff)
downloadrpg-5e613a5c5e3fb29292e23a5fd83f17f25567a747.tar.gz
rpg-5e613a5c5e3fb29292e23a5fd83f17f25567a747.tar.bz2
utf8_substr fix for FS#891
darcs-hash:20060927033713-9b6ab-4b35e0a85b6d11d5a3a98858cd2f860b383ff153.gz
-rw-r--r--inc/utf8.php118
1 files changed, 101 insertions, 17 deletions
diff --git a/inc/utf8.php b/inc/utf8.php
index 0479d8809..a6516f498 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -6,19 +6,19 @@
* @author Andreas Gohr <andi@splitbrain.org>
*/
-
/**
* check for mb_string support
*/
if(!defined('UTF8_MBSTRING')){
if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
define('UTF8_MBSTRING',1);
- mb_internal_encoding('UTF-8');
}else{
define('UTF8_MBSTRING',0);
}
}
+if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }
+
/**
* URL-Encode a filename to allow unicodecharacters
@@ -92,12 +92,13 @@ function utf8_strip($str){
*/
function utf8_check($Str) {
for ($i=0; $i<strlen($Str); $i++) {
- if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb
- elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
- elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb
- elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb
- elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb
- elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b
+ $b = ord($Str[$i]);
+ if ($b < 0x80) continue; # 0bbbbbbb
+ elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
+ elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
+ elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
+ elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
+ elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
else return false; # Does not match any model
for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
@@ -130,6 +131,7 @@ function utf8_strlen($string){
* when doing so
*
* @author Harry Fuecks <hfuecks@gmail.com>
+ * @author Chris Smith <chris@jalakai.co.uk>
* @param string
* @param integer number of UTF-8 characters offset (from left)
* @param integer (optional) length in UTF-8 characters from offset
@@ -144,7 +146,7 @@ function utf8_substr($str, $offset, $length = null) {
}
}
- if ( $offset >= 0 && $length >= 0 ) {
+ if ( $offset >= 0 && $length >= 0 && $offset < 65534 && $length < 65534) {
if ( $length === null ) {
$length = '*';
} else {
@@ -169,14 +171,33 @@ function utf8_substr($str, $offset, $length = null) {
return false;
} else {
- // Handle negatives using different, slower technique
- // From: http://www.php.net/manual/en/function.substr.php#44838
- preg_match_all('/./u', $str, $ar);
- if( $length !== null ) {
- return join('',array_slice($ar[0],$offset,$length));
- } else {
- return join('',array_slice($ar[0],$offset));
- }
+
+ // convert character offsets to byte offsets and use normal substr()
+ // 1. normalise paramters into positive offset and length and carry out simple checks
+ $strlen = strlen(utf8_decode($str));
+
+ if ($offset < 0) {
+ $offset = max($strlen+$offset,0);
+ }
+ if ($offset >= $strlen) return false;
+
+ if ($length === null) {
+ // 2a. convert to start byte offset
+ list($start) = _utf8_byteindex($str,$offset);
+ return substr($str,$start);
+ }
+
+ if ($length < 0) {
+ $length = $strlen-$offset+$length;
+ if ($length < 0) return '';
+ }
+
+ if ($length === 0) return '';
+ if ($strlen - $offset < $length) $length = $strlen-$offset;
+
+ // 2b. convert to start and end byte offsets
+ list($start,$end) = _utf8_byteindex($str,$offset,$offset+$length);
+ return substr($str,$start,$end-$start);
}
}
@@ -794,6 +815,69 @@ function utf8_correctIdx(&$str,$i,$next=false) {
return $i;
}
+/**
+ * determine the byte indexes into a utf-8 string for one or more character offsets
+ * PRIVATE (could be made public with proper paramter checking)
+ *
+ * @author Chris Smith <chris@jalakai.co.uk>
+ *
+ * @param string $str utf8 string
+ * @param int $offset any number of character offsets into $str
+ *
+ * @return array byte indexes into $str, one index for each offset argument
+ */
+function _utf8_byteindex() {
+
+ $args = func_get_args();
+ $str =& array_shift($args);
+ if (!is_string($str)) return false;
+
+ $result = array();
+
+ // use a short piece of str to estimate bytes per character
+ $i = utf8_correctIdx($str, 300, true); // $i (& $j) -> byte indexes into $str
+ $c = utf8_strlen(substr($str,0,$i)); // $c -> character offset into $str
+
+ sort($args); // deal with arguments from lowest to highest
+ foreach ($args as $offset) {
+ // sanity checks FIXME
+
+ // 0 is an easy check
+ if ($offset == 0) { $result[] = 0; continue; }
+
+ $safety_valve = 50; // ensure no endless looping
+
+ do {
+ $j = (int)($offset * $i/$c); // apply latest bytes/character estimate to offset
+ $j = utf8_correctIdx($str, $j, true); // correct to utf8 character boundary
+
+ if ($j > $i) {
+ $c += utf8_strlen(substr($str,$i,$j-$i)); // determine new character offset
+ } else {
+ $c -= utf8_strlen(substr($str,$j,$i-$j)); // ditto
+ }
+
+ $error = abs($c-$offset);
+
+ $i = $j; // ready for next time around
+ } while (($error > 7) && --$safety_valve) ; // from 7 it is faster to iterate over the string
+
+ if ($error && $error <= 7) {
+ if ($c < $offset) {
+ // move up
+ while ($error--) { $i = utf8_correctIdx($str,++$i,true); }
+ } else {
+ // move down
+ while ($error--) { $i = utf8_correctIdx($str,--$i,false); }
+ }
+ $c = $offset; // ready for next arg
+ }
+ $result[] = $i;
+ }
+
+ return $result;
+}
+
// only needed if no mb_string available
if(!UTF8_MBSTRING){