diff options
author | chris <chris@jalakai.co.uk> | 2006-09-27 05:37:13 +0200 |
---|---|---|
committer | chris <chris@jalakai.co.uk> | 2006-09-27 05:37:13 +0200 |
commit | 5e613a5c5e3fb29292e23a5fd83f17f25567a747 (patch) | |
tree | 2922639b3de2fcbc4e3dbc709f6f8f7483103582 | |
parent | 8fcc3410e3603bb823483a739753a3c4a12e6d7c (diff) | |
download | rpg-5e613a5c5e3fb29292e23a5fd83f17f25567a747.tar.gz rpg-5e613a5c5e3fb29292e23a5fd83f17f25567a747.tar.bz2 |
utf8_substr fix for FS#891
darcs-hash:20060927033713-9b6ab-4b35e0a85b6d11d5a3a98858cd2f860b383ff153.gz
-rw-r--r-- | inc/utf8.php | 118 |
1 files changed, 101 insertions, 17 deletions
diff --git a/inc/utf8.php b/inc/utf8.php index 0479d8809..a6516f498 100644 --- a/inc/utf8.php +++ b/inc/utf8.php @@ -6,19 +6,19 @@ * @author Andreas Gohr <andi@splitbrain.org> */ - /** * check for mb_string support */ if(!defined('UTF8_MBSTRING')){ if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){ define('UTF8_MBSTRING',1); - mb_internal_encoding('UTF-8'); }else{ define('UTF8_MBSTRING',0); } } +if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); } + /** * URL-Encode a filename to allow unicodecharacters @@ -92,12 +92,13 @@ function utf8_strip($str){ */ function utf8_check($Str) { for ($i=0; $i<strlen($Str); $i++) { - if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb - elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb - elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb - elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb - elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb - elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b + $b = ord($Str[$i]); + if ($b < 0x80) continue; # 0bbbbbbb + elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb + elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb + elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb + elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb + elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b else return false; # Does not match any model for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ? if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80)) @@ -130,6 +131,7 @@ function utf8_strlen($string){ * when doing so * * @author Harry Fuecks <hfuecks@gmail.com> + * @author Chris Smith <chris@jalakai.co.uk> * @param string * @param integer number of UTF-8 characters offset (from left) * @param integer (optional) length in UTF-8 characters from offset @@ -144,7 +146,7 @@ function utf8_substr($str, $offset, $length = null) { } } - if ( $offset >= 0 && $length >= 0 ) { + if ( $offset >= 0 && $length >= 0 && $offset < 65534 && $length < 65534) { if ( $length === null ) { $length = '*'; } else { @@ -169,14 +171,33 @@ function utf8_substr($str, $offset, $length = null) { return false; } else { - // Handle negatives using different, slower technique - // From: http://www.php.net/manual/en/function.substr.php#44838 - preg_match_all('/./u', $str, $ar); - if( $length !== null ) { - return join('',array_slice($ar[0],$offset,$length)); - } else { - return join('',array_slice($ar[0],$offset)); - } + + // convert character offsets to byte offsets and use normal substr() + // 1. normalise paramters into positive offset and length and carry out simple checks + $strlen = strlen(utf8_decode($str)); + + if ($offset < 0) { + $offset = max($strlen+$offset,0); + } + if ($offset >= $strlen) return false; + + if ($length === null) { + // 2a. convert to start byte offset + list($start) = _utf8_byteindex($str,$offset); + return substr($str,$start); + } + + if ($length < 0) { + $length = $strlen-$offset+$length; + if ($length < 0) return ''; + } + + if ($length === 0) return ''; + if ($strlen - $offset < $length) $length = $strlen-$offset; + + // 2b. convert to start and end byte offsets + list($start,$end) = _utf8_byteindex($str,$offset,$offset+$length); + return substr($str,$start,$end-$start); } } @@ -794,6 +815,69 @@ function utf8_correctIdx(&$str,$i,$next=false) { return $i; } +/** + * determine the byte indexes into a utf-8 string for one or more character offsets + * PRIVATE (could be made public with proper paramter checking) + * + * @author Chris Smith <chris@jalakai.co.uk> + * + * @param string $str utf8 string + * @param int $offset any number of character offsets into $str + * + * @return array byte indexes into $str, one index for each offset argument + */ +function _utf8_byteindex() { + + $args = func_get_args(); + $str =& array_shift($args); + if (!is_string($str)) return false; + + $result = array(); + + // use a short piece of str to estimate bytes per character + $i = utf8_correctIdx($str, 300, true); // $i (& $j) -> byte indexes into $str + $c = utf8_strlen(substr($str,0,$i)); // $c -> character offset into $str + + sort($args); // deal with arguments from lowest to highest + foreach ($args as $offset) { + // sanity checks FIXME + + // 0 is an easy check + if ($offset == 0) { $result[] = 0; continue; } + + $safety_valve = 50; // ensure no endless looping + + do { + $j = (int)($offset * $i/$c); // apply latest bytes/character estimate to offset + $j = utf8_correctIdx($str, $j, true); // correct to utf8 character boundary + + if ($j > $i) { + $c += utf8_strlen(substr($str,$i,$j-$i)); // determine new character offset + } else { + $c -= utf8_strlen(substr($str,$j,$i-$j)); // ditto + } + + $error = abs($c-$offset); + + $i = $j; // ready for next time around + } while (($error > 7) && --$safety_valve) ; // from 7 it is faster to iterate over the string + + if ($error && $error <= 7) { + if ($c < $offset) { + // move up + while ($error--) { $i = utf8_correctIdx($str,++$i,true); } + } else { + // move down + while ($error--) { $i = utf8_correctIdx($str,--$i,false); } + } + $c = $offset; // ready for next arg + } + $result[] = $i; + } + + return $result; +} + // only needed if no mb_string available if(!UTF8_MBSTRING){ |