diff options
-rw-r--r-- | _test/cases/inc/utf8_html.test.php | 72 | ||||
-rw-r--r-- | inc/utf8.php | 89 |
2 files changed, 144 insertions, 17 deletions
diff --git a/_test/cases/inc/utf8_html.test.php b/_test/cases/inc/utf8_html.test.php new file mode 100644 index 000000000..57c9df259 --- /dev/null +++ b/_test/cases/inc/utf8_html.test.php @@ -0,0 +1,72 @@ +<?php + +require_once DOKU_INC.'inc/utf8.php'; + +// use no mbstring help here +if(!defined('UTF8_NOMBSTRING')) define('UTF8_NOMBSTRING',1); + +class utf8_html_test extends UnitTestCase { + + function test_from_1byte(){ + $in = 'a'; + $out = 'a'; + $this->assertEqual(utf8_tohtml($in),$out); + } + + function test_from_2byte(){ + $in = "\xc3\xbc"; + $out = 'ü'; + $this->assertEqual(utf8_tohtml($in),$out); + } + + function test_from_3byte(){ + $in = "\xe2\x99\x8a"; + $out = '♊'; + $this->assertEqual(utf8_tohtml($in),$out); + } + + function test_from_4byte(){ + $in = "\xf4\x80\x80\x81"; + $out = '􀀁'; + $this->assertEqual(utf8_tohtml($in),$out); + } + + function test_to_1byte(){ + $out = 'a'; + $in = 'a'; + $this->assertEqual(utf8_unhtml($in),$out); + } + + function test_to_2byte(){ + $out = "\xc3\xbc"; + $in = 'ü'; + $this->assertEqual(utf8_unhtml($in),$out); + } + + function test_to_3byte(){ + $out = "\xe2\x99\x8a"; + $in = '♊'; + $this->assertEqual(utf8_unhtml($in),$out); + } + + function test_to_4byte(){ + $out = "\xf4\x80\x80\x81"; + $in = '􀀁'; + $this->assertEqual(utf8_unhtml($in),$out); + } + + function test_without_entities(){ + $out = '&#38;&#38;'; + $in = '&#38;&amp;#38;'; + $this->assertEqual(utf8_unhtml($in),$out); + } + + function test_with_entities(){ + $out = '&&#38;'; + $in = '&#38;&amp;#38;'; + $this->assertEqual(utf8_unhtml($in,HTML_ENTITIES),$out); + } + +} + +//Setup VIM: ex: et ts=4 enc=utf-8 : diff --git a/inc/utf8.php b/inc/utf8.php index d2774fd04..04f43a639 100644 --- a/inc/utf8.php +++ b/inc/utf8.php @@ -456,28 +456,83 @@ function utf8_strpos($haystack, $needle,$offset=0) { /** * Encodes UTF-8 characters to HTML entities * + * @author Tom N Harris <tnharris@whoopdedo.org> * @author <vpribish at shopping dot com> * @link http://www.php.net/manual/en/function.utf8-decode.php */ function utf8_tohtml ($str) { - $ret = ''; - $max = strlen($str); - $last = 0; // keeps the index of the last regular character - for ($i=0; $i<$max; $i++) { - $c = $str{$i}; - $c1 = ord($c); - if ($c1>>5 == 6) { // 110x xxxx, 110 prefix for 2 bytes unicode - $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed - $c1 &= 31; // remove the 3 bit two bytes prefix - $c2 = ord($str{++$i}); // the next byte - $c2 &= 63; // remove the 2 bit trailing byte prefix - $c2 |= (($c1 & 3) << 6); // last 2 bits of c1 become first 2 of c2 - $c1 >>= 2; // c1 shifts 2 to the right - $ret .= '&#' . ($c1 * 100 + $c2) . ';'; // this is the fastest string concatenation - $last = $i+1; + $ret = ''; + foreach (utf8_to_unicode($str) as $cp) { + if ($cp < 0x80) + $ret .= chr($cp); + elseif ($cp < 0x100) + $ret .= "&#$cp;"; + else + $ret .= '&#x'.dechex($cp).';'; + } + return $ret; +} + +/** + * Decodes HTML entities to UTF-8 characters + * + * Convert any &#..; entity to a codepoint, + * The entities flag defaults to only decoding numeric entities. + * Pass HTML_ENTITIES and named entities, including & < etc. + * are handled as well. Avoids the problem that would occur if you + * had to decode "&#38;&amp;#38;" + * + * unhtmlspecialchars(utf8_unhtml($s)) -> "&&" + * utf8_unhtml(unhtmlspecialchars($s)) -> "&&#38;" + * what it should be -> "&&#38;" + * + * @author Tom N Harris <tnharris@whoopdedo.org> + * @param string $str UTF-8 encoded string + * @param boolean $entities Flag controlling decoding of named entities. + * @return UTF-8 encoded string with numeric (and named) entities replaced. + */ +function utf8_unhtml($str, $entities=null) { + static $decoder = null; + if (is_null($decoder)) + $decoder = new utf8_entity_decoder(); + if (is_null($entities)) + return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m', + 'utf8_decode_numeric', $str); + else + return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m', + array(&$decoder, 'decode'), $str); +} +function utf8_decode_numeric($ent) { + switch ($ent[2]) { + case 'X': + case 'x': + $cp = hexdec($ent[3]); + break; + default: + $cp = intval($ent[3]); + break; + } + return unicode_to_utf8(array($cp)); +} +class utf8_entity_decoder { + var $table; + function utf8_entity_decoder() { + $table = get_html_translation_table(HTML_ENTITIES); + $table = array_flip($table); + $this->table = array_map(array(&$this,'makeutf8'), $table); + } + function makeutf8($c) { + return unicode_to_utf8(array(ord($c))); + } + function decode($ent) { + if ($ent[1] == '#') { + return utf8_decode_numeric($ent); + } elseif (array_key_exists($ent[0],$this->table)) { + return $this->table[$ent[0]]; + } else { + return $ent[0]; + } } - } - return $ret . substr($str, $last, $i); // append the last batch of regular characters } /** |