summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- _test/cases/inc/utf8_html.test.php 72
-rw-r--r-- inc/utf8.php 89
2 files changed, 144 insertions, 17 deletions
diff --git a/_test/cases/inc/utf8_html.test.php b/_test/cases/inc/utf8_html.test.php
new file mode 100644
index 000000000..57c9df259
--- /dev/null
+++ b/_test/cases/inc/utf8_html.test.php
@@ -0,0 +1,72 @@
+<?php
+
+require_once DOKU_INC.'inc/utf8.php';
+
+// use no mbstring help here
+if(!defined('UTF8_NOMBSTRING')) define('UTF8_NOMBSTRING',1);
+
+class utf8_html_test extends UnitTestCase {
+
+ function test_from_1byte(){
+ $in = 'a';
+ $out = 'a';
+ $this->assertEqual(utf8_tohtml($in),$out);
+ }
+
+ function test_from_2byte(){
+ $in = "\xc3\xbc";
+ $out = '&#252;';
+ $this->assertEqual(utf8_tohtml($in),$out);
+ }
+
+ function test_from_3byte(){
+ $in = "\xe2\x99\x8a";
+ $out = '&#x264a;';
+ $this->assertEqual(utf8_tohtml($in),$out);
+ }
+
+ function test_from_4byte(){
+ $in = "\xf4\x80\x80\x81";
+ $out = '&#x100001;';
+ $this->assertEqual(utf8_tohtml($in),$out);
+ }
+
+ function test_to_1byte(){
+ $out = 'a';
+ $in = 'a';
+ $this->assertEqual(utf8_unhtml($in),$out);
+ }
+
+ function test_to_2byte(){
+ $out = "\xc3\xbc";
+ $in = '&#252;';
+ $this->assertEqual(utf8_unhtml($in),$out);
+ }
+
+ function test_to_3byte(){
+ $out = "\xe2\x99\x8a";
+ $in = '&#x264a;';
+ $this->assertEqual(utf8_unhtml($in),$out);
+ }
+
+ function test_to_4byte(){
+ $out = "\xf4\x80\x80\x81";
+ $in = '&#x100001;';
+ $this->assertEqual(utf8_unhtml($in),$out);
+ }
+
+ function test_without_entities(){
+ $out = '&amp;#38;&amp;#38;';
+ $in = '&amp;#38;&#38;amp;#38;';
+ $this->assertEqual(utf8_unhtml($in),$out);
+ }
+
+ function test_with_entities(){
+ $out = '&#38;&amp;#38;';
+ $in = '&amp;#38;&#38;amp;#38;';
+ $this->assertEqual(utf8_unhtml($in,HTML_ENTITIES),$out);
+ }
+
+}
+
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/utf8.php b/inc/utf8.php
index d2774fd04..04f43a639 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -456,28 +456,83 @@ function utf8_strpos($haystack, $needle,$offset=0) {
/**
* Encodes UTF-8 characters to HTML entities
*
+ * @author Tom N Harris <tnharris@whoopdedo.org>
* @author <vpribish at shopping dot com>
* @link http://www.php.net/manual/en/function.utf8-decode.php
*/
function utf8_tohtml ($str) {
- $ret = '';
- $max = strlen($str);
- $last = 0; // keeps the index of the last regular character
- for ($i=0; $i<$max; $i++) {
- $c = $str{$i};
- $c1 = ord($c);
- if ($c1>>5 == 6) { // 110x xxxx, 110 prefix for 2 bytes unicode
- $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
- $c1 &= 31; // remove the 3 bit two bytes prefix
- $c2 = ord($str{++$i}); // the next byte
- $c2 &= 63; // remove the 2 bit trailing byte prefix
- $c2 |= (($c1 & 3) << 6); // last 2 bits of c1 become first 2 of c2
- $c1 >>= 2; // c1 shifts 2 to the right
- $ret .= '&#' . ($c1 * 100 + $c2) . ';'; // this is the fastest string concatenation
- $last = $i+1;
+ $ret = '';
+ foreach (utf8_to_unicode($str) as $cp) {
+ if ($cp < 0x80)
+ $ret .= chr($cp);
+ elseif ($cp < 0x100)
+ $ret .= "&#$cp;";
+ else
+ $ret .= '&#x'.dechex($cp).';';
+ }
+ return $ret;
+}
+
+/**
+ * Decodes HTML entities to UTF-8 characters
+ *
+ * Converts any &#..; numeric entity to the UTF-8 character it encodes.
+ * By default (no $entities flag) only numeric entities are decoded.
+ * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.,
+ * are handled in the same pass. This avoids the problem that would occur
+ * if you had to decode "&amp;#38;&#38;amp;#38;" in two separate steps:
+ *
+ * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
+ * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp;#38;"
+ * what it should be -> "&#38;&amp;#38;"
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @param string $str UTF-8 encoded string
+ * @param mixed $entities Pass HTML_ENTITIES (any non-null value) to also decode named entities; the default null decodes numeric entities only.
+ * @return string UTF-8 encoded string with numeric (and named) entities replaced.
+ */
+function utf8_unhtml($str, $entities=null) {
+ static $decoder = null;
+ if (is_null($decoder))
+ $decoder = new utf8_entity_decoder();
+ if (is_null($entities))
+ return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
+ 'utf8_decode_numeric', $str);
+ else
+ return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
+ array(&$decoder, 'decode'), $str);
+}
+function utf8_decode_numeric($ent) {
+ switch ($ent[2]) {
+ case 'X':
+ case 'x':
+ $cp = hexdec($ent[3]);
+ break;
+ default:
+ $cp = intval($ent[3]);
+ break;
+ }
+ return unicode_to_utf8(array($cp));
+}
+class utf8_entity_decoder {
+ var $table;
+ function utf8_entity_decoder() {
+ $table = get_html_translation_table(HTML_ENTITIES);
+ $table = array_flip($table);
+ $this->table = array_map(array(&$this,'makeutf8'), $table);
+ }
+ function makeutf8($c) {
+ return unicode_to_utf8(array(ord($c)));
+ }
+ function decode($ent) {
+ if ($ent[1] == '#') {
+ return utf8_decode_numeric($ent);
+ } elseif (array_key_exists($ent[0],$this->table)) {
+ return $this->table[$ent[0]];
+ } else {
+ return $ent[0];
+ }
}
- }
- return $ret . substr($str, $last, $i); // append the last batch of regular characters
}
/**