diff options
author | Dries Buytaert <dries@buytaert.net> | 2009-01-02 22:09:53 +0000 |
---|---|---|
committer | Dries Buytaert <dries@buytaert.net> | 2009-01-02 22:09:53 +0000 |
commit | 45f34467fbc5c398adccc9555a15dbec7353accf (patch) | |
tree | 397285ff394e82ed369b6cf85574ad260e871077 | |
parent | 86aa636c8b5b2a3ef9437352af28cb8ed5f04523 (diff) | |
download | brdo-45f34467fbc5c398adccc9555a15dbec7353accf.tar.gz brdo-45f34467fbc5c398adccc9555a15dbec7353accf.tar.bz2 |
- Patch #212130 by Damien Tournoud, grendzy: decode_entities() should support all (X)HTML entities.
-rw-r--r-- | includes/unicode.entities.inc | 266 | ||||
-rw-r--r-- | includes/unicode.inc | 29 | ||||
-rw-r--r-- | modules/simpletest/tests/unicode.test | 4 |
3 files changed, 283 insertions, 16 deletions
diff --git a/includes/unicode.entities.inc b/includes/unicode.entities.inc new file mode 100644 index 000000000..486fd9737 --- /dev/null +++ b/includes/unicode.entities.inc @@ -0,0 +1,266 @@ +<?php +// $Id $ + +/** + * @file + * (X)HTML entities, as defined in HTML 4.01. + * + * @see http://www.w3.org/TR/html401/sgml/entities.html + */ + +$html_entities = array( + 'Á' => 'Á', + 'á' => 'á', + 'Â' => 'Â', + 'â' => 'â', + '´' => '´', + 'Æ' => 'Æ', + 'æ' => 'æ', + 'À' => 'À', + 'à' => 'à', + 'ℵ' => 'ℵ', + 'Α' => 'Α', + 'α' => 'α', + '&' => '&', + '∧' => '∧', + '∠' => '∠', + 'Å' => 'Å', + 'å' => 'å', + '≈' => '≈', + 'Ã' => 'Ã', + 'ã' => 'ã', + 'Ä' => 'Ä', + 'ä' => 'ä', + '„' => '„', + 'Β' => 'Β', + 'β' => 'β', + '¦' => '¦', + '•' => '•', + '∩' => '∩', + 'Ç' => 'Ç', + 'ç' => 'ç', + '¸' => '¸', + '¢' => '¢', + 'Χ' => 'Χ', + 'χ' => 'χ', + 'ˆ' => 'ˆ', + '♣' => '♣', + '≅' => '≅', + '©' => '©', + '↵' => '↵', + '∪' => '∪', + '¤' => '¤', + '†' => '†', + '‡' => '‡', + '↓' => '↓', + '⇓' => '⇓', + '°' => '°', + 'Δ' => 'Δ', + 'δ' => 'δ', + '♦' => '♦', + '÷' => '÷', + 'É' => 'É', + 'é' => 'é', + 'Ê' => 'Ê', + 'ê' => 'ê', + 'È' => 'È', + 'è' => 'è', + '∅' => '∅', + ' ' => ' ', + ' ' => ' ', + 'Ε' => 'Ε', + 'ε' => 'ε', + '≡' => '≡', + 'Η' => 'Η', + 'η' => 'η', + 'Ð' => 'Ð', + 'ð' => 'ð', + 'Ë' => 'Ë', + 'ë' => 'ë', + '€' => '€', + '∃' => '∃', + 'ƒ' => 'ƒ', + '∀' => '∀', + '½' => '½', + '¼' => '¼', + '¾' => '¾', + '⁄' => '⁄', + 'Γ' => 'Γ', + 'γ' => 'γ', + '≥' => '≥', + '↔' => '↔', + '⇔' => '⇔', + '♥' => '♥', + '…' => '…', + 'Í' => 'Í', + 'í' => 'í', + 'Î' => 'Î', + 'î' => 'î', + '¡' => '¡', + 'Ì' => 'Ì', + 'ì' => 'ì', + 'ℑ' => 'ℑ', + '∞' => '∞', + '∫' => '∫', + 'Ι' => 'Ι', + 'ι' => 'ι', + '¿' => '¿', + '∈' => '∈', + 'Ï' => 'Ï', + 'ï' => 'ï', + 'Κ' => 'Κ', + 'κ' => 'κ', + 'Λ' => 'Λ', + 'λ' => 'λ', + '⟨' => '〈', + '«' => '«', + '←' => '←', + '⇐' => '⇐', + '⌈' => '⌈', + '“' => '“', + '≤' => '≤', + '⌊' => '⌊', + '∗' => '∗', + '◊' => '◊', + '‎' => '', + '‹' => '‹', + '‘' => '‘', + '¯' => '¯', + '—' => '—', + 'µ' => 'µ', + '·' => '·', + '−' => '−', + 'Μ' => 'Μ', + 'μ' => 'μ', + '∇' => '∇', + ' ' => ' ', + '–' => '–', + '≠' => '≠', + '∋' => '∋', + '¬' => '¬', + '∉' => '∉', + '⊄' => '⊄', + 'Ñ' => 'Ñ', + 'ñ' => 'ñ', + 'Ν' => 'Ν', + 'ν' => 'ν', + 'Ó' => 'Ó', + 'ó' => 'ó', + 'Ô' => 'Ô', + 'ô' => 'ô', + 'Œ' => 'Œ', + 'œ' => 'œ', + 'Ò' => 'Ò', + 'ò' => 'ò', + '‾' => '‾', + 'Ω' => 'Ω', + 'ω' => 'ω', + 'Ο' => 'Ο', + 'ο' => 'ο', + '⊕' => '⊕', + '∨' => '∨', + 'ª' => 'ª', + 'º' => 'º', + 'Ø' => 'Ø', + 'ø' => 'ø', + 'Õ' => 'Õ', + 'õ' => 'õ', + '⊗' => '⊗', + 'Ö' => 'Ö', + 'ö' => 'ö', + '¶' => '¶', + '∂' => '∂', + '‰' => '‰', + '⊥' => '⊥', + 'Φ' => 'Φ', + 'φ' => 'φ', + 'Π' => 'Π', + 'π' => 'π', + 'ϖ' => 'ϖ', + '±' => '±', + '£' => '£', + '′' => '′', + '″' => '″', + '∏' => '∏', + '∝' => '∝', + 'Ψ' => 'Ψ', + 'ψ' => 'ψ', + '√' => '√', + '⟩' => '〉', + '»' => '»', + '→' => '→', + '⇒' => '⇒', + '⌉' => '⌉', + '”' => '”', + 'ℜ' => 'ℜ', + '®' => '®', + '⌋' => '⌋', + 'Ρ' => 'Ρ', + 'ρ' => 'ρ', + '‏' => '', + '›' => '›', + '’' => '’', + '‚' => '‚', + 'Š' => 'Š', + 'š' => 'š', + '⋅' => '⋅', + '§' => '§', + '­' => '', + 'Σ' => 'Σ', + 'σ' => 'σ', + 'ς' => 'ς', + '∼' => '∼', + '♠' => '♠', + '⊂' => '⊂', + '⊆' => '⊆', + '∑' => '∑', + '¹' => '¹', + '²' => '²', + '³' => '³', + '⊃' => '⊃', + '⊇' => '⊇', + 'ß' => 'ß', + 'Τ' => 'Τ', + 'τ' => 'τ', + '∴' => '∴', + 'Θ' => 'Θ', + 'θ' => 'θ', + 'ϑ' => 'ϑ', + ' ' => ' ', + 'Þ' => 'Þ', + 'þ' => 'þ', + '˜' => '˜', + '×' => '×', + '™' => '™', + 'Ú' => 'Ú', + 'ú' => 'ú', + '↑' => '↑', + '⇑' => '⇑', + 'Û' => 'Û', + 'û' => 'û', + 'Ù' => 'Ù', + 'ù' => 'ù', + '¨' => '¨', + 'ϒ' => 'ϒ', + 'Υ' => 'Υ', + 'υ' => 'υ', + 'Ü' => 'Ü', + 'ü' => 'ü', + '℘' => '℘', + 'Ξ' => 'Ξ', + 'ξ' => 'ξ', + 'Ý' => 'Ý', + 'ý' => 'ý', + '¥' => '¥', + 'ÿ' => 'ÿ', + 'Ÿ' => 'Ÿ', + 'Ζ' => 'Ζ', + 'ζ' => 'ζ', + '‍' => '', + '‌' => '', + '>' => '>', + '<' => '<', + '"' => '"', + // Add apostrophe (XML). + ''' => "'", +); diff --git a/includes/unicode.inc b/includes/unicode.inc index d9b81f062..f25f472d2 100644 --- a/includes/unicode.inc +++ b/includes/unicode.inc @@ -323,33 +323,30 @@ function _mime_header_decode($matches) { * array('<', '&', '"'). This affects both named and numerical entities. */ function decode_entities($text, $exclude = array()) { - static $table; - // We store named entities in a table for quick processing. - if (!isset($table)) { - // Get all named HTML entities. - $table = array_flip(get_html_translation_table(HTML_ENTITIES)); - // PHP gives us ISO-8859-1 data, we need UTF-8. - $table = array_map('utf8_encode', $table); - // Add apostrophe (XML) - $table['''] = "'"; - } - $newtable = array_diff($table, $exclude); + static $html_entities; + if (!isset($html_entities)) { + include DRUPAL_ROOT . '/includes/unicode.entities.inc'; + } + + // Flip the exclude list so that we can do quick lookups later. + $exclude = array_flip($exclude); // Use a regexp to select all entities in one pass, to avoid decoding // double-escaped entities twice. The PREG_REPLACE_EVAL modifier 'e' is // being used to allow for a callback (see // http://php.net/manual/en/reference.pcre.pattern.modifiers). - return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $newtable, $exclude)', $text); + return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $html_entities, $exclude)', $text); } /** * Helper function for decode_entities */ -function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) { +function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) { // Named entity if (!$prefix) { - if (isset($table[$original])) { - return $table[$original]; + // A named entity not in the exclude list. + if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) { + return $html_entities[$original]; } else { return $original; @@ -383,7 +380,7 @@ function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) { . chr(0x80 | ( $codepoint & 0x3F)); } // Check for excluded characters - if (in_array($str, $exclude)) { + if (isset($exclude[$str])) { return $original; } else { diff --git a/modules/simpletest/tests/unicode.test b/modules/simpletest/tests/unicode.test index 8970fca25..e2f3206be 100644 --- a/modules/simpletest/tests/unicode.test +++ b/modules/simpletest/tests/unicode.test @@ -162,6 +162,7 @@ class UnicodeUnitTest extends DrupalWebTestCase { 'Drupal' => 'Drupal', '<script>' => '<script>', '<script>' => '<script>', + '<script>' => '<script>', '&lt;script&gt;' => '<script>', '"' => '"', '"' => '"', @@ -178,6 +179,7 @@ class UnicodeUnitTest extends DrupalWebTestCase { '→' => '→', '➼' => '➼', '➼' => '➼', + '€' => '€', ); foreach ($testcase as $input => $output) { $this->assertEqual(decode_entities($input), $output, t('Make sure the decoded entity of @input is @output', array('@input' => $input, '@output' => $output))); @@ -189,6 +191,7 @@ class UnicodeUnitTest extends DrupalWebTestCase { 'Drupal' => 'Drupal', '<script>' => '<script>', '<script>' => '<script>', + '<script>' => '<script>', '&lt;script&gt;' => '&lt;script&gt;', '"' => '"', '"' => '"', @@ -205,6 +208,7 @@ class UnicodeUnitTest extends DrupalWebTestCase { '→' => '→', '➼' => '➼', '➼' => '➼', + '€' => '€', ); $exclude = array('<', '&', '"'); foreach ($testcase as $input => $output) { |