- Patch #212130 by Damien Tournoud, grendzy: decode_entities() should support all (X)HTML entities.

author: Dries Buytaert <dries@buytaert.net> 2009-01-02 22:09:53 +0000
committer: Dries Buytaert <dries@buytaert.net> 2009-01-02 22:09:53 +0000
commit: 45f34467fbc5c398adccc9555a15dbec7353accf (patch)
tree: 397285ff394e82ed369b6cf85574ad260e871077
parent: 86aa636c8b5b2a3ef9437352af28cb8ed5f04523 (diff)
download: brdo-45f34467fbc5c398adccc9555a15dbec7353accf.tar.gz
brdo-45f34467fbc5c398adccc9555a15dbec7353accf.tar.bz2
3 files changed, 283 insertions, 16 deletions
diff --git a/includes/unicode.entities.inc b/includes/unicode.entities.inc
new file mode 100644
index 000000000..486fd9737
--- /dev/null
+++ b/includes/unicode.entities.inc
@@ -0,0 +1,266 @@
+<?php
+// $Id$
+
+/**
+ * @file
+ * (X)HTML entities, as defined in HTML 4.01.
+ *
+ * @see http://www.w3.org/TR/html401/sgml/entities.html
+ */
+
+$html_entities = array(
+  '&Aacute;' => 'Á',
+  '&aacute;' => 'á',
+  '&Acirc;' => 'Â',
+  '&acirc;' => 'â',
+  '&acute;' => '´',
+  '&AElig;' => 'Æ',
+  '&aelig;' => 'æ',
+  '&Agrave;' => 'À',
+  '&agrave;' => 'à',
+  '&alefsym;' => 'ℵ',
+  '&Alpha;' => 'Α',
+  '&alpha;' => 'α',
+  '&amp;' => '&',
+  '&and;' => '∧',
+  '&ang;' => '∠',
+  '&Aring;' => 'Å',
+  '&aring;' => 'å',
+  '&asymp;' => '≈',
+  '&Atilde;' => 'Ã',
+  '&atilde;' => 'ã',
+  '&Auml;' => 'Ä',
+  '&auml;' => 'ä',
+  '&bdquo;' => '„',
+  '&Beta;' => 'Β',
+  '&beta;' => 'β',
+  '&brvbar;' => '¦',
+  '&bull;' => '•',
+  '&cap;' => '∩',
+  '&Ccedil;' => 'Ç',
+  '&ccedil;' => 'ç',
+  '&cedil;' => '¸',
+  '&cent;' => '¢',
+  '&Chi;' => 'Χ',
+  '&chi;' => 'χ',
+  '&circ;' => 'ˆ',
+  '&clubs;' => '♣',
+  '&cong;' => '≅',
+  '&copy;' => '©',
+  '&crarr;' => '↵',
+  '&cup;' => '∪',
+  '&curren;' => '¤',
+  '&dagger;' => '†',
+  '&Dagger;' => '‡',
+  '&darr;' => '↓',
+  '&dArr;' => '⇓',
+  '&deg;' => '°',
+  '&Delta;' => 'Δ',
+  '&delta;' => 'δ',
+  '&diams;' => '♦',
+  '&divide;' => '÷',
+  '&Eacute;' => 'É',
+  '&eacute;' => 'é',
+  '&Ecirc;' => 'Ê',
+  '&ecirc;' => 'ê',
+  '&Egrave;' => 'È',
+  '&egrave;' => 'è',
+  '&empty;' => '∅',
+  '&emsp;' => ' ',
+  '&ensp;' => ' ',
+  '&Epsilon;' => 'Ε',
+  '&epsilon;' => 'ε',
+  '&equiv;' => '≡',
+  '&Eta;' => 'Η',
+  '&eta;' => 'η',
+  '&ETH;' => 'Ð',
+  '&eth;' => 'ð',
+  '&Euml;' => 'Ë',
+  '&euml;' => 'ë',
+  '&euro;' => '€',
+  '&exist;' => '∃',
+  '&fnof;' => 'ƒ',
+  '&forall;' => '∀',
+  '&frac12;' => '½',
+  '&frac14;' => '¼',
+  '&frac34;' => '¾',
+  '&frasl;' => '⁄',
+  '&Gamma;' => 'Γ',
+  '&gamma;' => 'γ',
+  '&ge;' => '≥',
+  '&harr;' => '↔',
+  '&hArr;' => '⇔',
+  '&hearts;' => '♥',
+  '&hellip;' => '…',
+  '&Iacute;' => 'Í',
+  '&iacute;' => 'í',
+  '&Icirc;' => 'Î',
+  '&icirc;' => 'î',
+  '&iexcl;' => '¡',
+  '&Igrave;' => 'Ì',
+  '&igrave;' => 'ì',
+  '&image;' => 'ℑ',
+  '&infin;' => '∞',
+  '&int;' => '∫',
+  '&Iota;' => 'Ι',
+  '&iota;' => 'ι',
+  '&iquest;' => '¿',
+  '&isin;' => '∈',
+  '&Iuml;' => 'Ï',
+  '&iuml;' => 'ï',
+  '&Kappa;' => 'Κ',
+  '&kappa;' => 'κ',
+  '&Lambda;' => 'Λ',
+  '&lambda;' => 'λ',
+  '&lang;' => '〈',
+  '&laquo;' => '«',
+  '&larr;' => '←',
+  '&lArr;' => '⇐',
+  '&lceil;' => '⌈',
+  '&ldquo;' => '“',
+  '&le;' => '≤',
+  '&lfloor;' => '⌊',
+  '&lowast;' => '∗',
+  '&loz;' => '◊',
+  '&lrm;' => '‎',
+  '&lsaquo;' => '‹',
+  '&lsquo;' => '‘',
+  '&macr;' => '¯',
+  '&mdash;' => '—',
+  '&micro;' => 'µ',
+  '&middot;' => '·',
+  '&minus;' => '−',
+  '&Mu;' => 'Μ',
+  '&mu;' => 'μ',
+  '&nabla;' => '∇',
+  '&nbsp;' => ' ',
+  '&ndash;' => '–',
+  '&ne;' => '≠',
+  '&ni;' => '∋',
+  '&not;' => '¬',
+  '&notin;' => '∉',
+  '&nsub;' => '⊄',
+  '&Ntilde;' => 'Ñ',
+  '&ntilde;' => 'ñ',
+  '&Nu;' => 'Ν',
+  '&nu;' => 'ν',
+  '&Oacute;' => 'Ó',
+  '&oacute;' => 'ó',
+  '&Ocirc;' => 'Ô',
+  '&ocirc;' => 'ô',
+  '&OElig;' => 'Œ',
+  '&oelig;' => 'œ',
+  '&Ograve;' => 'Ò',
+  '&ograve;' => 'ò',
+  '&oline;' => '‾',
+  '&Omega;' => 'Ω',
+  '&omega;' => 'ω',
+  '&Omicron;' => 'Ο',
+  '&omicron;' => 'ο',
+  '&oplus;' => '⊕',
+  '&or;' => '∨',
+  '&ordf;' => 'ª',
+  '&ordm;' => 'º',
+  '&Oslash;' => 'Ø',
+  '&oslash;' => 'ø',
+  '&Otilde;' => 'Õ',
+  '&otilde;' => 'õ',
+  '&otimes;' => '⊗',
+  '&Ouml;' => 'Ö',
+  '&ouml;' => 'ö',
+  '&para;' => '¶',
+  '&part;' => '∂',
+  '&permil;' => '‰',
+  '&perp;' => '⊥',
+  '&Phi;' => 'Φ',
+  '&phi;' => 'φ',
+  '&Pi;' => 'Π',
+  '&pi;' => 'π',
+  '&piv;' => 'ϖ',
+  '&plusmn;' => '±',
+  '&pound;' => '£',
+  '&prime;' => '′',
+  '&Prime;' => '″',
+  '&prod;' => '∏',
+  '&prop;' => '∝',
+  '&Psi;' => 'Ψ',
+  '&psi;' => 'ψ',
+  '&radic;' => '√',
+  '&rang;' => '〉',
+  '&raquo;' => '»',
+  '&rarr;' => '→',
+  '&rArr;' => '⇒',
+  '&rceil;' => '⌉',
+  '&rdquo;' => '”',
+  '&real;' => 'ℜ',
+  '&reg;' => '®',
+  '&rfloor;' => '⌋',
+  '&Rho;' => 'Ρ',
+  '&rho;' => 'ρ',
+  '&rlm;' => '‏',
+  '&rsaquo;' => '›',
+  '&rsquo;' => '’',
+  '&sbquo;' => '‚',
+  '&Scaron;' => 'Š',
+  '&scaron;' => 'š',
+  '&sdot;' => '⋅',
+  '&sect;' => '§',
+  '&shy;' => '',
+  '&Sigma;' => 'Σ',
+  '&sigma;' => 'σ',
+  '&sigmaf;' => 'ς',
+  '&sim;' => '∼',
+  '&spades;' => '♠',
+  '&sub;' => '⊂',
+  '&sube;' => '⊆',
+  '&sum;' => '∑',
+  '&sup1;' => '¹',
+  '&sup2;' => '²',
+  '&sup3;' => '³',
+  '&sup;' => '⊃',
+  '&supe;' => '⊇',
+  '&szlig;' => 'ß',
+  '&Tau;' => 'Τ',
+  '&tau;' => 'τ',
+  '&there4;' => '∴',
+  '&Theta;' => 'Θ',
+  '&theta;' => 'θ',
+  '&thetasym;' => 'ϑ',
+  '&thinsp;' => ' ',
+  '&THORN;' => 'Þ',
+  '&thorn;' => 'þ',
+  '&tilde;' => '˜',
+  '&times;' => '×',
+  '&trade;' => '™',
+  '&Uacute;' => 'Ú',
+  '&uacute;' => 'ú',
+  '&uarr;' => '↑',
+  '&uArr;' => '⇑',
+  '&Ucirc;' => 'Û',
+  '&ucirc;' => 'û',
+  '&Ugrave;' => 'Ù',
+  '&ugrave;' => 'ù',
+  '&uml;' => '¨',
+  '&upsih;' => 'ϒ',
+  '&Upsilon;' => 'Υ',
+  '&upsilon;' => 'υ',
+  '&Uuml;' => 'Ü',
+  '&uuml;' => 'ü',
+  '&weierp;' => '℘',
+  '&Xi;' => 'Ξ',
+  '&xi;' => 'ξ',
+  '&Yacute;' => 'Ý',
+  '&yacute;' => 'ý',
+  '&yen;' => '¥',
+  '&yuml;' => 'ÿ',
+  '&Yuml;' => 'Ÿ',
+  '&Zeta;' => 'Ζ',
+  '&zeta;' => 'ζ',
+  '&zwj;' => '‍',
+  '&zwnj;' => '‌',
+  '&gt;' => '>',
+  '&lt;' => '<',
+  '&quot;' => '"',
+  // Add apostrophe: &apos; is defined by XML/XHTML, not by HTML 4.01.
+  '&apos;' => "'",
+);
diff --git a/includes/unicode.inc b/includes/unicode.inc
index d9b81f062..f25f472d2 100644
--- a/includes/unicode.inc
+++ b/includes/unicode.inc
@@ -323,33 +323,30 @@ function _mime_header_decode($matches) {
  *   array('<', '&', '"'). This affects both named and numerical entities.
  */
 function decode_entities($text, $exclude = array()) {
-  static $table;
-  // We store named entities in a table for quick processing.
-  if (!isset($table)) {
-    // Get all named HTML entities.
-    $table = array_flip(get_html_translation_table(HTML_ENTITIES));
-    // PHP gives us ISO-8859-1 data, we need UTF-8.
-    $table = array_map('utf8_encode', $table);
-    // Add apostrophe (XML)
-    $table['&apos;'] = "'";
-  }
-  $newtable = array_diff($table, $exclude);
+  static $html_entities;
+  if (!isset($html_entities)) {
+    include DRUPAL_ROOT . '/includes/unicode.entities.inc';
+  }
+
+  // Flip the exclude list so that we can do quick lookups later.
+  $exclude = array_flip($exclude);
 
   // Use a regexp to select all entities in one pass, to avoid decoding 
   // double-escaped entities twice. The PREG_REPLACE_EVAL modifier 'e' is
   // being used to allow for a callback (see 
   // http://php.net/manual/en/reference.pcre.pattern.modifiers).
-  return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $newtable, $exclude)', $text);
+  return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $html_entities, $exclude)', $text);
 }
 
 /**
  * Helper function for decode_entities
  */
-function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
+function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
   // Named entity
   if (!$prefix) {
-    if (isset($table[$original])) {
-      return $table[$original];
+    // Decode a known named entity unless its decoded character is excluded.
+    if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
+      return $html_entities[$original];
     }
     else {
       return $original;
@@ -383,7 +380,7 @@ function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
          . chr(0x80 | ( $codepoint        & 0x3F));
   }
   // Check for excluded characters
-  if (in_array($str, $exclude)) {
+  if (isset($exclude[$str])) {
     return $original;
   }
   else {
diff --git a/modules/simpletest/tests/unicode.test b/modules/simpletest/tests/unicode.test
index 8970fca25..e2f3206be 100644
--- a/modules/simpletest/tests/unicode.test
+++ b/modules/simpletest/tests/unicode.test
@@ -162,6 +162,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
       'Drupal' => 'Drupal',
       '<script>' => '<script>',
       '&lt;script&gt;' => '<script>',
+      '&#60;script&#62;' => '<script>',
       '&amp;lt;script&amp;gt;' => '&lt;script&gt;',
       '"' => '"',
       '&#34;' => '"',
@@ -178,6 +179,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
       '&#8594;' => '→',
       '➼' => '➼',
       '&#10172;' => '➼',
+      '&euro;' => '€',
     );
     foreach ($testcase as $input => $output) {
       $this->assertEqual(decode_entities($input), $output, t('Make sure the decoded entity of @input is @output', array('@input' => $input, '@output' => $output)));
@@ -189,6 +191,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
       'Drupal' => 'Drupal',
       '<script>' => '<script>',
       '&lt;script&gt;' => '&lt;script>',
+      '&#60;script&#62;' => '&#60;script>',
       '&amp;lt;script&amp;gt;' => '&amp;lt;script&amp;gt;',
       '"' => '"',
       '&#34;' => '&#34;',
@@ -205,6 +208,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
       '&#8594;' => '→',
       '➼' => '➼',
       '&#10172;' => '➼',
+      '&euro;' => '€',
     );
     $exclude = array('<', '&', '"');
     foreach ($testcase as $input => $output) {
author	Dries Buytaert <dries@buytaert.net>	2009-01-02 22:09:53 +0000
committer	Dries Buytaert <dries@buytaert.net>	2009-01-02 22:09:53 +0000
commit	45f34467fbc5c398adccc9555a15dbec7353accf (patch)
tree	397285ff394e82ed369b6cf85574ad260e871077
parent	86aa636c8b5b2a3ef9437352af28cb8ed5f04523 (diff)
download	brdo-45f34467fbc5c398adccc9555a15dbec7353accf.tar.gz brdo-45f34467fbc5c398adccc9555a15dbec7353accf.tar.bz2