summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- includes/unicode.entities.inc 266
-rw-r--r-- includes/unicode.inc 29
-rw-r--r-- modules/simpletest/tests/unicode.test 4
3 files changed, 283 insertions, 16 deletions
diff --git a/includes/unicode.entities.inc b/includes/unicode.entities.inc
new file mode 100644
index 000000000..486fd9737
--- /dev/null
+++ b/includes/unicode.entities.inc
@@ -0,0 +1,266 @@
+<?php
+// $Id$
+
+/**
+ * @file
+ * (X)HTML entities, as defined in HTML 4.01.
+ *
+ * @see http://www.w3.org/TR/html401/sgml/entities.html
+ */
+
+$html_entities = array(
+ '&Aacute;' => 'Á',
+ '&aacute;' => 'á',
+ '&Acirc;' => 'Â',
+ '&acirc;' => 'â',
+ '&acute;' => '´',
+ '&AElig;' => 'Æ',
+ '&aelig;' => 'æ',
+ '&Agrave;' => 'À',
+ '&agrave;' => 'à',
+ '&alefsym;' => 'ℵ',
+ '&Alpha;' => 'Α',
+ '&alpha;' => 'α',
+ '&amp;' => '&',
+ '&and;' => '∧',
+ '&ang;' => '∠',
+ '&Aring;' => 'Å',
+ '&aring;' => 'å',
+ '&asymp;' => '≈',
+ '&Atilde;' => 'Ã',
+ '&atilde;' => 'ã',
+ '&Auml;' => 'Ä',
+ '&auml;' => 'ä',
+ '&bdquo;' => '„',
+ '&Beta;' => 'Β',
+ '&beta;' => 'β',
+ '&brvbar;' => '¦',
+ '&bull;' => '•',
+ '&cap;' => '∩',
+ '&Ccedil;' => 'Ç',
+ '&ccedil;' => 'ç',
+ '&cedil;' => '¸',
+ '&cent;' => '¢',
+ '&Chi;' => 'Χ',
+ '&chi;' => 'χ',
+ '&circ;' => 'ˆ',
+ '&clubs;' => '♣',
+ '&cong;' => '≅',
+ '&copy;' => '©',
+ '&crarr;' => '↵',
+ '&cup;' => '∪',
+ '&curren;' => '¤',
+ '&dagger;' => '†',
+ '&Dagger;' => '‡',
+ '&darr;' => '↓',
+ '&dArr;' => '⇓',
+ '&deg;' => '°',
+ '&Delta;' => 'Δ',
+ '&delta;' => 'δ',
+ '&diams;' => '♦',
+ '&divide;' => '÷',
+ '&Eacute;' => 'É',
+ '&eacute;' => 'é',
+ '&Ecirc;' => 'Ê',
+ '&ecirc;' => 'ê',
+ '&Egrave;' => 'È',
+ '&egrave;' => 'è',
+ '&empty;' => '∅',
+ '&emsp;' => ' ',
+ '&ensp;' => ' ',
+ '&Epsilon;' => 'Ε',
+ '&epsilon;' => 'ε',
+ '&equiv;' => '≡',
+ '&Eta;' => 'Η',
+ '&eta;' => 'η',
+ '&ETH;' => 'Ð',
+ '&eth;' => 'ð',
+ '&Euml;' => 'Ë',
+ '&euml;' => 'ë',
+ '&euro;' => '€',
+ '&exist;' => '∃',
+ '&fnof;' => 'ƒ',
+ '&forall;' => '∀',
+ '&frac12;' => '½',
+ '&frac14;' => '¼',
+ '&frac34;' => '¾',
+ '&frasl;' => '⁄',
+ '&Gamma;' => 'Γ',
+ '&gamma;' => 'γ',
+ '&ge;' => '≥',
+ '&harr;' => '↔',
+ '&hArr;' => '⇔',
+ '&hearts;' => '♥',
+ '&hellip;' => '…',
+ '&Iacute;' => 'Í',
+ '&iacute;' => 'í',
+ '&Icirc;' => 'Î',
+ '&icirc;' => 'î',
+ '&iexcl;' => '¡',
+ '&Igrave;' => 'Ì',
+ '&igrave;' => 'ì',
+ '&image;' => 'ℑ',
+ '&infin;' => '∞',
+ '&int;' => '∫',
+ '&Iota;' => 'Ι',
+ '&iota;' => 'ι',
+ '&iquest;' => '¿',
+ '&isin;' => '∈',
+ '&Iuml;' => 'Ï',
+ '&iuml;' => 'ï',
+ '&Kappa;' => 'Κ',
+ '&kappa;' => 'κ',
+ '&Lambda;' => 'Λ',
+ '&lambda;' => 'λ',
+ '&lang;' => '〈',
+ '&laquo;' => '«',
+ '&larr;' => '←',
+ '&lArr;' => '⇐',
+ '&lceil;' => '⌈',
+ '&ldquo;' => '“',
+ '&le;' => '≤',
+ '&lfloor;' => '⌊',
+ '&lowast;' => '∗',
+ '&loz;' => '◊',
+ '&lrm;' => '‎',
+ '&lsaquo;' => '‹',
+ '&lsquo;' => '‘',
+ '&macr;' => '¯',
+ '&mdash;' => '—',
+ '&micro;' => 'µ',
+ '&middot;' => '·',
+ '&minus;' => '−',
+ '&Mu;' => 'Μ',
+ '&mu;' => 'μ',
+ '&nabla;' => '∇',
+ '&nbsp;' => ' ',
+ '&ndash;' => '–',
+ '&ne;' => '≠',
+ '&ni;' => '∋',
+ '&not;' => '¬',
+ '&notin;' => '∉',
+ '&nsub;' => '⊄',
+ '&Ntilde;' => 'Ñ',
+ '&ntilde;' => 'ñ',
+ '&Nu;' => 'Ν',
+ '&nu;' => 'ν',
+ '&Oacute;' => 'Ó',
+ '&oacute;' => 'ó',
+ '&Ocirc;' => 'Ô',
+ '&ocirc;' => 'ô',
+ '&OElig;' => 'Œ',
+ '&oelig;' => 'œ',
+ '&Ograve;' => 'Ò',
+ '&ograve;' => 'ò',
+ '&oline;' => '‾',
+ '&Omega;' => 'Ω',
+ '&omega;' => 'ω',
+ '&Omicron;' => 'Ο',
+ '&omicron;' => 'ο',
+ '&oplus;' => '⊕',
+ '&or;' => '∨',
+ '&ordf;' => 'ª',
+ '&ordm;' => 'º',
+ '&Oslash;' => 'Ø',
+ '&oslash;' => 'ø',
+ '&Otilde;' => 'Õ',
+ '&otilde;' => 'õ',
+ '&otimes;' => '⊗',
+ '&Ouml;' => 'Ö',
+ '&ouml;' => 'ö',
+ '&para;' => '¶',
+ '&part;' => '∂',
+ '&permil;' => '‰',
+ '&perp;' => '⊥',
+ '&Phi;' => 'Φ',
+ '&phi;' => 'φ',
+ '&Pi;' => 'Π',
+ '&pi;' => 'π',
+ '&piv;' => 'ϖ',
+ '&plusmn;' => '±',
+ '&pound;' => '£',
+ '&prime;' => '′',
+ '&Prime;' => '″',
+ '&prod;' => '∏',
+ '&prop;' => '∝',
+ '&Psi;' => 'Ψ',
+ '&psi;' => 'ψ',
+ '&radic;' => '√',
+ '&rang;' => '〉',
+ '&raquo;' => '»',
+ '&rarr;' => '→',
+ '&rArr;' => '⇒',
+ '&rceil;' => '⌉',
+ '&rdquo;' => '”',
+ '&real;' => 'ℜ',
+ '&reg;' => '®',
+ '&rfloor;' => '⌋',
+ '&Rho;' => 'Ρ',
+ '&rho;' => 'ρ',
+ '&rlm;' => '‏',
+ '&rsaquo;' => '›',
+ '&rsquo;' => '’',
+ '&sbquo;' => '‚',
+ '&Scaron;' => 'Š',
+ '&scaron;' => 'š',
+ '&sdot;' => '⋅',
+ '&sect;' => '§',
+ '&shy;' => '­',
+ '&Sigma;' => 'Σ',
+ '&sigma;' => 'σ',
+ '&sigmaf;' => 'ς',
+ '&sim;' => '∼',
+ '&spades;' => '♠',
+ '&sub;' => '⊂',
+ '&sube;' => '⊆',
+ '&sum;' => '∑',
+ '&sup1;' => '¹',
+ '&sup2;' => '²',
+ '&sup3;' => '³',
+ '&sup;' => '⊃',
+ '&supe;' => '⊇',
+ '&szlig;' => 'ß',
+ '&Tau;' => 'Τ',
+ '&tau;' => 'τ',
+ '&there4;' => '∴',
+ '&Theta;' => 'Θ',
+ '&theta;' => 'θ',
+ '&thetasym;' => 'ϑ',
+ '&thinsp;' => ' ',
+ '&THORN;' => 'Þ',
+ '&thorn;' => 'þ',
+ '&tilde;' => '˜',
+ '&times;' => '×',
+ '&trade;' => '™',
+ '&Uacute;' => 'Ú',
+ '&uacute;' => 'ú',
+ '&uarr;' => '↑',
+ '&uArr;' => '⇑',
+ '&Ucirc;' => 'Û',
+ '&ucirc;' => 'û',
+ '&Ugrave;' => 'Ù',
+ '&ugrave;' => 'ù',
+ '&uml;' => '¨',
+ '&upsih;' => 'ϒ',
+ '&Upsilon;' => 'Υ',
+ '&upsilon;' => 'υ',
+ '&Uuml;' => 'Ü',
+ '&uuml;' => 'ü',
+ '&weierp;' => '℘',
+ '&Xi;' => 'Ξ',
+ '&xi;' => 'ξ',
+ '&Yacute;' => 'Ý',
+ '&yacute;' => 'ý',
+ '&yen;' => '¥',
+ '&yuml;' => 'ÿ',
+ '&Yuml;' => 'Ÿ',
+ '&Zeta;' => 'Ζ',
+ '&zeta;' => 'ζ',
+ '&zwj;' => '‍',
+ '&zwnj;' => '‌',
+ '&gt;' => '>',
+ '&lt;' => '<',
+ '&quot;' => '"',
+ // Add apostrophe (XML).
+ '&apos;' => "'",
+);
diff --git a/includes/unicode.inc b/includes/unicode.inc
index d9b81f062..f25f472d2 100644
--- a/includes/unicode.inc
+++ b/includes/unicode.inc
@@ -323,33 +323,30 @@ function _mime_header_decode($matches) {
* array('<', '&', '"'). This affects both named and numerical entities.
*/
function decode_entities($text, $exclude = array()) {
- static $table;
- // We store named entities in a table for quick processing.
- if (!isset($table)) {
- // Get all named HTML entities.
- $table = array_flip(get_html_translation_table(HTML_ENTITIES));
- // PHP gives us ISO-8859-1 data, we need UTF-8.
- $table = array_map('utf8_encode', $table);
- // Add apostrophe (XML)
- $table['&apos;'] = "'";
- }
- $newtable = array_diff($table, $exclude);
+ static $html_entities;
+ if (!isset($html_entities)) {
+ include DRUPAL_ROOT . '/includes/unicode.entities.inc';
+ }
+
+ // Flip the exclude list so that we can do quick lookups later.
+ $exclude = array_flip($exclude);
// Use a regexp to select all entities in one pass, to avoid decoding
// double-escaped entities twice. The PREG_REPLACE_EVAL modifier 'e' is
// being used to allow for a callback (see
// http://php.net/manual/en/reference.pcre.pattern.modifiers).
- return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $newtable, $exclude)', $text);
+ return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $html_entities, $exclude)', $text);
}
/**
* Helper function for decode_entities().
*/
-function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
+function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
// Named entity
if (!$prefix) {
- if (isset($table[$original])) {
- return $table[$original];
+ // A named entity not in the exclude list.
+ if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
+ return $html_entities[$original];
}
else {
return $original;
@@ -383,7 +380,7 @@ function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
. chr(0x80 | ( $codepoint & 0x3F));
}
// Check for excluded characters
- if (in_array($str, $exclude)) {
+ if (isset($exclude[$str])) {
return $original;
}
else {
diff --git a/modules/simpletest/tests/unicode.test b/modules/simpletest/tests/unicode.test
index 8970fca25..e2f3206be 100644
--- a/modules/simpletest/tests/unicode.test
+++ b/modules/simpletest/tests/unicode.test
@@ -162,6 +162,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'Drupal' => 'Drupal',
'<script>' => '<script>',
'&lt;script&gt;' => '<script>',
+ '&#60;script&#62;' => '<script>',
'&amp;lt;script&amp;gt;' => '&lt;script&gt;',
'"' => '"',
'&#34;' => '"',
@@ -178,6 +179,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'&#8594;' => '→',
'➼' => '➼',
'&#10172;' => '➼',
+ '&euro;' => '€',
);
foreach ($testcase as $input => $output) {
$this->assertEqual(decode_entities($input), $output, t('Make sure the decoded entity of @input is @output', array('@input' => $input, '@output' => $output)));
@@ -189,6 +191,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'Drupal' => 'Drupal',
'<script>' => '<script>',
'&lt;script&gt;' => '&lt;script>',
+ '&#60;script&#62;' => '&#60;script>',
'&amp;lt;script&amp;gt;' => '&amp;lt;script&amp;gt;',
'"' => '"',
'&#34;' => '&#34;',
@@ -205,6 +208,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'&#8594;' => '→',
'➼' => '➼',
'&#10172;' => '➼',
+ '&euro;' => '€',
);
$exclude = array('<', '&', '"');
foreach ($testcase as $input => $output) {