From 93728e5d325371f6ec92213a6207a6bfddfc261b Mon Sep 17 00:00:00 2001 From: Dries Buytaert Date: Mon, 27 Jul 2009 20:15:35 +0000 Subject: - Patch #374441 by tic200, Damien Tournoud, scor: refactor Drupal HTML corrector using PHP5's XML/Dom parser. --- modules/field/modules/text/text.test | 12 +++--- modules/filter/filter.module | 79 ++++++--------------------------- modules/filter/filter.test | 84 ++++++++++++++++++++++++++++++------ modules/system/system.install | 11 +++++ 4 files changed, 102 insertions(+), 84 deletions(-) (limited to 'modules') diff --git a/modules/field/modules/text/text.test b/modules/field/modules/text/text.test index ae8a0a06a..db7c96c7c 100644 --- a/modules/field/modules/text/text.test +++ b/modules/field/modules/text/text.test @@ -286,12 +286,12 @@ class TextSummaryTestCase extends DrupalWebTestCase { // And using a text format WITH the line-break and htmlcorrector filters. $expected_lb = array( "

\nHi\n

\n

\nfolks\n
\n!\n

", - "<", - "

", - "

", - "

", - "

", + "", + "

", + "

", + "

", + "

", + "

", "

\nHi

", "

\nHi

", "

\nHi

", diff --git a/modules/filter/filter.module b/modules/filter/filter.module index 00c4b5568..6b578a0a0 100644 --- a/modules/filter/filter.module +++ b/modules/filter/filter.module @@ -757,74 +757,21 @@ function _filter_url($text, $format) { * Scan input and make sure that all HTML tags are properly closed and nested. */ function _filter_htmlcorrector($text) { - // Prepare tag lists. - static $no_nesting, $single_use; - if (!isset($no_nesting)) { - // Tags which cannot be nested but are typically left unclosed. - $no_nesting = drupal_map_assoc(array('li', 'p')); - - // Single use tags in HTML4 - $single_use = drupal_map_assoc(array('base', 'meta', 'link', 'hr', 'br', 'param', 'img', 'area', 'input', 'col', 'frame')); + // Ignore warnings during HTML soup loading. + $htmlDom = @DOMDocument::loadHTML('' . $text . ''); + + // The result of DOMDocument->saveXML($bodyNode) is a partial (X)HTML document. + // We only need what is inside the body tag. + $bodyNode = $htmlDom->getElementsByTagName('body')->item(0); + if (preg_match("|^]*>(.*)$|s", $htmlDom->saveXML($bodyNode), $matches)) { + $body_content = $matches[1]; + // The XHTML guidelines recommend to include a space before the trailing / + // and > of empty elements for better rendering on HTML user agents. + return preg_replace('|<([^>]*)/>|i', '<$1 />', $body_content); } - - // Properly entify angles. - $text = preg_replace('!<([^a-zA-Z/])!', '<\1', $text); - - // Split tags from text. - $split = preg_split('/<([^>]+?)>/', $text, -1, PREG_SPLIT_DELIM_CAPTURE); - // Note: PHP ensures the array consists of alternating delimiters and literals - // and begins and ends with a literal (inserting $null as required). - - $tag = FALSE; // Odd/even counter. Tag or no tag. - $stack = array(); - $output = ''; - foreach ($split as $value) { - // Process HTML tags. - if ($tag) { - list($tagname) = explode(' ', strtolower($value), 2); - // Closing tag - if ($tagname{0} == '/') { - $tagname = substr($tagname, 1); - // Discard XHTML closing tags for single use tags. - if (!isset($single_use[$tagname])) { - // See if we possibly have a matching opening tag on the stack. - if (in_array($tagname, $stack)) { - // Close other tags lingering first. - do { - $output .= ''; - } while (array_shift($stack) != $tagname); - } - // Otherwise, discard it. - } - } - // Opening tag - else { - // See if we have an identical 'no nesting' tag already open and close it if found. - if (count($stack) && ($stack[0] == $tagname) && isset($no_nesting[$stack[0]])) { - $output .= ''; - } - // Push non-single-use tags onto the stack - if (!isset($single_use[$tagname])) { - array_unshift($stack, $tagname); - } - // Add trailing slash to single-use tags as per X(HT)ML. - else { - $value = rtrim($value, ' /') . ' /'; - } - $output .= '<' . $value . '>'; - } - } - else { - // Passthrough all text. - $output .= $value; - } - $tag = !$tag; - } - // Close remaining tags. - while (count($stack) > 0) { - $output .= ''; + else { + return ''; } - return $output; } /** diff --git a/modules/filter/filter.test b/modules/filter/filter.test index 98834d6e4..90ee5e7e1 100644 --- a/modules/filter/filter.test +++ b/modules/filter/filter.test @@ -182,24 +182,20 @@ class FilterAdminTestCase extends DrupalWebTestCase { } } -class FilterTestCase extends DrupalWebTestCase { +/** + * Unit tests for core filters. + */ +class FilterUnitTest extends DrupalWebTestCase { protected $format; public static function getInfo() { return array( 'name' => 'Core filters', - 'description' => 'Filter each filter individually: Convert URLs into links, Convert line breaks, Correct broken HTML, Escape all HTML, Limit allowed HTML tags.', + 'description' => 'Filter each filter individually: convert line breaks, correct broken HTML.', 'group' => 'Filter', ); } - function setUp() { - parent::setUp(); - - $admin_user = $this->drupalCreateUser(array('administer filters', 'create page content')); - $this->drupalLogin($admin_user); - } - /** * Test the line break filter. */ @@ -594,17 +590,17 @@ class FilterTestCase extends DrupalWebTestCase { } /** - * Test the HTML corrector. + * Test the HTML corrector filter. * * @todo This test could really use some validity checking function. */ - function testHtmlCorrector() { + function testHtmlCorrectorFilter() { // Tag closing. $f = _filter_htmlcorrector('

text'); $this->assertEqual($f, '

text

', t('HTML corrector -- tag closing at the end of input.')); $f = _filter_htmlcorrector('

text

text'); - $this->assertEqual($f, '

text

text

', t('HTML corrector -- tag closing.')); + $this->assertEqual($f, '

text

text

', t('HTML corrector -- tag closing.')); $f = _filter_htmlcorrector("
  • e1
  • e2"); $this->assertEqual($f, "
    • e1
    • e2
    ", t('HTML corrector -- unclosed list tags.')); @@ -615,6 +611,70 @@ class FilterTestCase extends DrupalWebTestCase { // XHTML slash for empty elements. $f = _filter_htmlcorrector('

    '); $this->assertEqual($f, '

    ', t('HTML corrector -- XHTML closing slash.')); + + $f = _filter_htmlcorrector('

    test

    '); + $this->assertEqual($f, '

    test

    ', t('HTML corrector -- Convert uppercased tags to proper lowercased ones.')); + + $f = _filter_htmlcorrector('

    test

    '); + $this->assertEqual($f, '

    test

    ', t('HTML corrector -- Convert uppercased tags to proper lowercased ones.')); + + $f = _filter_htmlcorrector('test
    '); + $this->assertEqual($f, 'test
    ', t('HTML corrector -- Let proper XHTML pass thru.')); + + $f = _filter_htmlcorrector('test
    '); + $this->assertEqual($f, 'test
    ', t('HTML corrector -- Let proper XHTML pass thru.')); + + $f = _filter_htmlcorrector(''); + $this->assertEqual($f, '', t('HTML corrector -- Let proper XHTML pass thru.')); + + $f = _filter_htmlcorrector('test1
    test2'); + $this->assertEqual($f, 'test1
    test2', t('HTML corrector -- Automatically close single tags.')); + + $f = _filter_htmlcorrector('line1
    line2'); + $this->assertEqual($f, 'line1
    line2', t('HTML corrector -- Automatically close single tags.')); + + $f = _filter_htmlcorrector('line1
    line2'); + $this->assertEqual($f, 'line1
    line2', t('HTML corrector -- Automatically close single tags.')); + + $f = _filter_htmlcorrector('test'); + $this->assertEqual($f, 'test', t('HTML corrector -- Automatically close single tags.')); + + $f = _filter_htmlcorrector('

    line1


    line2

    '); + $this->assertEqual($f, '

    line1


    line2', t('HTML corrector -- Move non-inline elements outside of inline containers.')); + + $f = _filter_htmlcorrector('

    line1

    line2

    '); + $this->assertEqual($f, '

    line1

    line2
    ', t('HTML corrector -- Move non-inline elements outside of inline containers.')); + + $f = _filter_htmlcorrector('

    test

    test

    \n'); + $this->assertEqual($f, '

    test

    test

    \n', t('HTML corrector -- Auto-close improperly nested tags.')); + + $f = _filter_htmlcorrector('

    Line1
    bold stuff'); + $this->assertEqual($f, '

    Line1
    bold stuff

    ', t('HTML corrector -- Properly close unclosed tags, and remove useless closing tags.')); + + $f = _filter_htmlcorrector('test '); + $this->assertEqual($f, 'test ', t('HTML corrector -- Do not touch HTML comments.')); + + $f = _filter_htmlcorrector('test '); + $this->assertEqual($f, 'test ', t('HTML corrector -- Do not touch HTML comments.')); + + $f = _filter_htmlcorrector('test '); + $this->assertEqual($f, 'test ', t('HTML corrector -- Do not touch HTML comments.')); + + $f = _filter_htmlcorrector('test '); + $this->assertEqual($f, 'test ', t('HTML corrector -- Do not touch HTML comments.')); + + $f = _filter_htmlcorrector('test '); + $this->assertEqual($f, 'test ', t('HTML corrector -- Do not touch HTML comments.')); + + $f = _filter_htmlcorrector('

    test\n

    \n'); + $this->assertEqual($f, '

    test\n

    \n', t('HTML corrector -- New-lines are accepted and kept as-is.')); + + $f = _filter_htmlcorrector('

    دروبال'); + $this->assertEqual($f, '

    دروبال

    ', t('HTML corrector -- Encoding is correctly kept.')); } function createFormat($filter) { diff --git a/modules/system/system.install b/modules/system/system.install index 38640dd7e..6e2465fca 100644 --- a/modules/system/system.install +++ b/modules/system/system.install @@ -266,6 +266,17 @@ function system_requirements($phase) { include_once DRUPAL_ROOT . '/includes/unicode.inc'; $requirements = array_merge($requirements, unicode_requirements()); + // Verify if the DOM PHP 5 extension is available. + $has_dom = class_exists('DOMDocument'); + if (!$has_dom) { + $requirements['php_dom'] = array( + 'title' => $t('PHP DOM Extension'), + 'value' => $t('Not found'), + 'severity' => REQUIREMENT_ERROR, + 'description' => $t("The DOM extension is part of PHP 5 core, but doesn't seem to be enabled on your system. You need to enable the DOM extension on your PHP installation."), + ); + } + if ($phase == 'runtime') { // Check for update status module. if (!module_exists('update')) { -- cgit v1.2.3