From bdb91dbe95efc586a2e9fb3aa5beae80fc656956 Mon Sep 17 00:00:00 2001 From: Dries Buytaert Date: Tue, 10 Aug 2010 01:11:36 +0000 Subject: - Patch #269911 by jhodgdon, Freso, kiamlaluno, mradcliffe: search result trimming should not fall inside HTML entities/tags. --- modules/search/search.module | 29 ++++++++++++++++++-------- modules/search/search.test | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 8 deletions(-) (limited to 'modules/search') diff --git a/modules/search/search.module b/modules/search/search.module index e37ba95c4..82d6a1b12 100644 --- a/modules/search/search.module +++ b/modules/search/search.module @@ -1090,8 +1090,11 @@ function search_excerpt($keys, $text) { preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches); $keys = array_merge($matches[2], $matches[3]); - // Prepare text - $text = ' ' . strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)) . ' '; + // Prepare text by stripping HTML tags and decoding HTML entities. + $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)); + $text = decode_entities($text); + + // Slash-escape quotes in the search keyword string. array_walk($keys, '_search_excerpt_replace'); $workkeys = $keys; @@ -1121,9 +1124,12 @@ function search_excerpt($keys, $text) { // $q) and behind it (position $s) if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) { $p = $match[0][1]; - if (($q = strpos($text, ' ', max(0, $p - 60))) !== FALSE) { - $end = substr($text, $p, 80); + if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) { + $end = substr($text . ' ', $p, 80); if (($s = strrpos($end, ' ')) !== FALSE) { + // Account for the added spaces. 
+ $q = max($q - 1, 0); + $s = min($s, drupal_strlen($end) - 1); $ranges[$q] = $p + $s; $length += $p + $s - $q; $included[$key] = $p + 1; @@ -1142,9 +1148,11 @@ function search_excerpt($keys, $text) { } } - // If we didn't find anything, return the beginning. if (count($ranges) == 0) { - return truncate_utf8($text, 256, TRUE, TRUE); + // We didn't find any keyword matches, so just return the first part of the + // text. We also need to re-encode any HTML special characters that we + // entity-decoded above. + return check_plain(truncate_utf8($text, 256, TRUE, TRUE)); } // Sort the text ranges by starting position. @@ -1174,7 +1182,12 @@ function search_excerpt($keys, $text) { foreach ($newranges as $from => $to) { $out[] = substr($text, $from, $to - $from); } - $text = (isset($newranges[0]) ? '' : '... ') . implode(' ... ', $out) . ' ...'; + + // Let translators have the ... separator text as one chunk. + $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...')); + + $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2]; + $text = check_plain($text); // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>'). $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text); @@ -1186,7 +1199,7 @@ function search_excerpt($keys, $text) { */ /** - * Helper function for array_walk in search_except. + * Helper function for array_walk() in search_excerpt(). */ function _search_excerpt_replace(&$text) { $text = preg_quote($text, '/'); diff --git a/modules/search/search.test b/modules/search/search.test index 9079326a1..c43f8d7d2 100644 --- a/modules/search/search.test +++ b/modules/search/search.test @@ -1137,6 +1137,54 @@ class SearchConfigSettingsForm extends DrupalWebTestCase { } } +/** + * Tests the search_excerpt() function. 
+ */ +class SearchExcerptTestCase extends DrupalUnitTestCase { + public static function getInfo() { + return array( + 'name' => 'Search excerpt extraction', + 'description' => 'Tests that the search_excerpt() function works.', + 'group' => 'Search', + ); + } + + function setUp() { + parent::setUp('search'); + } + + /** + * Tests search_excerpt() with several simulated search keywords. + * + * Passes keywords and a sample marked up string, "The quick + * brown fox jumps over the lazy dog", and compares it to the + * correctly marked up string. The correctly marked up string + * contains either highlighted keywords or the original marked + * up string if no keywords matched the string. + */ + function testSearchExcerpt() { + // Make some text with entities and tags. + $text = 'The <strong>quick</strong> <a href="#">brown</a> fox &amp; jumps <h2>over</h2> the lazy dog'; + // Note: The search_excerpt() function adds some extra spaces -- not + // important for HTML formatting. Remove these for comparison. + $expected = 'The quick brown fox &amp; jumps over the lazy dog'; + $result = preg_replace('| +|', ' ', search_excerpt('nothing', $text)); + $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Entire string is returned when keyword is not found in short string'); + + $result = preg_replace('| +|', ' ', search_excerpt('fox', $text)); + $this->assertEqual($result, 'The quick brown <strong>fox</strong> &amp; jumps over the lazy dog ...', 'Found keyword is highlighted'); + + $longtext = str_repeat($text . ' ', 10); + $result = preg_replace('| +|', ' ', search_excerpt('nothing', $text)); + $this->assertTrue(strpos($result, $expected) === 0, 'When keyword is not found in long string, return value starts as expected'); + + $entities = str_repeat('k&eacute;sz&iacute;t&eacute;se ', 20); + $result = preg_replace('| +|', ' ', search_excerpt('nothing', $entities)); + $this->assertFalse(strpos($result, '&'), 'Entities are not present in excerpt'); + $this->assertTrue(strpos($result, 'í') > 0, 'Entities are converted in excerpt'); + } +} + /** * Test the CJK tokenizer. */ -- cgit v1.2.3