diff options
author | Dries Buytaert <dries@buytaert.net> | 2010-07-22 16:16:42 +0000 |
---|---|---|
committer | Dries Buytaert <dries@buytaert.net> | 2010-07-22 16:16:42 +0000 |
commit | d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35 (patch) | |
tree | fb10b3cccb6ecc2166404e6049e18c44bb9cd288 /modules/search | |
parent | 19c7193c4c3b54efc053d8fee98cb5d00fd79c18 (diff) | |
download | brdo-d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35.tar.gz brdo-d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35.tar.bz2 |
- Patch #124980 by jhodgdon: Indexer is removing ... and -- instead of replacing with a space.
Diffstat (limited to 'modules/search')
-rw-r--r-- | modules/search/search.module | 9 | ||||
-rw-r--r-- | modules/search/search.test | 25 |
2 files changed, 27 insertions, 7 deletions
diff --git a/modules/search/search.module b/modules/search/search.module index 334e8c14c..2ab3a4210 100644 --- a/modules/search/search.module +++ b/modules/search/search.module @@ -372,10 +372,13 @@ function search_simplify($text) { // Readable regexp: ([number]+)[punctuation]+(?=[number]) $text = preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text); + // Multiple dot and dash groups are word boundaries and replaced with space. + // No need to use the unicode modifier here because 0-127 ASCII characters + // can't match higher UTF-8 characters as the leftmost bit of those are 1. + $text = preg_replace('/[.-]{2,}/', ' ', $text); + // The dot, underscore and dash are simply removed. This allows meaningful - // search behavior with acronyms and URLs. No need to use the unicode modifer - // here because 0-127 ASCII characters can't match higher UTF-8 characters as - // the leftmost bit of those are 1. + // search behavior with acronyms and URLs. See unicode note directly above. $text = preg_replace('/[._-]+/', '', $text); // With the exception of the rules above, we consider all punctuation, diff --git a/modules/search/search.test b/modules/search/search.test index ae93e76f8..b6a72d0bd 100644 --- a/modules/search/search.test +++ b/modules/search/search.test @@ -935,22 +935,22 @@ class SearchCommentCountToggleTestCase extends DrupalWebTestCase { $this->drupalPost('', $edit, t('Search')); $this->assertNoText(t('0 comments'), t('Empty comment count does not display for nodes with comment status set to Hidden')); $this->assertNoText(t('1 comment'), t('Non-empty comment count does not display for nodes with comment status set to Hidden')); - } + } } /** - * Test search_simplify() on every Unicode character. + * Test search_simplify() on every Unicode character, and some other cases. 
*/ class SearchSimplifyTestCase extends DrupalWebTestCase { public static function getInfo() { return array( 'name' => 'Search simplify', - 'description' => 'Check that simplification works as intended.', + 'description' => 'Check that the search_simplify() function works as intended.', 'group' => 'Search', ); } - function testSearchSimplify() { + function testSearchSimplifyUnicode() { $input = file_get_contents(DRUPAL_ROOT . '/modules/search/tests/UnicodeTest.txt'); $strings = explode(chr(10), $input); foreach ($strings as $key => $string) { @@ -969,6 +969,23 @@ class SearchSimplifyTestCase extends DrupalWebTestCase { // Diff really does not like files starting with \0 so test it separately. $this->assertIdentical(' ', search_simplify($string), t('Search simplify works for ASCII control characters.')); } + + /** + * Tests that search_simplify() does the right thing with punctuation. + */ + function testSearchSimplifyPunctuation() { + $cases = array( + array('20.03/94-28,876', '20039428876', 'Punctuation removed from numbers'), + array('great...drupal--module', 'great drupal module', 'Multiple dot and dashes are word boundaries'), + array('very_great-drupal.module', 'verygreatdrupalmodule', 'Single dot, dash, underscore are removed'), + array('regular,punctuation;word', 'regular punctuation word', 'Punctuation is a word boundary'), + ); + + foreach ($cases as $case) { + $out = trim(search_simplify($case[0])); + $this->assertEqual($out, $case[1], $case[2]); + } + } } /** |