- Patch #124980 by jhodgdon: Indexer is removing "..." and "--" instead of replacing them with a space.

author: Dries Buytaert <dries@buytaert.net> 2010-07-22 16:16:42 +0000
committer: Dries Buytaert <dries@buytaert.net> 2010-07-22 16:16:42 +0000
commit: d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35 (patch)
tree: fb10b3cccb6ecc2166404e6049e18c44bb9cd288 /modules/search
parent: 19c7193c4c3b54efc053d8fee98cb5d00fd79c18 (diff)
download: brdo-d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35.tar.gz
brdo-d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35.tar.bz2
2 files changed, 27 insertions, 7 deletions
diff --git a/modules/search/search.module b/modules/search/search.module
index 334e8c14c..2ab3a4210 100644
--- a/modules/search/search.module
+++ b/modules/search/search.module
@@ -372,10 +372,13 @@ function search_simplify($text) {
   // Readable regexp: ([number]+)[punctuation]+(?=[number])
   $text = preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);
 
+  // Runs of two or more dots and/or dashes are word boundaries, so replace
+  // them with a space. No need for the Unicode modifier here: ASCII characters
+  // (0-127) can't match bytes of higher UTF-8 characters, whose leftmost bit is 1.
+  $text = preg_replace('/[.-]{2,}/', ' ', $text);
+
   // The dot, underscore and dash are simply removed. This allows meaningful
-  // search behavior with acronyms and URLs. No need to use the unicode modifer
-  // here because 0-127 ASCII characters can't match higher UTF-8 characters as
-  // the leftmost bit of those are 1.
+  // search behavior with acronyms and URLs. See unicode note directly above.
   $text = preg_replace('/[._-]+/', '', $text);
 
   // With the exception of the rules above, we consider all punctuation,
diff --git a/modules/search/search.test b/modules/search/search.test
index ae93e76f8..b6a72d0bd 100644
--- a/modules/search/search.test
+++ b/modules/search/search.test
@@ -935,22 +935,22 @@ class SearchCommentCountToggleTestCase extends DrupalWebTestCase {
     $this->drupalPost('', $edit, t('Search'));
     $this->assertNoText(t('0 comments'), t('Empty comment count does not display for nodes with comment status set to Hidden'));
     $this->assertNoText(t('1 comment'), t('Non-empty comment count does not display for nodes with comment status set to Hidden'));
-  }  
+  }
 }
 
 /**
- * Test search_simplify() on every Unicode character.
+ * Test search_simplify() on every Unicode character, and some other cases.
  */
 class SearchSimplifyTestCase extends DrupalWebTestCase {
   public static function getInfo() {
     return array(
       'name' => 'Search simplify',
-      'description' => 'Check that simplification works as intended.',
+      'description' => 'Check that the search_simply() function works as intended.',
       'group' => 'Search',
     );
   }
 
-  function testSearchSimplify() {
+  function testSearchSimplifyUnicode() {
     $input = file_get_contents(DRUPAL_ROOT . '/modules/search/tests/UnicodeTest.txt');
     $strings = explode(chr(10), $input);
     foreach ($strings as $key => $string) {
@@ -969,6 +969,23 @@ class SearchSimplifyTestCase extends DrupalWebTestCase {
     // Diff really does not like files starting with \0 so test it separately.
     $this->assertIdentical(' ', search_simplify($string), t('Search simplify works for ASCII control characters.'));
   }
+
+  /**
+   * Tests that search_simplify() does the right thing with punctuation.
+   */
+  function testSearchSimplifyPunctuation() {
+    $cases = array(
+      array('20.03/94-28,876', '20039428876', 'Punctuation removed from numbers'),
+      array('great...drupal--module', 'great drupal module', 'Multiple dot and dashes are word boundaries'),
+      array('very_great-drupal.module', 'verygreatdrupalmodule', 'Single dot, dash, underscore are removed'),
+      array('regular,punctuation;word', 'regular punctuation word', 'Punctuation is a word boundary'),
+    );
+
+    foreach ($cases as $case) {
+      $out = trim(search_simplify($case[0]));
+      $this->assertEqual($out, $case[1], $case[2]);
+    }
+  }
 }
 
 /**
author	Dries Buytaert <dries@buytaert.net>	2010-07-22 16:16:42 +0000
committer	Dries Buytaert <dries@buytaert.net>	2010-07-22 16:16:42 +0000
commit	d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35 (patch)
tree	fb10b3cccb6ecc2166404e6049e18c44bb9cd288 /modules/search
parent	19c7193c4c3b54efc053d8fee98cb5d00fd79c18 (diff)
download	brdo-d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35.tar.gz brdo-d79dff03ac0a40457ec5d380f73ecd0bf1ea3e35.tar.bz2