- Patch #326016 by jhodgdon: PREG_CLASS_CJK doesn't include all CJK characters.

author: Dries Buytaert <dries@buytaert.net> 2010-05-09 19:46:11 +0000
committer: Dries Buytaert <dries@buytaert.net> 2010-05-09 19:46:11 +0000
commit: 2ece32e6749bf2ecfefb1aa508bf4e61d2b08097 (patch)
tree: 4d84fe521aa54111245f1bfb82d46fc745f4173c /modules/search
parent: daede057fdf1feb9ac363a694287e22a2797b425 (diff)
download: brdo-2ece32e6749bf2ecfefb1aa508bf4e61d2b08097.tar.gz
brdo-2ece32e6749bf2ecfefb1aa508bf4e61d2b08097.tar.bz2
2 files changed, 194 insertions, 14 deletions
diff --git a/modules/search/search.module b/modules/search/search.module
index 7c94c0f95..4eeb670e2 100644
--- a/modules/search/search.module
+++ b/modules/search/search.module
@@ -103,12 +103,25 @@ define('PREG_CLASS_PUNCTUATION',
   '\x{ff65}');
 
 /**
- * Matches all CJK characters that are candidates for auto-splitting
- * (Chinese, Japanese, Korean).
- * Contains kana and BMP ideographs.
+ * Matches CJK (Chinese, Japanese, Korean) letter-like characters.
+ *
+ * This list is derived from the "East Asian Scripts" section of
+ * http://www.unicode.org/charts/index.html, as well as a comment on
+ * http://unicode.org/reports/tr11/tr11-11.html listing some character
+ * ranges that are reserved for additional CJK ideographs.
+ *
+ * The character ranges do not include numbers, punctuation, or symbols, since
+ * these are handled separately in search. Note that radicals and strokes are
+ * considered symbols. (See
+ * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
+ *
+ * @see search_expand_cjk()
  */
-define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}' .
-'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
+define('PREG_CLASS_CJK', '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}' .
+  '\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}' .
+  '\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}' .
+  '\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}' .
+  '\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}');
 
 /**
  * Implements hook_help().
@@ -447,28 +460,45 @@ function search_simplify($text) {
 }
 
 /**
- * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
- * sequences of characters ('minimum_word_size' long).
+ * Splits CJK (Chinese, Japanese, Korean) text into tokens.
+ *
+ * The Search module matches exact words, where a word is defined to be a
+ * sequence of characters delimited by spaces or punctuation. CJK languages are
+ * written in long strings of characters, though, not split up into words. So
+ * in order to allow search matching, we split up CJK text into tokens
+ * consisting of consecutive, overlapping sequences of characters whose length
+ * is equal to the 'minimum_word_size' variable. This tokenizing is only done if
+ * the 'overlap_cjk' variable is TRUE.
+ *
+ * @param $matches
+ *   This function is a callback for preg_replace_callback(), which is called
+ *   from search_simplify(). So, $matches is an array of regular expression
+ *   matches, which means that $matches[0] contains the matched text -- a string
+ *   of CJK characters to tokenize.
+ *
+ * @return
+ *   Tokenized text, starting and ending with a space character.
  */
 function search_expand_cjk($matches) {
   $min = variable_get('minimum_word_size', 3);
   $str = $matches[0];
-  $l = drupal_strlen($str);
-  // Passthrough short words
-  if ($l <= $min) {
+  $length = drupal_strlen($str);
+  // If the text is no longer than the minimum word size, don't tokenize it.
+  if ($length <= $min) {
     return ' ' . $str . ' ';
   }
   $tokens = ' ';
-  // FIFO queue of characters
+  // Build a FIFO queue of characters.
   $chars = array();
-  // Begin loop
-  for ($i = 0; $i < $l; ++$i) {
-    // Grab next character
+  for ($i = 0; $i < $length; $i++) {
+    // Add the next character off the beginning of the string to the queue.
     $current = drupal_substr($str, 0, 1);
     $str = substr($str, strlen($current));
     $chars[] = $current;
     if ($i >= $min - 1) {
+      // Make a token of $min characters, and add it to the token string.
       $tokens .= implode('', $chars) . ' ';
+      // Shift out the first character in the queue.
       array_shift($chars);
     }
   }
diff --git a/modules/search/search.test b/modules/search/search.test
index fe2062dff..1a78c9eb4 100644
--- a/modules/search/search.test
+++ b/modules/search/search.test
@@ -872,3 +872,153 @@ class SearchConfigSettingsForm extends DrupalWebTestCase {
   }
 }
 
+/**
+ * Test the CJK tokenizer.
+ */
+class SearchTokenizerTestCase extends DrupalWebTestCase {
+  public static function getInfo() {
+    return array(
+      'name' => 'CJK tokenizer',
+      'description' => 'Check that CJK tokenizer works as intended.',
+      'group' => 'Search',
+    );
+  }
+
+  function setUp() {
+    parent::setUp('search');
+  }
+
+  /**
+   * Verifies that strings of CJK characters are tokenized.
+   *
+   * The search_simplify() function does special things with numbers, symbols,
+   * and punctuation. So we only test that CJK characters that are not in these
+   * character classes are tokenized properly. See PREG_CLASS_CJK for more
+   * information.
+   */
+  function testTokenizer() {
+    // Set the minimum word size to 1 (to split all CJK characters) and make
+    // sure CJK tokenizing is turned on.
+    variable_set('minimum_word_size', 1);
+    variable_set('overlap_cjk', TRUE);
+    $this->refreshVariables();
+
+    // Create a string of CJK characters from various character ranges in
+    // the Unicode tables.
+
+    // Beginnings of the character ranges.
+    $starts = array(
+      'CJK unified' => 0x4e00,
+      'CJK Ext A' => 0x3400,
+      'CJK Compat' => 0xf900,
+      'Hangul Jamo' => 0x1100,
+      'Hangul Ext A' => 0xa960,
+      'Hangul Ext B' => 0xd7b0,
+      'Hangul Compat' => 0x3131,
+      'Half non-punct 1' => 0xff21,
+      'Half non-punct 2' => 0xff41,
+      'Half non-punct 3' => 0xff66,
+      'Hangul Syllables' => 0xac00,
+      'Hiragana' => 0x3040,
+      'Katakana' => 0x30a1,
+      'Katakana Ext' => 0x31f0,
+      'CJK Reserve 1' => 0x20000,
+      'CJK Reserve 2' => 0x30000,
+      'Bomofo' => 0x3100,
+      'Bomofo Ext' => 0x31a0,
+      'Lisu' => 0xa4d0,
+      'Yi' => 0xa000,
+    );
+
+    // Ends of the character ranges.
+    $ends = array(
+      'CJK unified' => 0x9fcf,
+      'CJK Ext A' => 0x4dbf,
+      'CJK Compat' => 0xfaff,
+      'Hangul Jamo' => 0x11ff,
+      'Hangul Ext A' => 0xa97f,
+      'Hangul Ext B' => 0xd7ff,
+      'Hangul Compat' => 0x318e,
+      'Half non-punct 1' => 0xff3a,
+      'Half non-punct 2' => 0xff5a,
+      'Half non-punct 3' => 0xffdc,
+      'Hangul Syllables' => 0xd7af,
+      'Hiragana' => 0x309f,
+      'Katakana' => 0x30ff,
+      'Katakana Ext' => 0x31ff,
+      'CJK Reserve 1' => 0x2fffd,
+      'CJK Reserve 2' => 0x3fffd,
+      'Bomofo' => 0x312f,
+      'Bomofo Ext' => 0x31b7,
+      'Lisu' => 0xa4fd,
+      'Yi' => 0xa48f,
+    );
+
+    // Generate characters consisting of starts, midpoints, and ends.
+    $chars = array();
+    $charcodes = array();
+    foreach ($starts as $key => $value) {
+      $charcodes[] = $starts[$key];
+      $chars[] = $this->code2utf($starts[$key]);
+      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
+      $charcodes[] = $mid;
+      $chars[] = $this->code2utf($mid);
+      $charcodes[] = $ends[$key];
+      $chars[] = $this->code2utf($ends[$key]);
+    }
+
+    // Merge into a string and tokenize.
+    $string = implode('', $chars);
+    $out = trim(search_simplify($string));
+    $expected = drupal_strtolower(implode(' ', $chars));
+
+    // Verify that the output matches what we expect.
+    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
+  }
+
+  /**
+   * Verifies that strings of non-CJK characters are not tokenized.
+   *
+   * This is just a sanity check - it verifies that strings of letters are
+   * not tokenized.
+   */
+  function testNoTokenizer() {
+    // Set the minimum word size to 1 (to split all CJK characters) and make
+    // sure CJK tokenizing is turned on.
+    variable_set('minimum_word_size', 1);
+    variable_set('overlap_cjk', TRUE);
+    $this->refreshVariables();
+
+    $letters = 'abcdefghijklmnopqrstuvwxyz';
+    $out = trim(search_simplify($letters));
+
+    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
+  }
+
+  /**
+   * Like PHP chr() function, but for Unicode characters.
+   *
+   * chr() only produces single bytes (values 0 to 255). This function converts
+   * a Unicode code point to the corresponding UTF-8 encoded character. Adapted
+   * from functions supplied in comments on several functions on php.net.
+   */
+  function code2utf($num) {
+    if ($num < 128) {
+      return chr($num);
+    }
+
+    if ($num < 2048) {
+      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
+    }
+
+    if ($num < 65536) {
+      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
+    }
+
+    if ($num < 2097152) {
+      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
+    }
+
+    return '';
+  }
+}
author	Dries Buytaert <dries@buytaert.net>	2010-05-09 19:46:11 +0000
committer	Dries Buytaert <dries@buytaert.net>	2010-05-09 19:46:11 +0000
commit	2ece32e6749bf2ecfefb1aa508bf4e61d2b08097 (patch)
tree	4d84fe521aa54111245f1bfb82d46fc745f4173c /modules/search
parent	daede057fdf1feb9ac363a694287e22a2797b425 (diff)
download	brdo-2ece32e6749bf2ecfefb1aa508bf4e61d2b08097.tar.gz brdo-2ece32e6749bf2ecfefb1aa508bf4e61d2b08097.tar.bz2