diff options
author | Dries Buytaert <dries@buytaert.net> | 2010-05-09 19:46:11 +0000 |
---|---|---|
committer | Dries Buytaert <dries@buytaert.net> | 2010-05-09 19:46:11 +0000 |
commit | 2ece32e6749bf2ecfefb1aa508bf4e61d2b08097 (patch) | |
tree | 4d84fe521aa54111245f1bfb82d46fc745f4173c /modules/search | |
parent | daede057fdf1feb9ac363a694287e22a2797b425 (diff) | |
download | brdo-2ece32e6749bf2ecfefb1aa508bf4e61d2b08097.tar.gz brdo-2ece32e6749bf2ecfefb1aa508bf4e61d2b08097.tar.bz2 |
- Patch #326016 by jhodgdon: PREG_CLASS_CJK doesn't include all CJK characters.
Diffstat (limited to 'modules/search')
-rw-r--r-- | modules/search/search.module | 58 | ||||
-rw-r--r-- | modules/search/search.test | 150 |
2 files changed, 194 insertions, 14 deletions
diff --git a/modules/search/search.module b/modules/search/search.module index 7c94c0f95..4eeb670e2 100644 --- a/modules/search/search.module +++ b/modules/search/search.module @@ -103,12 +103,25 @@ define('PREG_CLASS_PUNCTUATION', '\x{ff65}'); /** - * Matches all CJK characters that are candidates for auto-splitting - * (Chinese, Japanese, Korean). - * Contains kana and BMP ideographs. + * Matches CJK (Chinese, Japanese, Korean) letter-like characters. + * + * This list is derived from the "East Asian Scripts" section of + * http://www.unicode.org/charts/index.html, as well as a comment on + * http://unicode.org/reports/tr11/tr11-11.html listing some character + * ranges that are reserved for additional CJK ideographs. + * + * The character ranges do not include numbers, punctuation, or symbols, since + * these are handled separately in search. Note that radicals and strokes are + * considered symbols. (See + * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt) + * + * @see search_expand_cjk() */ -define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}' . -'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}'); +define('PREG_CLASS_CJK', '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}' . + '\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}' . + '\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}' . + '\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}' . + '\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}'); /** * Implements hook_help(). @@ -447,28 +460,45 @@ function search_simplify($text) { } /** - * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping - * sequences of characters ('minimum_word_size' long). + * Splits CJK (Chinese, Japanese, Korean) text into tokens. + * + * The Search module matches exact words, where a word is defined to be a + * sequence of characters delimited by spaces or punctuation. 
CJK languages are + * written in long strings of characters, though, not split up into words. So + * in order to allow search matching, we split up CJK text into tokens + * consisting of consecutive, overlapping sequences of characters whose length + * is equal to the 'minimum_word_size' variable. This tokenizing is only done if + * the 'overlap_cjk' variable is TRUE. + * + * @param $matches + * This function is a callback for preg_replace_callback(), which is called + * from search_simplify(). So, $matches is an array of regular expression + * matches, which means that $matches[0] contains the matched text -- a string + * of CJK characters to tokenize. + * + * @return + * Tokenized text, starting and ending with a space character. */ function search_expand_cjk($matches) { $min = variable_get('minimum_word_size', 3); $str = $matches[0]; - $l = drupal_strlen($str); - // Passthrough short words - if ($l <= $min) { + $length = drupal_strlen($str); + // If the text is shorter than the minimum word size, don't tokenize it. + if ($length <= $min) { return ' ' . $str . ' '; } $tokens = ' '; - // FIFO queue of characters + // Build a FIFO queue of characters. $chars = array(); - // Begin loop - for ($i = 0; $i < $l; ++$i) { - // Grab next character + for ($i = 0; $i < $length; $i++) { + // Add the next character off the beginning of the string to the queue. $current = drupal_substr($str, 0, 1); $str = substr($str, strlen($current)); $chars[] = $current; if ($i >= $min - 1) { + // Make a token of $min characters, and add it to the token string. $tokens .= implode('', $chars) . ' '; + // Shift out the first character in the queue. array_shift($chars); } } diff --git a/modules/search/search.test b/modules/search/search.test index fe2062dff..1a78c9eb4 100644 --- a/modules/search/search.test +++ b/modules/search/search.test @@ -872,3 +872,153 @@ class SearchConfigSettingsForm extends DrupalWebTestCase { } } +/** + * Test the CJK tokenizer. 
+ */ +class SearchTokenizerTestCase extends DrupalWebTestCase { + public static function getInfo() { + return array( + 'name' => 'CJK tokenizer', + 'description' => 'Check that CJK tokenizer works as intended.', + 'group' => 'Search', + ); + } + + function setUp() { + parent::setUp('search'); + } + + /** + * Verifies that strings of CJK characters are tokenized. + * + * The search_simplify() function does special things with numbers, symbols, + * and punctuation. So we only test that CJK characters that are not in these + * character classes are tokenized properly. See PREG_CLASS_CJK for more + * information. + */ + function testTokenizer() { + // Set the minimum word size to 1 (to split all CJK characters) and make + // sure CJK tokenizing is turned on. + variable_set('minimum_word_size', 1); + variable_set('overlap_cjk', TRUE); + $this->refreshVariables(); + + // Create a string of CJK characters from various character ranges in + // the Unicode tables. + + // Beginnings of the character ranges. + $starts = array( + 'CJK unified' => 0x4e00, + 'CJK Ext A' => 0x3400, + 'CJK Compat' => 0xf900, + 'Hangul Jamo' => 0x1100, + 'Hangul Ext A' => 0xa960, + 'Hangul Ext B' => 0xd7b0, + 'Hangul Compat' => 0x3131, + 'Half non-punct 1' => 0xff21, + 'Half non-punct 2' => 0xff41, + 'Half non-punct 3' => 0xff66, + 'Hangul Syllables' => 0xac00, + 'Hiragana' => 0x3040, + 'Katakana' => 0x30a1, + 'Katakana Ext' => 0x31f0, + 'CJK Reserve 1' => 0x20000, + 'CJK Reserve 2' => 0x30000, + 'Bomofo' => 0x3100, + 'Bomofo Ext' => 0x31a0, + 'Lisu' => 0xa4d0, + 'Yi' => 0xa000, + ); + + // Ends of the character ranges. 
+ $ends = array( + 'CJK unified' => 0x9fcf, + 'CJK Ext A' => 0x4dbf, + 'CJK Compat' => 0xfaff, + 'Hangul Jamo' => 0x11ff, + 'Hangul Ext A' => 0xa97f, + 'Hangul Ext B' => 0xd7ff, + 'Hangul Compat' => 0x318e, + 'Half non-punct 1' => 0xff3a, + 'Half non-punct 2' => 0xff5a, + 'Half non-punct 3' => 0xffdc, + 'Hangul Syllables' => 0xd7af, + 'Hiragana' => 0x309f, + 'Katakana' => 0x30ff, + 'Katakana Ext' => 0x31ff, + 'CJK Reserve 1' => 0x2fffd, + 'CJK Reserve 2' => 0x3fffd, + 'Bomofo' => 0x312f, + 'Bomofo Ext' => 0x31b7, + 'Lisu' => 0xa4fd, + 'Yi' => 0xa48f, + ); + + // Generate characters consisting of starts, midpoints, and ends. + $chars = array(); + $charcodes = array(); + foreach ($starts as $key => $value) { + $charcodes[] = $starts[$key]; + $chars[] = $this->code2utf($starts[$key]); + $mid = round(0.5 * ($starts[$key] + $ends[$key])); + $charcodes[] = $mid; + $chars[] = $this->code2utf($mid); + $charcodes[] = $ends[$key]; + $chars[] = $this->code2utf($ends[$key]); + } + + // Merge into a string and tokenize. + $string = implode('', $chars); + $out = trim(search_simplify($string)); + $expected = drupal_strtolower(implode(' ', $chars)); + + // Verify that the output matches what we expect. + $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters'); + } + + /** + * Verifies that strings of non-CJK characters are not tokenized. + * + * This is just a sanity check - it verifies that strings of letters are + * not tokenized. + */ + function testNoTokenizer() { + // Set the minimum word size to 1 (to split all CJK characters) and make + // sure CJK tokenizing is turned on. + variable_set('minimum_word_size', 1); + variable_set('overlap_cjk', TRUE); + $this->refreshVariables(); + + $letters = 'abcdefghijklmnopqrstuvwxyz'; + $out = trim(search_simplify($letters)); + + $this->assertEqual($letters, $out, 'Letters are not CJK tokenized'); + } + + /** + * Like PHP chr() function, but for unicode characters. 
+ * + * chr() only produces a single byte, so it only covers character codes 0 + * through 255. This function converts a code point number to the + * corresponding UTF-8 encoded Unicode character. Adapted from + * functions supplied in comments on several functions on php.net. + */ + function code2utf($num) { + if ($num < 128) { + return chr($num); + } + + if ($num < 2048) { + return chr(($num >> 6) + 192) . chr(($num & 63) + 128); + } + + if ($num < 65536) { + return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128); + } + + if ($num < 2097152) { + return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128); + } + + return ''; + } +} |