summaryrefslogtreecommitdiff
path: root/inc/indexer.php
diff options
context:
space:
mode:
Diffstat (limited to 'inc/indexer.php')
-rw-r--r--inc/indexer.php19
1 files changed, 18 insertions, 1 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index 6ece84d7b..a8511b1ee 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -12,6 +12,19 @@
require_once(DOKU_INC.'inc/utf8.php');
require_once(DOKU_INC.'inc/parserutils.php');
+// Asian characters are handled as words. The following regexp character
+// class defines the Unicode ranges that are treated as Asian characters.
+// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
+// I'm no language expert. If you think some ranges are wrongly chosen or
+// a range is missing, please contact me.
+define('IDX_ASIAN','['. // name quoted: a bare IDX_ASIAN is an undefined constant here
+ '\x{0E00}-\x{0E7F}'. // Thai
+ '\x{2E80}-\x{D7AF}'. // CJK -> Hangul
+ '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs
+ '\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms
+ ']');
+
+
/**
* Split a page into words
*
@@ -37,9 +50,10 @@ function idx_getPageWords($page){
$words = array();
foreach ($tokens as $word => $count) {
-
// simple filter to restrict use of utf8_stripspecials
if (preg_match('/[^0-9A-Za-z]/u', $word)) {
+ // handle asian chars as single words
+ $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
$arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
$arr = array_count_values($arr);
@@ -312,6 +326,9 @@ function idx_tokenizer($string,&$stopwords){
$words = array();
if(preg_match('/[^0-9A-Za-z]/u', $string)){
+ #handle asian chars as single words
+ $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+
$arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
foreach ($arr as $w) {
if (!is_numeric($w) && strlen($w) < 3) continue;