From 93a60ad223af9f649d62d2acbdffd22ce5ad1b97 Mon Sep 17 00:00:00 2001
From: Andreas Gohr
Date: Sun, 25 Sep 2005 19:54:51 +0200
Subject: Asian language support for the indexer #563

Asian languages do not use spaces to separate words. The indexer, however,
does a word-based lookup. Splitting, for example, Japanese text into real
words is only possible with complicated natural language processing,
something completely out of scope for DokuWiki.

This patch solves the problem by treating every Asian character as a single
word. When an Asian word (consisting of multiple characters) is searched,
it is treated as a phrase search: each character is looked up by itself
first, then the phrase is checked for in the documents found.

darcs-hash:20050925175451-7ad00-933b33b51b5f2fa05e736c18b8db58a5fdbf41ce.gz
---
 inc/fulltext.php | 11 +++++++----
 inc/indexer.php  | 19 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/inc/fulltext.php b/inc/fulltext.php
index 96f3ad53c..89fa5b259 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -18,7 +18,6 @@
  */
 function ft_pageSearch($query,&$poswords){
     $q = ft_queryParser($query);
-
     // use this for higlighting later:
     $poswords = join(' ',$q['and']);
 
@@ -51,14 +50,12 @@ function ft_pageSearch($query,&$poswords){
     }
     if(!count($docs)) return array();
-
     // handle phrases
     if(count($q['phrases'])){
         //build a regexp
         $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
         $q['phrases'] = array_map('preg_quote',$q['phrases']);
         $regex = '('.join('|',$q['phrases']).')';
-
         // check the source of all documents for the exact phrases
         foreach(array_keys($docs) as $id){
             $text = utf8_strtolower(rawWiki($id));
@@ -237,6 +234,7 @@ function ft_resultCombine($args){
  * Builds an array of search words from a query
  *
  * @todo support OR and parenthesises?
+ * @todo add namespace handling
  */
 function ft_queryParser($query){
     global $conf;
@@ -255,7 +253,7 @@ function ft_queryParser($query){
 
     // handle phrase searches
     while(preg_match('/"(.*?)"/',$query,$match)){
-        $q['phrases'][] = $match[0];
+        $q['phrases'][] = $match[1];
         $q['and'] = array_merge(idx_tokenizer($match[0],$stopwords));
         $query = preg_replace('/"(.*?)"/','',$query,1);
     }
@@ -266,6 +264,11 @@ function ft_queryParser($query){
             $token = idx_tokenizer($w,$stopwords);
             if(count($token)) $q['not'] = array_merge($q['not'],$token);
         }else{
+            // asian "words" need to be searched as phrases
+            if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
+                $q['phrases'] = array_merge($q['phrases'],$matches[1]);
+
+            }
             $token = idx_tokenizer($w,$stopwords);
             if(count($token)) $q['and'] = array_merge($q['and'],$token);
         }
diff --git a/inc/indexer.php b/inc/indexer.php
index 6ece84d7b..a8511b1ee 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -12,6 +12,19 @@
 require_once(DOKU_INC.'inc/utf8.php');
 require_once(DOKU_INC.'inc/parserutils.php');
 
+// Asian characters are handled as words. The following regexp defines the
+// Unicode-Ranges for Asian characters
+// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
+// I'm no language expert. If you think some ranges are wrongly chosen or
+// a range is missing, please contact me
+define(IDX_ASIAN,'['.
+       '\x{0E00}-\x{0E7F}'.  // Thai
+       '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
+       '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
+       '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
+       ']');
+
+
 /**
  * Split a page into words
  *
@@ -37,9 +50,10 @@ function idx_getPageWords($page){
 
     $words = array();
     foreach ($tokens as $word => $count) {
-
         // simple filter to restrict use of utf8_stripspecials
         if (preg_match('/[^0-9A-Za-z]/u', $word)) {
+            // handle asian chars as single words
+            $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
             $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
             $arr = array_count_values($arr);
 
@@ -312,6 +326,9 @@ function idx_tokenizer($string,&$stopwords){
     $words = array();
 
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
+        #handle asian chars as single words
+        $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+
         $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
         foreach ($arr as $w) {
             if (!is_numeric($w) && strlen($w) < 3) continue;
-- 
cgit v1.2.3
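
To make the approach concrete, here is a small standalone PHP sketch (not part
of the patch; the sample strings, variable names and output comments are
illustrative only) of how the IDX_ASIAN character class is used on both the
indexing side and the query side: every Asian character becomes its own
"word", and a run of Asian characters in a query is additionally kept as a
phrase for the exact-match pass described above.

<?php
// Character class as defined in the patch (constant name quoted here so the
// sketch runs on its own, outside DokuWiki).
define('IDX_ASIAN', '['.
       '\x{0E00}-\x{0E7F}'.  // Thai
       '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
       '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
       '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
       ']');

// Indexing side: insert a space after every Asian character so the normal
// word-based tokenizer sees each character as a separate word.
$text = 'DokuWikiは素晴らしい';
echo preg_replace('/('.IDX_ASIAN.')/u', '\1 ', $text), "\n";
// -> "DokuWikiは 素 晴 ら し い "

// Query side: a run of Asian characters is also recorded as a phrase, so the
// search looks up the single characters first and then verifies the exact
// sequence in the pages it found.
$query = '素晴らしい wiki';
if (preg_match_all('/('.IDX_ASIAN.'+)/u', $query, $matches)) {
    print_r($matches[1]);   // -> array containing '素晴らしい'
}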