2 files changed, 25 insertions, 5 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 96f3ad53c..89fa5b259 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -18,7 +18,6 @@
  */
 function ft_pageSearch($query,&$poswords){
     $q = ft_queryParser($query);
-
     // use this for higlighting later:
     $poswords = join(' ',$q['and']);
 
@@ -51,14 +50,12 @@ function ft_pageSearch($query,&$poswords){
     }
 
     if(!count($docs)) return array();
-
     // handle phrases
     if(count($q['phrases'])){
         //build a regexp
         $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
         $q['phrases'] = array_map('preg_quote',$q['phrases']);
         $regex = '('.join('|',$q['phrases']).')';
-
         // check the source of all documents for the exact phrases
         foreach(array_keys($docs) as $id){
             $text  = utf8_strtolower(rawWiki($id));
@@ -237,6 +234,7 @@ function ft_resultCombine($args){
  * Builds an array of search words from a query
  *
  * @todo support OR and parenthesises?
+ * @todo add namespace handling
  */
 function ft_queryParser($query){
     global $conf;
@@ -255,7 +253,7 @@ function ft_queryParser($query){
     
     // handle phrase searches
     while(preg_match('/"(.*?)"/',$query,$match)){
-        $q['phrases'][] = $match[0];
+        $q['phrases'][] = $match[1];
         $q['and'] = array_merge(idx_tokenizer($match[0],$stopwords));
         $query = preg_replace('/"(.*?)"/','',$query,1);
     }
@@ -266,6 +264,11 @@ function ft_queryParser($query){
             $token = idx_tokenizer($w,$stopwords);
             if(count($token)) $q['not'] = array_merge($q['not'],$token);
         }else{
+            // asian "words" need to be searched as phrases
+            if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
+                $q['phrases'] = array_merge($q['phrases'],$matches[1]);
+
+            }
             $token = idx_tokenizer($w,$stopwords);
             if(count($token)) $q['and'] = array_merge($q['and'],$token);
         }
diff --git a/inc/indexer.php b/inc/indexer.php
index 6ece84d7b..a8511b1ee 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -12,6 +12,19 @@
   require_once(DOKU_INC.'inc/utf8.php');
   require_once(DOKU_INC.'inc/parserutils.php');
 
+// Asian characters are handled as words. The following regexp defines the
+// Unicode ranges for Asian characters (a character class for /u patterns).
+// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
+// I'm no language expert. If you think some ranges are wrongly chosen or
+// a range is missing, please contact me.
+define('IDX_ASIAN','['.
+                 '\x{0E00}-\x{0E7F}'.  // Thai
+                 '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
+                 '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
+                 '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
+                 ']');
+
+
 /**
  * Split a page into words
  *
@@ -37,9 +50,10 @@ function idx_getPageWords($page){
 
     $words = array();
     foreach ($tokens as $word => $count) {
-
         // simple filter to restrict use of utf8_stripspecials 
         if (preg_match('/[^0-9A-Za-z]/u', $word)) {
+            // handle asian chars as single words
+            $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
             $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
             $arr = array_count_values($arr);
             
@@ -312,6 +326,9 @@ function idx_tokenizer($string,&$stopwords){
     $words = array();
 
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
+        #handle asian chars as single words
+        $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+
         $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
         foreach ($arr as $w) {
             if (!is_numeric($w) && strlen($w) < 3) continue;