diff options
-rw-r--r-- | inc/fulltext.php | 11 |
-rw-r--r-- | inc/indexer.php | 19 |
2 files changed, 25 insertions, 5 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index 96f3ad53c..89fa5b259 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -18,7 +18,6 @@ */ function ft_pageSearch($query,&$poswords){ $q = ft_queryParser($query); - // use this for higlighting later: $poswords = join(' ',$q['and']); @@ -51,14 +50,12 @@ function ft_pageSearch($query,&$poswords){ } if(!count($docs)) return array(); - // handle phrases if(count($q['phrases'])){ //build a regexp $q['phrases'] = array_map('utf8_strtolower',$q['phrases']); $q['phrases'] = array_map('preg_quote',$q['phrases']); $regex = '('.join('|',$q['phrases']).')'; - // check the source of all documents for the exact phrases foreach(array_keys($docs) as $id){ $text = utf8_strtolower(rawWiki($id)); @@ -237,6 +234,7 @@ function ft_resultCombine($args){ * Builds an array of search words from a query * * @todo support OR and parenthesises? + * @todo add namespace handling */ function ft_queryParser($query){ global $conf; @@ -255,7 +253,7 @@ function ft_queryParser($query){ // handle phrase searches while(preg_match('/"(.*?)"/',$query,$match)){ - $q['phrases'][] = $match[0]; + $q['phrases'][] = $match[1]; $q['and'] = array_merge(idx_tokenizer($match[0],$stopwords)); $query = preg_replace('/"(.*?)"/','',$query,1); } @@ -266,6 +264,11 @@ function ft_queryParser($query){ $token = idx_tokenizer($w,$stopwords); if(count($token)) $q['not'] = array_merge($q['not'],$token); }else{ + // asian "words" need to be searched as phrases + if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){ + $q['phrases'] = array_merge($q['phrases'],$matches[1]); + + } $token = idx_tokenizer($w,$stopwords); if(count($token)) $q['and'] = array_merge($q['and'],$token); } diff --git a/inc/indexer.php b/inc/indexer.php index 6ece84d7b..a8511b1ee 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -12,6 +12,19 @@ require_once(DOKU_INC.'inc/utf8.php'); require_once(DOKU_INC.'inc/parserutils.php'); +// Asian characters are handled as words. 
The following regexp defines the +// Unicode-Ranges for Asian characters +// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block +// I'm no language expert. If you think some ranges are wrongly chosen or +// a range is missing, please contact me +define(IDX_ASIAN,'['. + '\x{0E00}-\x{0E7F}'. // Thai + '\x{2E80}-\x{D7AF}'. // CJK -> Hangul + '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs + '\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms + ']'); + + /** * Split a page into words * @@ -37,9 +50,10 @@ function idx_getPageWords($page){ $words = array(); foreach ($tokens as $word => $count) { - // simple filter to restrict use of utf8_stripspecials if (preg_match('/[^0-9A-Za-z]/u', $word)) { + // handle asian chars as single words + $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word); $arr = explode(' ', utf8_stripspecials($word,' ','._\-:')); $arr = array_count_values($arr); @@ -312,6 +326,9 @@ function idx_tokenizer($string,&$stopwords){ $words = array(); if(preg_match('/[^0-9A-Za-z]/u', $string)){ + #handle asian chars as single words + $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string); + $arr = explode(' ', utf8_stripspecials($string,' ','._\-:')); foreach ($arr as $w) { if (!is_numeric($w) && strlen($w) < 3) continue; |