summaryrefslogtreecommitdiff
path: root/inc/fulltext.php
diff options
context:
space:
mode:
authorAndreas Gohr <andi@splitbrain.org>2005-09-25 19:54:51 +0200
committerAndreas Gohr <andi@splitbrain.org>2005-09-25 19:54:51 +0200
commit93a60ad223af9f649d62d2acbdffd22ce5ad1b97 (patch)
tree16e6bfbea21c90fa3fc625b0d3cc5ee4d14692bd /inc/fulltext.php
parent134f4ab222dddaf47588c908d0c2b81a3da2b76b (diff)
downloadrpg-93a60ad223af9f649d62d2acbdffd22ce5ad1b97.tar.gz
rpg-93a60ad223af9f649d62d2acbdffd22ce5ad1b97.tar.bz2
Asian language support for the indexer #563
Asian languages do not use spaces to separate words. The indexer however does a word based lookup. Splitting for example Japanese texts into real words is only possible with complicated natural language processing, something completely out of scope for DokuWiki. This patch solves the problem by treating all Asian characters as single words. When an Asian word (consisting of multiple characters) is searched it is treated as a phrase search, looking up each character by itself first, then checking for the phrase in found documents. darcs-hash:20050925175451-7ad00-933b33b51b5f2fa05e736c18b8db58a5fdbf41ce.gz
Diffstat (limited to 'inc/fulltext.php')
-rw-r--r--inc/fulltext.php11
1 files changed, 7 insertions, 4 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 96f3ad53c..89fa5b259 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -18,7 +18,6 @@
*/
function ft_pageSearch($query,&$poswords){
$q = ft_queryParser($query);
-
// use this for higlighting later:
$poswords = join(' ',$q['and']);
@@ -51,14 +50,12 @@ function ft_pageSearch($query,&$poswords){
}
if(!count($docs)) return array();
-
// handle phrases
if(count($q['phrases'])){
//build a regexp
$q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
$q['phrases'] = array_map('preg_quote',$q['phrases']);
$regex = '('.join('|',$q['phrases']).')';
-
// check the source of all documents for the exact phrases
foreach(array_keys($docs) as $id){
$text = utf8_strtolower(rawWiki($id));
@@ -237,6 +234,7 @@ function ft_resultCombine($args){
* Builds an array of search words from a query
*
* @todo support OR and parenthesises?
+ * @todo add namespace handling
*/
function ft_queryParser($query){
global $conf;
@@ -255,7 +253,7 @@ function ft_queryParser($query){
// handle phrase searches
while(preg_match('/"(.*?)"/',$query,$match)){
- $q['phrases'][] = $match[0];
+ $q['phrases'][] = $match[1];
$q['and'] = array_merge(idx_tokenizer($match[0],$stopwords));
$query = preg_replace('/"(.*?)"/','',$query,1);
}
@@ -266,6 +264,11 @@ function ft_queryParser($query){
$token = idx_tokenizer($w,$stopwords);
if(count($token)) $q['not'] = array_merge($q['not'],$token);
}else{
+ // asian "words" need to be searched as phrases
+ if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
+ $q['phrases'] = array_merge($q['phrases'],$matches[1]);
+
+ }
$token = idx_tokenizer($w,$stopwords);
if(count($token)) $q['and'] = array_merge($q['and'],$token);
}