summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom N Harris <tnharris@whoopdedo.org>2010-11-16 18:09:53 -0500
committerTom N Harris <tnharris@whoopdedo.org>2010-11-16 18:09:53 -0500
commit1c07b9e622d139fa815c955c89569f96342475fb (patch)
tree08b1d84b5d1fa7c3b1b22c89a9be6efd3e543704
parent6c528220aaf62f4ba5890483797d6661352500bb (diff)
downloadrpg-1c07b9e622d139fa815c955c89569f96342475fb.tar.gz
rpg-1c07b9e622d139fa815c955c89569f96342475fb.tar.bz2
Use external program to split pages into words
An external tokenizer inserts extra spaces to mark word boundaries in the input text. The text is sent to the program through its STDIN and the result is read back from its STDOUT. A good choice for Chinese and Japanese is MeCab (http://sourceforge.net/projects/mecab/), using the command line 'mecab -O wakati'.
-rw-r--r--conf/dokuwiki.php2
-rw-r--r--inc/indexer.php32
-rw-r--r--lib/plugins/config/lang/en/lang.php2
-rw-r--r--lib/plugins/config/settings/config.metadata.php2
4 files changed, 22 insertions, 16 deletions
diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php
index 2405494e0..f10c70e58 100644
--- a/conf/dokuwiki.php
+++ b/conf/dokuwiki.php
@@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abor
$conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard)
$conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation
$conf['rememberme'] = 1; //Enable/disable remember me on login
+$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing
+$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati';
//Set target to use when creating links - leave empty for same window
$conf['target']['wiki'] = '';
diff --git a/inc/indexer.php b/inc/indexer.php
index b3e10a548..1c955a99d 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -662,6 +662,7 @@ function idx_parseIndexLine(&$page_idx,$line){
* @author Andreas Gohr <andi@splitbrain.org>
*/
function idx_tokenizer($string,&$stopwords,$wc=false){
+ global $conf;
$words = array();
$wc = ($wc) ? '' : $wc = '\*';
@@ -670,6 +671,16 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
else
$sw =& $stopwords;
+ if ($conf['external_tokenizer']) {
+ if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output))
+ $string = $output;
+ } else {
+ if(preg_match('/[^0-9A-Za-z ]/u', $string)) {
+ // handle asian chars as single words (may fail on older PHP version)
+ $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
+ if(!is_null($asia)) $string = $asia; //recover from regexp failure
+ }
+ }
$string = strtr($string, "\r\n\t", ' ');
if(preg_match('/[^0-9A-Za-z ]/u', $string))
$string = utf8_stripspecials($string, ' ', '\._\-:'.$wc);
@@ -677,24 +688,13 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
$wordlist = explode(' ', $string);
foreach ($wordlist as $word) {
if(preg_match('/[^0-9A-Za-z]/u', $word)){
- // handle asian chars as single words (may fail on older PHP version)
- $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word);
- if(!is_null($asia)) $word = $asia; //recover from regexp failure
-
- $arr = explode(' ', $word);
- foreach ($arr as $w) {
- if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
- $w = utf8_strtolower($w);
- if(is_int(array_search("$w\n",$stopwords))) continue;
- $words[] = $w;
- }
+ $word = utf8_strtolower($word);
}else{
- $w = $word;
- if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
- $w = strtolower($w);
- if(is_int(array_search("$w\n",$stopwords))) continue;
- $words[] = $w;
+ $word = strtolower($word);
}
+ if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
+ if(is_int(array_search("$word\n",$stopwords))) continue;
+ $words[] = $word;
}
return $words;
diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php
index a944d6bd7..85214bf98 100644
--- a/lib/plugins/config/lang/en/lang.php
+++ b/lib/plugins/config/lang/en/lang.php
@@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output';
$lang['renderer__core'] = '%s (dokuwiki core)';
$lang['renderer__plugin'] = '%s (plugin)';
$lang['rememberme'] = 'Allow permanent login cookies (remember me)';
+$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing';
+$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer';
$lang['rss_type'] = 'XML feed type';
$lang['rss_linkto'] = 'XML feed links to';
diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php
index edba65262..331da5ab8 100644
--- a/lib/plugins/config/settings/config.metadata.php
+++ b/lib/plugins/config/settings/config.metadata.php
@@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff');
$meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3));
$meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml'));
$meta['readdircache'] = array('numeric');
+$meta['external_tokenizer'] = array('onoff');
+$meta['tokenizer_cmd'] = array('string');
$meta['_network'] = array('fieldset');
$meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i');