summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- conf/dokuwiki.php | 2
-rw-r--r-- inc/indexer.php | 32
-rw-r--r-- lib/plugins/config/lang/en/lang.php | 2
-rw-r--r-- lib/plugins/config/settings/config.metadata.php | 2
4 files changed, 22 insertions, 16 deletions
diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php
index 2405494e0..f10c70e58 100644
--- a/conf/dokuwiki.php
+++ b/conf/dokuwiki.php
@@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abor
$conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard)
$conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation
$conf['rememberme'] = 1; //Enable/disable remember me on login
+$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing
+$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati';
//Set target to use when creating links - leave empty for same window
$conf['target']['wiki'] = '';
diff --git a/inc/indexer.php b/inc/indexer.php
index b3e10a548..1c955a99d 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -662,6 +662,7 @@ function idx_parseIndexLine(&$page_idx,$line){
* @author Andreas Gohr <andi@splitbrain.org>
*/
function idx_tokenizer($string,&$stopwords,$wc=false){
+ global $conf;
$words = array();
$wc = ($wc) ? '' : $wc = '\*';
@@ -670,6 +671,16 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
else
$sw =& $stopwords;
+ if ($conf['external_tokenizer']) {
+ if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output))
+ $string = $output;
+ } else {
+ if(preg_match('/[^0-9A-Za-z ]/u', $string)) {
+ // handle asian chars as single words (may fail on older PHP version)
+ $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
+ if(!is_null($asia)) $string = $asia; //recover from regexp failure
+ }
+ }
$string = strtr($string, "\r\n\t", ' ');
if(preg_match('/[^0-9A-Za-z ]/u', $string))
$string = utf8_stripspecials($string, ' ', '\._\-:'.$wc);
@@ -677,24 +688,13 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
$wordlist = explode(' ', $string);
foreach ($wordlist as $word) {
if(preg_match('/[^0-9A-Za-z]/u', $word)){
- // handle asian chars as single words (may fail on older PHP version)
- $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word);
- if(!is_null($asia)) $word = $asia; //recover from regexp failure
-
- $arr = explode(' ', $word);
- foreach ($arr as $w) {
- if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
- $w = utf8_strtolower($w);
- if(is_int(array_search("$w\n",$stopwords))) continue;
- $words[] = $w;
- }
+ $word = utf8_strtolower($word);
}else{
- $w = $word;
- if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
- $w = strtolower($w);
- if(is_int(array_search("$w\n",$stopwords))) continue;
- $words[] = $w;
+ $word = strtolower($word);
}
+ if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
+ if(is_int(array_search("$word\n",$stopwords))) continue;
+ $words[] = $word;
}
return $words;
diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php
index a944d6bd7..85214bf98 100644
--- a/lib/plugins/config/lang/en/lang.php
+++ b/lib/plugins/config/lang/en/lang.php
@@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output';
$lang['renderer__core'] = '%s (dokuwiki core)';
$lang['renderer__plugin'] = '%s (plugin)';
$lang['rememberme'] = 'Allow permanent login cookies (remember me)';
+$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing';
+$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer';
$lang['rss_type'] = 'XML feed type';
$lang['rss_linkto'] = 'XML feed links to';
diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php
index edba65262..331da5ab8 100644
--- a/lib/plugins/config/settings/config.metadata.php
+++ b/lib/plugins/config/settings/config.metadata.php
@@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff');
$meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3));
$meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml'));
$meta['readdircache'] = array('numeric');
+$meta['external_tokenizer'] = array('onoff');
+$meta['tokenizer_cmd'] = array('string');
$meta['_network'] = array('fieldset');
$meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i');