From 3a1a171b951828395a7578475e86e622f9a7205c Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:17:52 -0500 Subject: Remove unused idx_touchIndex function --- inc/indexer.php | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index f5330040a..7a8bb3ff8 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -103,22 +103,6 @@ function idx_getIndex($pre, $wlen){ return file($fn); } -/** - * Create an empty index file if it doesn't exist yet. - * - * FIXME: This function isn't currently used. It will probably be removed soon. - * - * @author Tom N Harris - */ -function idx_touchIndex($pre, $wlen){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; - if(!@file_exists($fn)){ - touch($fn); - if($conf['fperm']) chmod($fn, $conf['fperm']); - } -} - /** * Read a line ending with \n. * Returns false on EOF. -- cgit v1.2.3 From ee0891d8ffd7e4a59c958b9546a3b8382e4e5991 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:18:51 -0500 Subject: Do not assume that index files will be backward compatible --- lib/exe/indexer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 3fa81715b..4a6f74ba4 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -140,7 +140,7 @@ function runIndexer(){ // check if indexing needed $idxtag = metaFN($ID,'.indexed'); if(@file_exists($idxtag)){ - if(io_readFile($idxtag) >= INDEXER_VERSION){ + if(trim(io_readFile($idxtag)) == INDEXER_VERSION){ $last = @filemtime($idxtag); if($last > @filemtime(wikiFN($ID))){ print "runIndexer(): index for $ID up to date".NL; -- cgit v1.2.3 From 4b9792c696658fe0cbedc187198fa463b6ff83fc Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:22:08 -0500 Subject: Measure length of multi-character Asian words --- inc/indexer.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 7a8bb3ff8..d9eccac76 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -52,8 +52,10 @@ function wordlen($w){ $l = strlen($w); // If left alone, all chinese "words" will get put into w3.idx // So the "length" of a "word" is faked - if(preg_match('/'.IDX_ASIAN2.'/u',$w)) - $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF + if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) { + foreach($leadbytes[0] as $b) + $l += ord($b) - 0xE1; + } return $l; } -- cgit v1.2.3 From 4e1bf408de9297d5773cd8bfe1af997c83eab1a2 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:32:23 -0500 Subject: Refactor tokenizer to avoid splitting multiple times --- inc/indexer.php | 69 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index d9eccac76..56d80b7fa 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -203,8 +203,7 @@ function idx_getPageWords($page){ list($page,$body) = $data; - $body = strtr($body, "\r\n\t", ' '); - $tokens = explode(' ', $body); + $tokens = idx_tokenizer($body, $stopwords); $tokens = array_count_values($tokens); // count the frequency of each token // ensure the deaccented or romanised page names of internal links are added to the token array @@ -225,16 +224,12 @@ function idx_getPageWords($page){ } $words = array(); - foreach ($tokens as $word => $count) { - $arr = idx_tokenizer($word,$stopwords); - $arr = array_count_values($arr); - foreach ($arr as $w => $c) { - $l = wordlen($w); - if(isset($words[$l])){ - $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0); - }else{ - $words[$l] = array($w => $c * $count); - } + foreach ($tokens as $w => $c) { + $l = wordlen($w); + if(isset($words[$l])){ + $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0); + }else{ + $words[$l] = array($w => $c); } } @@ -655,33 +650,51 @@ function idx_parseIndexLine(&$page_idx,$line){ * Tokenizes a string into an array of search words * * Uses the same algorithm as idx_getPageWords() + * Takes an arbitrarily complex string and returns a list of words + * suitable for indexing. The string may include spaces and line + * breaks * * @param string $string the query as given by the user * @param arrayref $stopwords array of stopwords * @param boolean $wc are wildcards allowed? + * @return array list of indexable words + * @author Tom N Harris + * @author Andreas Gohr */ function idx_tokenizer($string,&$stopwords,$wc=false){ $words = array(); $wc = ($wc) ? '' : $wc = '\*'; - if(preg_match('/[^0-9A-Za-z]/u', $string)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); - if(!is_null($asia)) $string = $asia; //recover from regexp failure - - $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc)); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if($stopwords && is_int(array_search("$w\n",$stopwords))) continue; + if (!$stopwords) + $sw = array(); + else + $sw =& $stopwords; + + $string = strtr($string, "\r\n\t", ' '); + if(preg_match('/[^0-9A-Za-z ]/u', $string)) + $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); + + $wordlist = explode(' ', $string); + foreach ($wordlist as $word) { + if(preg_match('/[^0-9A-Za-z]/u', $word)){ + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word); + if(!is_null($asia)) $word = $asia; //recover from regexp failure + + $arr = explode(' ', $word); + foreach ($arr as $w) { + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; + $w = utf8_strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) continue; + $words[] = $w; + } + }else{ + $w = $word; + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; + $w = strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) return $words; $words[] = $w; } - }else{ - $w = $string; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; - $words[] = $w; } return $words; -- cgit v1.2.3 From 5bcab0c47360e5b31237885cff4583e0eba479f8 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 15 Nov 2010 15:48:31 -0500 Subject: tokenizer was returning prematurely --- inc/indexer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 56d80b7fa..b3e10a548 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -690,9 +690,9 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ } }else{ $w = $word; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; + if(is_int(array_search("$w\n",$stopwords))) continue; $words[] = $w; } } -- cgit v1.2.3 From 6c528220aaf62f4ba5890483797d6661352500bb Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 16 Nov 2010 17:58:28 -0500 Subject: Repurpose io_runcmd for pipes --- inc/io.php | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/inc/io.php b/inc/io.php index 1d69dabc9..9b797ebf2 100644 --- a/inc/io.php +++ b/inc/io.php @@ -533,17 +533,20 @@ function io_rename($from,$to){ * * @author Harry Brueckner * @author Andreas Gohr - * @deprecated */ -function io_runcmd($cmd){ - $fh = popen($cmd, "r"); - if(!$fh) return false; - $ret = ''; - while (!feof($fh)) { - $ret .= fread($fh, 8192); - } - pclose($fh); - return $ret; +function io_runcmd($cmd, $input, &$output){ + $descspec = array( + 0=>array("pipe","r"), + 1=>array("pipe","w"), + 2=>array("pipe","w")); + $ph = proc_open($cmd, $descspec, $pipes); + if(!$ph) return -1; + fclose($pipes[2]); // ignore stderr + fwrite($pipes[0], $input); + fclose($pipes[0]); + $output = stream_get_contents($pipes[1]); + fclose($pipes[1]); + return proc_close($ph); } /** -- cgit v1.2.3 From 1c07b9e622d139fa815c955c89569f96342475fb Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 16 Nov 2010 18:09:53 -0500 Subject: Use external program to split pages into words An external tokenizer inserts extra spaces to mark words in the input text. The text is sent through STDIN and STDOUT file handles. A good choice for Chinese and Japanese is MeCab. http://sourceforge.net/projects/mecab/ With the command line 'mecab -O wakati' --- conf/dokuwiki.php | 2 ++ inc/indexer.php | 32 ++++++++++++------------- lib/plugins/config/lang/en/lang.php | 2 ++ lib/plugins/config/settings/config.metadata.php | 2 ++ 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php index 2405494e0..f10c70e58 100644 --- a/conf/dokuwiki.php +++ b/conf/dokuwiki.php @@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abor $conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard) $conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation $conf['rememberme'] = 1; //Enable/disable remember me on login +$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing +$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati'; //Set target to use when creating links - leave empty for same window $conf['target']['wiki'] = ''; diff --git a/inc/indexer.php b/inc/indexer.php index b3e10a548..1c955a99d 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -662,6 +662,7 @@ function idx_parseIndexLine(&$page_idx,$line){ * @author Andreas Gohr */ function idx_tokenizer($string,&$stopwords,$wc=false){ + global $conf; $words = array(); $wc = ($wc) ? '' : $wc = '\*'; @@ -670,6 +671,16 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ else $sw =& $stopwords; + if ($conf['external_tokenizer']) { + if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output)) + $string = $output; + } else { + if(preg_match('/[^0-9A-Za-z ]/u', $string)) { + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); + if(!is_null($asia)) $string = $asia; //recover from regexp failure + } + } $string = strtr($string, "\r\n\t", ' '); if(preg_match('/[^0-9A-Za-z ]/u', $string)) $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); @@ -677,24 +688,13 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ $wordlist = explode(' ', $string); foreach ($wordlist as $word) { if(preg_match('/[^0-9A-Za-z]/u', $word)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word); - if(!is_null($asia)) $word = $asia; //recover from regexp failure - - $arr = explode(' ', $word); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; - } + $word = utf8_strtolower($word); }else{ - $w = $word; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; + $word = strtolower($word); } + if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; + if(is_int(array_search("$word\n",$stopwords))) continue; + $words[] = $word; } return $words; diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php index a944d6bd7..85214bf98 100644 --- a/lib/plugins/config/lang/en/lang.php +++ b/lib/plugins/config/lang/en/lang.php @@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output'; $lang['renderer__core'] = '%s (dokuwiki core)'; $lang['renderer__plugin'] = '%s (plugin)'; $lang['rememberme'] = 'Allow permanent login cookies (remember me)'; +$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing'; +$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer'; $lang['rss_type'] = 'XML feed type'; $lang['rss_linkto'] = 'XML feed links to'; diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php index edba65262..331da5ab8 100644 --- a/lib/plugins/config/settings/config.metadata.php +++ b/lib/plugins/config/settings/config.metadata.php @@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff'); $meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3)); $meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml')); $meta['readdircache'] = array('numeric'); +$meta['external_tokenizer'] = array('onoff'); +$meta['tokenizer_cmd'] = array('string'); $meta['_network'] = array('fieldset'); $meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i'); -- cgit v1.2.3 From 7c2ef4e8d524fb9262c5a08831220f9fb2dc11fe Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 17 Nov 2010 17:02:31 -0500 Subject: Use a different indexer version when external tokenizer is enabled --- bin/indexer.php | 8 ++------ inc/indexer.php | 17 +++++++++++++++++ lib/exe/indexer.php | 7 ++----- lib/exe/xmlrpc.php | 4 ++-- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/bin/indexer.php b/bin/indexer.php index 48e98b571..497c6146a 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -13,10 +13,6 @@ require_once(DOKU_INC.'inc/auth.php'); require_once(DOKU_INC.'inc/cliopts.php'); session_write_close(); -// Version tag used to force rebuild on upgrade -// Need to keep in sync with lib/exe/indexer.php -if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2); - // handle options $short_opts = 'hcuq'; $long_opts = array('help', 'clear', 'update', 'quiet'); @@ -88,7 +84,7 @@ function _index($id){ if(!$CLEAR){ $idxtag = metaFN($id,'.indexed'); if(@file_exists($idxtag)){ - if(io_readFile($idxtag) >= INDEXER_VERSION){ + if(io_readFile($idxtag) == idx_get_version()){ $last = @filemtime(metaFN($id,'.indexed')); if($last > @filemtime(wikiFN($id))) return; } @@ -98,7 +94,7 @@ function _index($id){ _lock(); _quietecho("$id... "); idx_addPage($id); - io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION); + io_saveFile(metaFN($id,'.indexed'), idx_get_version()); _quietecho("done.\n"); _unlock(); } diff --git a/inc/indexer.php b/inc/indexer.php index 1c955a99d..4914c9fc6 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -8,6 +8,9 @@ if(!defined('DOKU_INC')) die('meh.'); +// Version tag used to force rebuild on upgrade +define('INDEXER_VERSION', 2); + // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens) if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2); @@ -42,6 +45,20 @@ define('IDX_ASIAN3','['. // Hiragana/Katakana (can be two charact ']?'); define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')'); +/** + * Version of the indexer taking into consideration the external tokenizer. + * The indexer is only compatible with data written by the same version. + * + * @author Tom N Harris + */ +function idx_get_version(){ + global $conf; + if($conf['external_tokenizer']) + return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); + else + return INDEXER_VERSION; +} + /** * Measure the length of a string. * Differs from strlen in handling of asian characters. diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 4a6f74ba4..55d860296 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -11,9 +11,6 @@ require_once(DOKU_INC.'inc/init.php'); session_write_close(); //close session if(!defined('NL')) define('NL',"\n"); -// Version tag used to force rebuild on upgrade -define('INDEXER_VERSION', 2); - // keep running after browser closes connection @ignore_user_abort(true); @@ -140,7 +137,7 @@ function runIndexer(){ // check if indexing needed $idxtag = metaFN($ID,'.indexed'); if(@file_exists($idxtag)){ - if(trim(io_readFile($idxtag)) == INDEXER_VERSION){ + if(trim(io_readFile($idxtag)) == idx_get_version()){ $last = @filemtime($idxtag); if($last > @filemtime(wikiFN($ID))){ print "runIndexer(): index for $ID up to date".NL; @@ -168,7 +165,7 @@ function runIndexer(){ idx_addPage($ID); // we're finished - save and free lock - io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION); + io_saveFile(metaFN($ID,'.indexed'), idx_get_version()); @rmdir($lock); print "runIndexer(): finished".NL; return true; diff --git a/lib/exe/xmlrpc.php b/lib/exe/xmlrpc.php index f06792361..410d4f6ba 100644 --- a/lib/exe/xmlrpc.php +++ b/lib/exe/xmlrpc.php @@ -1,7 +1,7 @@ Date: Thu, 18 Nov 2010 13:55:55 -0500 Subject: Restore io_runcmd, use io_exec for exec with pipes --- inc/indexer.php | 4 ++-- inc/io.php | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 4914c9fc6..32fbf4a1a 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -484,7 +484,7 @@ function idx_indexLengths(&$filter){ } else { $lengths = idx_listIndexLengths(); foreach ( $lengths as $key => $length) { - // we keep all the values equal or superior + // we keep all the values equal or superior if ((int)$length >= (int)$filter) { $idx[] = $length; } @@ -689,7 +689,7 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ $sw =& $stopwords; if ($conf['external_tokenizer']) { - if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output)) + if (0 == io_exec($conf['tokenizer_cmd'], $string, $output)) $string = $output; } else { if(preg_match('/[^0-9A-Za-z ]/u', $string)) { diff --git a/inc/io.php b/inc/io.php index 9b797ebf2..a0be00da3 100644 --- a/inc/io.php +++ b/inc/io.php @@ -529,12 +529,30 @@ function io_rename($from,$to){ /** - * Runs an external command and returns it's output as string + * Runs an external command and returns its output as string * * @author Harry Brueckner * @author Andreas Gohr + * @deprecated */ -function io_runcmd($cmd, $input, &$output){ +function io_runcmd($cmd){ + $fh = popen($cmd, "r"); + if(!$fh) return false; + $ret = ''; + while (!feof($fh)) { + $ret .= fread($fh, 8192); + } + pclose($fh); + return $ret; +} + +/** + * Runs an external command with input and output pipes. + * Returns the exit code from the process. + * + * @author Tom N Harris + */ +function io_exec($cmd, $input, &$output){ $descspec = array( 0=>array("pipe","r"), 1=>array("pipe","w"), -- cgit v1.2.3