diff options
author | Tom N Harris <tnharris@whoopdedo.org> | 2010-11-20 14:29:13 -0500 |
---|---|---|
committer | Tom N Harris <tnharris@whoopdedo.org> | 2010-11-20 14:29:13 -0500 |
commit | 3c4b38902b6f6d32222611b22087d5d41d20de6e (patch) | |
tree | 3d28bbaa015e10283050b6512a4f49e53323b8fc | |
parent | 4753bcc0e2fd9417e885e128e8c9ab4bfc566c32 (diff) | |
parent | 420edfd639fb3d0a0f6a2504ecb2f8f6b68be1f7 (diff) | |
download | rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.gz rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.bz2 |
Merge branch 'tokenizer-rewrite' into michitux
-rwxr-xr-x | bin/indexer.php | 8 | ||||
-rw-r--r-- | conf/dokuwiki.php | 2 | ||||
-rw-r--r-- | inc/indexer.php | 112 | ||||
-rw-r--r-- | inc/io.php | 23 | ||||
-rw-r--r-- | lib/exe/indexer.php | 7 | ||||
-rw-r--r-- | lib/exe/xmlrpc.php | 4 | ||||
-rw-r--r-- | lib/plugins/config/lang/en/lang.php | 2 | ||||
-rw-r--r-- | lib/plugins/config/settings/config.metadata.php | 2 |
8 files changed, 98 insertions, 62 deletions
diff --git a/bin/indexer.php b/bin/indexer.php index 48e98b571..497c6146a 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -13,10 +13,6 @@ require_once(DOKU_INC.'inc/auth.php'); require_once(DOKU_INC.'inc/cliopts.php'); session_write_close(); -// Version tag used to force rebuild on upgrade -// Need to keep in sync with lib/exe/indexer.php -if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2); - // handle options $short_opts = 'hcuq'; $long_opts = array('help', 'clear', 'update', 'quiet'); @@ -88,7 +84,7 @@ function _index($id){ if(!$CLEAR){ $idxtag = metaFN($id,'.indexed'); if(@file_exists($idxtag)){ - if(io_readFile($idxtag) >= INDEXER_VERSION){ + if(io_readFile($idxtag) == idx_get_version()){ $last = @filemtime(metaFN($id,'.indexed')); if($last > @filemtime(wikiFN($id))) return; } @@ -98,7 +94,7 @@ function _index($id){ _lock(); _quietecho("$id... "); idx_addPage($id); - io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION); + io_saveFile(metaFN($id,'.indexed'), idx_get_version()); _quietecho("done.\n"); _unlock(); } diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php index 2405494e0..f10c70e58 100644 --- a/conf/dokuwiki.php +++ b/conf/dokuwiki.php @@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abor $conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard) $conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation $conf['rememberme'] = 1; //Enable/disable remember me on login +$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing +$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati'; //Set target to use when creating links - leave empty for same window $conf['target']['wiki'] = ''; diff --git a/inc/indexer.php b/inc/indexer.php index 954512673..d4432026e 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -8,6 +8,9 @@ if(!defined('DOKU_INC')) die('meh.'); +// Version tag used to force rebuild on upgrade +define('INDEXER_VERSION', 2); + // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens) if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2); @@ -43,6 +46,20 @@ define('IDX_ASIAN3','['. // Hiragana/Katakana (can be two charact define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')'); /** + * Version of the indexer taking into consideration the external tokenizer. + * The indexer is only compatible with data written by the same version. + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function idx_get_version(){ + global $conf; + if($conf['external_tokenizer']) + return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); + else + return INDEXER_VERSION; +} + +/** * Measure the length of a string. * Differs from strlen in handling of asian characters. * @@ -52,8 +69,10 @@ function wordlen($w){ $l = strlen($w); // If left alone, all chinese "words" will get put into w3.idx // So the "length" of a "word" is faked - if(preg_match('/'.IDX_ASIAN2.'/u',$w)) - $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF + if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) { + foreach($leadbytes[0] as $b) + $l += ord($b) - 0xE1; + } return $l; } @@ -102,22 +121,6 @@ function idx_getIndex($pre, $wlen){ } /** - * Create an empty index file if it doesn't exist yet. - * - * FIXME: This function isn't currently used. It will probably be removed soon. - * - * @author Tom N Harris <tnharris@whoopdedo.org> - */ -function idx_touchIndex($pre, $wlen){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; - if(!@file_exists($fn)){ - touch($fn); - if($conf['fperm']) chmod($fn, $conf['fperm']); - } -} - -/** * Read a line ending with \n. * Returns false on EOF. * @@ -215,8 +218,7 @@ function idx_getPageWords($page){ list($page,$body) = $data; - $body = strtr($body, "\r\n\t", ' '); - $tokens = explode(' ', $body); + $tokens = idx_tokenizer($body, $stopwords); $tokens = array_count_values($tokens); // count the frequency of each token // ensure the deaccented or romanised page names of internal links are added to the token array @@ -237,16 +239,12 @@ function idx_getPageWords($page){ } $words = array(); - foreach ($tokens as $word => $count) { - $arr = idx_tokenizer($word,$stopwords); - $arr = array_count_values($arr); - foreach ($arr as $w => $c) { - $l = wordlen($w); - if(isset($words[$l])){ - $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0); - }else{ - $words[$l] = array($w => $c * $count); - } + foreach ($tokens as $w => $c) { + $l = wordlen($w); + if(isset($words[$l])){ + $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0); + }else{ + $words[$l] = array($w => $c); } } @@ -480,7 +478,7 @@ function idx_indexLengths(&$filter){ } else { $lengths = idx_listIndexLengths(); foreach ( $lengths as $key => $length) { - // we keep all the values equal or superior + // we keep all the values equal or superior if ((int)$length >= (int)$filter) { $idx[] = $length; } @@ -663,33 +661,51 @@ function idx_parseIndexLine(&$page_idx,$line){ * Tokenizes a string into an array of search words * * Uses the same algorithm as idx_getPageWords() + * Takes an arbitrarily complex string and returns a list of words + * suitable for indexing. The string may include spaces and line + * breaks * * @param string $string the query as given by the user * @param arrayref $stopwords array of stopwords * @param boolean $wc are wildcards allowed? + * @return array list of indexable words + * @author Tom N Harris <tnharris@whoopdedo.org> + * @author Andreas Gohr <andi@splitbrain.org> */ function idx_tokenizer($string,&$stopwords,$wc=false){ + global $conf; $words = array(); $wc = ($wc) ? '' : $wc = '\*'; - if(preg_match('/[^0-9A-Za-z]/u', $string)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); - if(!is_null($asia)) $string = $asia; //recover from regexp failure - - $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc)); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if($stopwords && is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; + if (!$stopwords) + $sw = array(); + else + $sw =& $stopwords; + + if ($conf['external_tokenizer']) { + if (0 == io_exec($conf['tokenizer_cmd'], $string, $output)) + $string = $output; + } else { + if(preg_match('/[^0-9A-Za-z ]/u', $string)) { + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); + if(!is_null($asia)) $string = $asia; //recover from regexp failure } - }else{ - $w = $string; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; - $words[] = $w; + } + $string = strtr($string, "\r\n\t", ' '); + if(preg_match('/[^0-9A-Za-z ]/u', $string)) + $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); + + $wordlist = explode(' ', $string); + foreach ($wordlist as $word) { + if(preg_match('/[^0-9A-Za-z]/u', $word)){ + $word = utf8_strtolower($word); + }else{ + $word = strtolower($word); + } + if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; + if(is_int(array_search("$word\n",$stopwords))) continue; + $words[] = $word; } return $words; diff --git a/inc/io.php b/inc/io.php index 1d69dabc9..a0be00da3 100644 --- a/inc/io.php +++ b/inc/io.php @@ -529,7 +529,7 @@ function io_rename($from,$to){ /** - * Runs an external command and returns it's output as string + * Runs an external command and returns its output as string * * @author Harry Brueckner <harry_b@eml.cc> * @author Andreas Gohr <andi@splitbrain.org> @@ -547,6 +547,27 @@ function io_runcmd($cmd){ } /** + * Runs an external command with input and output pipes. + * Returns the exit code from the process. + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function io_exec($cmd, $input, &$output){ + $descspec = array( + 0=>array("pipe","r"), + 1=>array("pipe","w"), + 2=>array("pipe","w")); + $ph = proc_open($cmd, $descspec, $pipes); + if(!$ph) return -1; + fclose($pipes[2]); // ignore stderr + fwrite($pipes[0], $input); + fclose($pipes[0]); + $output = stream_get_contents($pipes[1]); + fclose($pipes[1]); + return proc_close($ph); +} + +/** * Search a file for matching lines * * This is probably not faster than file()+preg_grep() but less diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 3fa81715b..55d860296 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -11,9 +11,6 @@ require_once(DOKU_INC.'inc/init.php'); session_write_close(); //close session if(!defined('NL')) define('NL',"\n"); -// Version tag used to force rebuild on upgrade -define('INDEXER_VERSION', 2); - // keep running after browser closes connection @ignore_user_abort(true); @@ -140,7 +137,7 @@ function runIndexer(){ // check if indexing needed $idxtag = metaFN($ID,'.indexed'); if(@file_exists($idxtag)){ - if(io_readFile($idxtag) >= INDEXER_VERSION){ + if(trim(io_readFile($idxtag)) == idx_get_version()){ $last = @filemtime($idxtag); if($last > @filemtime(wikiFN($ID))){ print "runIndexer(): index for $ID up to date".NL; @@ -168,7 +165,7 @@ function runIndexer(){ idx_addPage($ID); // we're finished - save and free lock - io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION); + io_saveFile(metaFN($ID,'.indexed'), idx_get_version()); @rmdir($lock); print "runIndexer(): finished".NL; return true; diff --git a/lib/exe/xmlrpc.php b/lib/exe/xmlrpc.php index f06792361..410d4f6ba 100644 --- a/lib/exe/xmlrpc.php +++ b/lib/exe/xmlrpc.php @@ -1,7 +1,7 @@ <?php if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../'); -// fix when '<?xml' isn't on the very first line +// fix when '< ?xml' isn't on the very first line if(isset($HTTP_RAW_POST_DATA)) $HTTP_RAW_POST_DATA = trim($HTTP_RAW_POST_DATA); /** @@ -570,7 +570,7 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer { idx_addPage($id); // we're finished - save and free lock - io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION); + io_saveFile(metaFN($id,'.indexed'), idx_get_version()); @rmdir($lock); } diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php index a944d6bd7..85214bf98 100644 --- a/lib/plugins/config/lang/en/lang.php +++ b/lib/plugins/config/lang/en/lang.php @@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output'; $lang['renderer__core'] = '%s (dokuwiki core)'; $lang['renderer__plugin'] = '%s (plugin)'; $lang['rememberme'] = 'Allow permanent login cookies (remember me)'; +$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing'; +$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer'; $lang['rss_type'] = 'XML feed type'; $lang['rss_linkto'] = 'XML feed links to'; diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php index edba65262..331da5ab8 100644 --- a/lib/plugins/config/settings/config.metadata.php +++ b/lib/plugins/config/settings/config.metadata.php @@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff'); $meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3)); $meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml')); $meta['readdircache'] = array('numeric'); +$meta['external_tokenizer'] = array('onoff'); +$meta['tokenizer_cmd'] = array('string'); $meta['_network'] = array('fieldset'); $meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i'); |