summary refs log tree commit diff
diff options
context:
space:
mode:
author Tom N Harris <tnharris@whoopdedo.org> 2010-11-20 14:29:13 -0500
committer Tom N Harris <tnharris@whoopdedo.org> 2010-11-20 14:29:13 -0500
commit 3c4b38902b6f6d32222611b22087d5d41d20de6e (patch)
tree 3d28bbaa015e10283050b6512a4f49e53323b8fc
parent 4753bcc0e2fd9417e885e128e8c9ab4bfc566c32 (diff)
parent 420edfd639fb3d0a0f6a2504ecb2f8f6b68be1f7 (diff)
download rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.gz
rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.bz2
Merge branch 'tokenizer-rewrite' into michitux
-rwxr-xr-x bin/indexer.php 8
-rw-r--r-- conf/dokuwiki.php 2
-rw-r--r-- inc/indexer.php 112
-rw-r--r-- inc/io.php 23
-rw-r--r-- lib/exe/indexer.php 7
-rw-r--r-- lib/exe/xmlrpc.php 4
-rw-r--r-- lib/plugins/config/lang/en/lang.php 2
-rw-r--r-- lib/plugins/config/settings/config.metadata.php 2
8 files changed, 98 insertions, 62 deletions
diff --git a/bin/indexer.php b/bin/indexer.php
index 48e98b571..497c6146a 100755
--- a/bin/indexer.php
+++ b/bin/indexer.php
@@ -13,10 +13,6 @@ require_once(DOKU_INC.'inc/auth.php');
require_once(DOKU_INC.'inc/cliopts.php');
session_write_close();
-// Version tag used to force rebuild on upgrade
-// Need to keep in sync with lib/exe/indexer.php
-if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2);
-
// handle options
$short_opts = 'hcuq';
$long_opts = array('help', 'clear', 'update', 'quiet');
@@ -88,7 +84,7 @@ function _index($id){
if(!$CLEAR){
$idxtag = metaFN($id,'.indexed');
if(@file_exists($idxtag)){
- if(io_readFile($idxtag) >= INDEXER_VERSION){
+ if(io_readFile($idxtag) == idx_get_version()){
$last = @filemtime(metaFN($id,'.indexed'));
if($last > @filemtime(wikiFN($id))) return;
}
@@ -98,7 +94,7 @@ function _index($id){
_lock();
_quietecho("$id... ");
idx_addPage($id);
- io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION);
+ io_saveFile(metaFN($id,'.indexed'), idx_get_version());
_quietecho("done.\n");
_unlock();
}
diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php
index 2405494e0..f10c70e58 100644
--- a/conf/dokuwiki.php
+++ b/conf/dokuwiki.php
@@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abor
$conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard)
$conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation
$conf['rememberme'] = 1; //Enable/disable remember me on login
+$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing
+$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati';
//Set target to use when creating links - leave empty for same window
$conf['target']['wiki'] = '';
diff --git a/inc/indexer.php b/inc/indexer.php
index 954512673..d4432026e 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -8,6 +8,9 @@
if(!defined('DOKU_INC')) die('meh.');
+// Version tag used to force rebuild on upgrade
+define('INDEXER_VERSION', 2);
+
// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
@@ -43,6 +46,20 @@ define('IDX_ASIAN3','['. // Hiragana/Katakana (can be two charact
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
/**
+ * Version of the indexer taking into consideration the external tokenizer.
+ * The indexer is only compatible with data written by the same version.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_get_version(){
+ global $conf;
+ if($conf['external_tokenizer'])
+ return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']);
+ else
+ return INDEXER_VERSION;
+}
+
+/**
* Measure the length of a string.
* Differs from strlen in handling of asian characters.
*
@@ -52,8 +69,10 @@ function wordlen($w){
$l = strlen($w);
// If left alone, all chinese "words" will get put into w3.idx
// So the "length" of a "word" is faked
- if(preg_match('/'.IDX_ASIAN2.'/u',$w))
- $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF
+ if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
+ foreach($leadbytes[0] as $b)
+ $l += ord($b) - 0xE1;
+ }
return $l;
}
@@ -102,22 +121,6 @@ function idx_getIndex($pre, $wlen){
}
/**
- * Create an empty index file if it doesn't exist yet.
- *
- * FIXME: This function isn't currently used. It will probably be removed soon.
- *
- * @author Tom N Harris <tnharris@whoopdedo.org>
- */
-function idx_touchIndex($pre, $wlen){
- global $conf;
- $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
- if(!@file_exists($fn)){
- touch($fn);
- if($conf['fperm']) chmod($fn, $conf['fperm']);
- }
-}
-
-/**
* Read a line ending with \n.
* Returns false on EOF.
*
@@ -215,8 +218,7 @@ function idx_getPageWords($page){
list($page,$body) = $data;
- $body = strtr($body, "\r\n\t", ' ');
- $tokens = explode(' ', $body);
+ $tokens = idx_tokenizer($body, $stopwords);
$tokens = array_count_values($tokens); // count the frequency of each token
// ensure the deaccented or romanised page names of internal links are added to the token array
@@ -237,16 +239,12 @@ function idx_getPageWords($page){
}
$words = array();
- foreach ($tokens as $word => $count) {
- $arr = idx_tokenizer($word,$stopwords);
- $arr = array_count_values($arr);
- foreach ($arr as $w => $c) {
- $l = wordlen($w);
- if(isset($words[$l])){
- $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
- }else{
- $words[$l] = array($w => $c * $count);
- }
+ foreach ($tokens as $w => $c) {
+ $l = wordlen($w);
+ if(isset($words[$l])){
+ $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
+ }else{
+ $words[$l] = array($w => $c);
}
}
@@ -480,7 +478,7 @@ function idx_indexLengths(&$filter){
} else {
$lengths = idx_listIndexLengths();
foreach ( $lengths as $key => $length) {
- // we keep all the values equal or superior
+ // we keep all the values equal or superior
if ((int)$length >= (int)$filter) {
$idx[] = $length;
}
@@ -663,33 +661,51 @@ function idx_parseIndexLine(&$page_idx,$line){
* Tokenizes a string into an array of search words
*
* Uses the same algorithm as idx_getPageWords()
+ * Takes an arbitrarily complex string and returns a list of words
+ * suitable for indexing. The string may include spaces and line
+ * breaks
*
* @param string $string the query as given by the user
* @param arrayref $stopwords array of stopwords
* @param boolean $wc are wildcards allowed?
+ * @return array list of indexable words
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @author Andreas Gohr <andi@splitbrain.org>
*/
function idx_tokenizer($string,&$stopwords,$wc=false){
+ global $conf;
$words = array();
$wc = ($wc) ? '' : $wc = '\*';
- if(preg_match('/[^0-9A-Za-z]/u', $string)){
- // handle asian chars as single words (may fail on older PHP version)
- $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
- if(!is_null($asia)) $string = $asia; //recover from regexp failure
-
- $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
- foreach ($arr as $w) {
- if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
- $w = utf8_strtolower($w);
- if($stopwords && is_int(array_search("$w\n",$stopwords))) continue;
- $words[] = $w;
+ if (!$stopwords)
+ $sw = array();
+ else
+ $sw =& $stopwords;
+
+ if ($conf['external_tokenizer']) {
+ if (0 == io_exec($conf['tokenizer_cmd'], $string, $output))
+ $string = $output;
+ } else {
+ if(preg_match('/[^0-9A-Za-z ]/u', $string)) {
+ // handle asian chars as single words (may fail on older PHP version)
+ $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
+ if(!is_null($asia)) $string = $asia; //recover from regexp failure
}
- }else{
- $w = $string;
- if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words;
- $w = strtolower($w);
- if(is_int(array_search("$w\n",$stopwords))) return $words;
- $words[] = $w;
+ }
+ $string = strtr($string, "\r\n\t", ' ');
+ if(preg_match('/[^0-9A-Za-z ]/u', $string))
+ $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc);
+
+ $wordlist = explode(' ', $string);
+ foreach ($wordlist as $word) {
+ if(preg_match('/[^0-9A-Za-z]/u', $word)){
+ $word = utf8_strtolower($word);
+ }else{
+ $word = strtolower($word);
+ }
+ if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
+ if(is_int(array_search("$word\n",$stopwords))) continue;
+ $words[] = $word;
}
return $words;
diff --git a/inc/io.php b/inc/io.php
index 1d69dabc9..a0be00da3 100644
--- a/inc/io.php
+++ b/inc/io.php
@@ -529,7 +529,7 @@ function io_rename($from,$to){
/**
- * Runs an external command and returns it's output as string
+ * Runs an external command and returns its output as string
*
* @author Harry Brueckner <harry_b@eml.cc>
* @author Andreas Gohr <andi@splitbrain.org>
@@ -547,6 +547,27 @@ function io_runcmd($cmd){
}
/**
+ * Runs an external command with input and output pipes.
+ * Returns the exit code from the process.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function io_exec($cmd, $input, &$output){
+ $descspec = array(
+ 0=>array("pipe","r"),
+ 1=>array("pipe","w"),
+ 2=>array("pipe","w"));
+ $ph = proc_open($cmd, $descspec, $pipes);
+ if(!$ph) return -1;
+ fclose($pipes[2]); // ignore stderr
+ fwrite($pipes[0], $input);
+ fclose($pipes[0]);
+ $output = stream_get_contents($pipes[1]);
+ fclose($pipes[1]);
+ return proc_close($ph);
+}
+
+/**
* Search a file for matching lines
*
* This is probably not faster than file()+preg_grep() but less
diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php
index 3fa81715b..55d860296 100644
--- a/lib/exe/indexer.php
+++ b/lib/exe/indexer.php
@@ -11,9 +11,6 @@ require_once(DOKU_INC.'inc/init.php');
session_write_close(); //close session
if(!defined('NL')) define('NL',"\n");
-// Version tag used to force rebuild on upgrade
-define('INDEXER_VERSION', 2);
-
// keep running after browser closes connection
@ignore_user_abort(true);
@@ -140,7 +137,7 @@ function runIndexer(){
// check if indexing needed
$idxtag = metaFN($ID,'.indexed');
if(@file_exists($idxtag)){
- if(io_readFile($idxtag) >= INDEXER_VERSION){
+ if(trim(io_readFile($idxtag)) == idx_get_version()){
$last = @filemtime($idxtag);
if($last > @filemtime(wikiFN($ID))){
print "runIndexer(): index for $ID up to date".NL;
@@ -168,7 +165,7 @@ function runIndexer(){
idx_addPage($ID);
// we're finished - save and free lock
- io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION);
+ io_saveFile(metaFN($ID,'.indexed'), idx_get_version());
@rmdir($lock);
print "runIndexer(): finished".NL;
return true;
diff --git a/lib/exe/xmlrpc.php b/lib/exe/xmlrpc.php
index f06792361..410d4f6ba 100644
--- a/lib/exe/xmlrpc.php
+++ b/lib/exe/xmlrpc.php
@@ -1,7 +1,7 @@
<?php
if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../');
-// fix when '<?xml' isn't on the very first line
+// fix when '< ?xml' isn't on the very first line
if(isset($HTTP_RAW_POST_DATA)) $HTTP_RAW_POST_DATA = trim($HTTP_RAW_POST_DATA);
/**
@@ -570,7 +570,7 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer {
idx_addPage($id);
// we're finished - save and free lock
- io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION);
+ io_saveFile(metaFN($id,'.indexed'), idx_get_version());
@rmdir($lock);
}
diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php
index a944d6bd7..85214bf98 100644
--- a/lib/plugins/config/lang/en/lang.php
+++ b/lib/plugins/config/lang/en/lang.php
@@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output';
$lang['renderer__core'] = '%s (dokuwiki core)';
$lang['renderer__plugin'] = '%s (plugin)';
$lang['rememberme'] = 'Allow permanent login cookies (remember me)';
+$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing';
+$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer';
$lang['rss_type'] = 'XML feed type';
$lang['rss_linkto'] = 'XML feed links to';
diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php
index edba65262..331da5ab8 100644
--- a/lib/plugins/config/settings/config.metadata.php
+++ b/lib/plugins/config/settings/config.metadata.php
@@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff');
$meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3));
$meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml'));
$meta['readdircache'] = array('numeric');
+$meta['external_tokenizer'] = array('onoff');
+$meta['tokenizer_cmd'] = array('string');
$meta['_network'] = array('fieldset');
$meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i');