From 3a1a171b951828395a7578475e86e622f9a7205c Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:17:52 -0500 Subject: Remove unused idx_touchIndex function --- inc/indexer.php | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index f5330040a..7a8bb3ff8 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -103,22 +103,6 @@ function idx_getIndex($pre, $wlen){ return file($fn); } -/** - * Create an empty index file if it doesn't exist yet. - * - * FIXME: This function isn't currently used. It will probably be removed soon. - * - * @author Tom N Harris - */ -function idx_touchIndex($pre, $wlen){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; - if(!@file_exists($fn)){ - touch($fn); - if($conf['fperm']) chmod($fn, $conf['fperm']); - } -} - /** * Read a line ending with \n. * Returns false on EOF. -- cgit v1.2.3 From 4b9792c696658fe0cbedc187198fa463b6ff83fc Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:22:08 -0500 Subject: Measure length of multi-character Asian words --- inc/indexer.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 7a8bb3ff8..d9eccac76 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -52,8 +52,10 @@ function wordlen($w){ $l = strlen($w); // If left alone, all chinese "words" will get put into w3.idx // So the "length" of a "word" is faked - if(preg_match('/'.IDX_ASIAN2.'/u',$w)) - $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF + if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) { + foreach($leadbytes[0] as $b) + $l += ord($b) - 0xE1; + } return $l; } -- cgit v1.2.3 From 4e1bf408de9297d5773cd8bfe1af997c83eab1a2 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:32:23 -0500 Subject: Refactor tokenizer to avoid splitting multiple times --- inc/indexer.php | 69 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 28 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index d9eccac76..56d80b7fa 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -203,8 +203,7 @@ function idx_getPageWords($page){ list($page,$body) = $data; - $body = strtr($body, "\r\n\t", ' '); - $tokens = explode(' ', $body); + $tokens = idx_tokenizer($body, $stopwords); $tokens = array_count_values($tokens); // count the frequency of each token // ensure the deaccented or romanised page names of internal links are added to the token array @@ -225,16 +224,12 @@ function idx_getPageWords($page){ } $words = array(); - foreach ($tokens as $word => $count) { - $arr = idx_tokenizer($word,$stopwords); - $arr = array_count_values($arr); - foreach ($arr as $w => $c) { - $l = wordlen($w); - if(isset($words[$l])){ - $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0); - }else{ - $words[$l] = array($w => $c * $count); - } + foreach ($tokens as $w => $c) { + $l = wordlen($w); + if(isset($words[$l])){ + $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0); + }else{ + $words[$l] = array($w => $c); } } @@ -655,33 +650,51 @@ function idx_parseIndexLine(&$page_idx,$line){ * Tokenizes a string into an array of search words * * Uses the same algorithm as idx_getPageWords() + * Takes an arbitrarily complex string and returns a list of words + * suitable for indexing. The string may include spaces and line + * breaks * * @param string $string the query as given by the user * @param arrayref $stopwords array of stopwords * @param boolean $wc are wildcards allowed? + * @return array list of indexable words + * @author Tom N Harris + * @author Andreas Gohr */ function idx_tokenizer($string,&$stopwords,$wc=false){ $words = array(); $wc = ($wc) ? '' : $wc = '\*'; - if(preg_match('/[^0-9A-Za-z]/u', $string)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); - if(!is_null($asia)) $string = $asia; //recover from regexp failure - - $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc)); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if($stopwords && is_int(array_search("$w\n",$stopwords))) continue; + if (!$stopwords) + $sw = array(); + else + $sw =& $stopwords; + + $string = strtr($string, "\r\n\t", ' '); + if(preg_match('/[^0-9A-Za-z ]/u', $string)) + $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); + + $wordlist = explode(' ', $string); + foreach ($wordlist as $word) { + if(preg_match('/[^0-9A-Za-z]/u', $word)){ + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word); + if(!is_null($asia)) $word = $asia; //recover from regexp failure + + $arr = explode(' ', $word); + foreach ($arr as $w) { + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; + $w = utf8_strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) continue; + $words[] = $w; + } + }else{ + $w = $word; + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; + $w = strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) return $words; $words[] = $w; } - }else{ - $w = $string; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; - $words[] = $w; } return $words; -- cgit v1.2.3 From 5bcab0c47360e5b31237885cff4583e0eba479f8 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 15 Nov 2010 15:48:31 -0500 Subject: tokenizer was returning prematurely --- inc/indexer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 56d80b7fa..b3e10a548 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -690,9 +690,9 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ } }else{ $w = $word; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; + if(is_int(array_search("$w\n",$stopwords))) continue; $words[] = $w; } } -- cgit v1.2.3 From 1c07b9e622d139fa815c955c89569f96342475fb Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 16 Nov 2010 18:09:53 -0500 Subject: Use external program to split pages into words An external tokenizer inserts extra spaces to mark words in the input text. The text is sent through STDIN and STDOUT file handles. A good choice for Chinese and Japanese is MeCab. http://sourceforge.net/projects/mecab/ With the command line 'mecab -O wakati' --- inc/indexer.php | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index b3e10a548..1c955a99d 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -662,6 +662,7 @@ function idx_parseIndexLine(&$page_idx,$line){ * @author Andreas Gohr */ function idx_tokenizer($string,&$stopwords,$wc=false){ + global $conf; $words = array(); $wc = ($wc) ? '' : $wc = '\*'; @@ -670,6 +671,16 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ else $sw =& $stopwords; + if ($conf['external_tokenizer']) { + if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output)) + $string = $output; + } else { + if(preg_match('/[^0-9A-Za-z ]/u', $string)) { + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); + if(!is_null($asia)) $string = $asia; //recover from regexp failure + } + } $string = strtr($string, "\r\n\t", ' '); if(preg_match('/[^0-9A-Za-z ]/u', $string)) $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); @@ -677,24 +688,13 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ $wordlist = explode(' ', $string); foreach ($wordlist as $word) { if(preg_match('/[^0-9A-Za-z]/u', $word)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word); - if(!is_null($asia)) $word = $asia; //recover from regexp failure - - $arr = explode(' ', $word); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; - } + $word = utf8_strtolower($word); }else{ - $w = $word; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; + $word = strtolower($word); } + if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; + if(is_int(array_search("$word\n",$stopwords))) continue; + $words[] = $word; } return $words; -- cgit v1.2.3 From 7c2ef4e8d524fb9262c5a08831220f9fb2dc11fe Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 17 Nov 2010 17:02:31 -0500 Subject: Use a different indexer version when external tokenizer is enabled --- inc/indexer.php | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 1c955a99d..4914c9fc6 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -8,6 +8,9 @@ if(!defined('DOKU_INC')) die('meh.'); +// Version tag used to force rebuild on upgrade +define('INDEXER_VERSION', 2); + // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens) if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2); @@ -42,6 +45,20 @@ define('IDX_ASIAN3','['. // Hiragana/Katakana (can be two charact ']?'); define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')'); +/** + * Version of the indexer taking into consideration the external tokenizer. + * The indexer is only compatible with data written by the same version. + * + * @author Tom N Harris + */ +function idx_get_version(){ + global $conf; + if($conf['external_tokenizer']) + return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); + else + return INDEXER_VERSION; +} + /** * Measure the length of a string. * Differs from strlen in handling of asian characters. -- cgit v1.2.3 From 420edfd639fb3d0a0f6a2504ecb2f8f6b68be1f7 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Thu, 18 Nov 2010 13:55:55 -0500 Subject: Restore io_runcmd, use io_exec for exec with pipes --- inc/indexer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 4914c9fc6..32fbf4a1a 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -484,7 +484,7 @@ function idx_indexLengths(&$filter){ } else { $lengths = idx_listIndexLengths(); foreach ( $lengths as $key => $length) { - // we keep all the values equal or superior + // we keep all the values equal or superior if ((int)$length >= (int)$filter) { $idx[] = $length; } @@ -689,7 +689,7 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ $sw =& $stopwords; if ($conf['external_tokenizer']) { - if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output)) + if (0 == io_exec($conf['tokenizer_cmd'], $string, $output)) $string = $output; } else { if(preg_match('/[^0-9A-Za-z ]/u', $string)) { -- cgit v1.2.3