diff options
author | Tom N Harris <tnharris@whoopdedo.org> | 2010-11-20 14:29:13 -0500 |
---|---|---|
committer | Tom N Harris <tnharris@whoopdedo.org> | 2010-11-20 14:29:13 -0500 |
commit | 3c4b38902b6f6d32222611b22087d5d41d20de6e (patch) | |
tree | 3d28bbaa015e10283050b6512a4f49e53323b8fc /inc/indexer.php | |
parent | 4753bcc0e2fd9417e885e128e8c9ab4bfc566c32 (diff) | |
parent | 420edfd639fb3d0a0f6a2504ecb2f8f6b68be1f7 (diff) | |
download | rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.gz rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.bz2 |
Merge branch 'tokenizer-rewrite' into michitux
Diffstat (limited to 'inc/indexer.php')
-rw-r--r-- | inc/indexer.php | 112 |
1 files changed, 64 insertions, 48 deletions
diff --git a/inc/indexer.php b/inc/indexer.php index 954512673..d4432026e 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -8,6 +8,9 @@ if(!defined('DOKU_INC')) die('meh.'); +// Version tag used to force rebuild on upgrade +define('INDEXER_VERSION', 2); + // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens) if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2); @@ -43,6 +46,20 @@ define('IDX_ASIAN3','['. // Hiragana/Katakana (can be two charact define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')'); /** + * Version of the indexer taking into consideration the external tokenizer. + * The indexer is only compatible with data written by the same version. + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function idx_get_version(){ + global $conf; + if($conf['external_tokenizer']) + return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); + else + return INDEXER_VERSION; +} + +/** * Measure the length of a string. * Differs from strlen in handling of asian characters. * @@ -52,8 +69,10 @@ function wordlen($w){ $l = strlen($w); // If left alone, all chinese "words" will get put into w3.idx // So the "length" of a "word" is faked - if(preg_match('/'.IDX_ASIAN2.'/u',$w)) - $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF + if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) { + foreach($leadbytes[0] as $b) + $l += ord($b) - 0xE1; + } return $l; } @@ -102,22 +121,6 @@ function idx_getIndex($pre, $wlen){ } /** - * Create an empty index file if it doesn't exist yet. - * - * FIXME: This function isn't currently used. It will probably be removed soon. - * - * @author Tom N Harris <tnharris@whoopdedo.org> - */ -function idx_touchIndex($pre, $wlen){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; - if(!@file_exists($fn)){ - touch($fn); - if($conf['fperm']) chmod($fn, $conf['fperm']); - } -} - -/** * Read a line ending with \n. * Returns false on EOF. * @@ -215,8 +218,7 @@ function idx_getPageWords($page){ list($page,$body) = $data; - $body = strtr($body, "\r\n\t", ' '); - $tokens = explode(' ', $body); + $tokens = idx_tokenizer($body, $stopwords); $tokens = array_count_values($tokens); // count the frequency of each token // ensure the deaccented or romanised page names of internal links are added to the token array @@ -237,16 +239,12 @@ function idx_getPageWords($page){ } $words = array(); - foreach ($tokens as $word => $count) { - $arr = idx_tokenizer($word,$stopwords); - $arr = array_count_values($arr); - foreach ($arr as $w => $c) { - $l = wordlen($w); - if(isset($words[$l])){ - $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0); - }else{ - $words[$l] = array($w => $c * $count); - } + foreach ($tokens as $w => $c) { + $l = wordlen($w); + if(isset($words[$l])){ + $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0); + }else{ + $words[$l] = array($w => $c); } } @@ -480,7 +478,7 @@ function idx_indexLengths(&$filter){ } else { $lengths = idx_listIndexLengths(); foreach ( $lengths as $key => $length) { - // we keep all the values equal or superior + // we keep all the values equal or superior if ((int)$length >= (int)$filter) { $idx[] = $length; } @@ -663,33 +661,51 @@ function idx_parseIndexLine(&$page_idx,$line){ * Tokenizes a string into an array of search words * * Uses the same algorithm as idx_getPageWords() + * Takes an arbitrarily complex string and returns a list of words + * suitable for indexing. The string may include spaces and line + * breaks * * @param string $string the query as given by the user * @param arrayref $stopwords array of stopwords * @param boolean $wc are wildcards allowed? + * @return array list of indexable words + * @author Tom N Harris <tnharris@whoopdedo.org> + * @author Andreas Gohr <andi@splitbrain.org> */ function idx_tokenizer($string,&$stopwords,$wc=false){ + global $conf; $words = array(); $wc = ($wc) ? '' : $wc = '\*'; - if(preg_match('/[^0-9A-Za-z]/u', $string)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); - if(!is_null($asia)) $string = $asia; //recover from regexp failure - - $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc)); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if($stopwords && is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; + if (!$stopwords) + $sw = array(); + else + $sw =& $stopwords; + + if ($conf['external_tokenizer']) { + if (0 == io_exec($conf['tokenizer_cmd'], $string, $output)) + $string = $output; + } else { + if(preg_match('/[^0-9A-Za-z ]/u', $string)) { + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); + if(!is_null($asia)) $string = $asia; //recover from regexp failure } - }else{ - $w = $string; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; - $words[] = $w; + } + $string = strtr($string, "\r\n\t", ' '); + if(preg_match('/[^0-9A-Za-z ]/u', $string)) + $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); + + $wordlist = explode(' ', $string); + foreach ($wordlist as $word) { + if(preg_match('/[^0-9A-Za-z]/u', $word)){ + $word = utf8_strtolower($word); + }else{ + $word = strtolower($word); + } + if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; + if(is_int(array_search("$word\n",$stopwords))) continue; + $words[] = $word; } return $words; |