From 3a1a171b951828395a7578475e86e622f9a7205c Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:17:52 -0500 Subject: Remove unused idx_touchIndex function --- inc/indexer.php | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index f5330040a..7a8bb3ff8 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -103,22 +103,6 @@ function idx_getIndex($pre, $wlen){ return file($fn); } -/** - * Create an empty index file if it doesn't exist yet. - * - * FIXME: This function isn't currently used. It will probably be removed soon. - * - * @author Tom N Harris - */ -function idx_touchIndex($pre, $wlen){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; - if(!@file_exists($fn)){ - touch($fn); - if($conf['fperm']) chmod($fn, $conf['fperm']); - } -} - /** * Read a line ending with \n. * Returns false on EOF. -- cgit v1.2.3 From ee0891d8ffd7e4a59c958b9546a3b8382e4e5991 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:18:51 -0500 Subject: Do not assume that index files will be backward compatible --- lib/exe/indexer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 3fa81715b..4a6f74ba4 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -140,7 +140,7 @@ function runIndexer(){ // check if indexing needed $idxtag = metaFN($ID,'.indexed'); if(@file_exists($idxtag)){ - if(io_readFile($idxtag) >= INDEXER_VERSION){ + if(trim(io_readFile($idxtag)) == INDEXER_VERSION){ $last = @filemtime($idxtag); if($last > @filemtime(wikiFN($ID))){ print "runIndexer(): index for $ID up to date".NL; -- cgit v1.2.3 From 4b9792c696658fe0cbedc187198fa463b6ff83fc Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:22:08 -0500 Subject: Measure length of multi-character Asian words --- inc/indexer.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 
7a8bb3ff8..d9eccac76 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -52,8 +52,10 @@ function wordlen($w){ $l = strlen($w); // If left alone, all chinese "words" will get put into w3.idx // So the "length" of a "word" is faked - if(preg_match('/'.IDX_ASIAN2.'/u',$w)) - $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF + if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) { + foreach($leadbytes[0] as $b) + $l += ord($b) - 0xE1; + } return $l; } -- cgit v1.2.3 From 4e1bf408de9297d5773cd8bfe1af997c83eab1a2 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:32:23 -0500 Subject: Refactor tokenizer to avoid splitting multiple times --- inc/indexer.php | 69 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index d9eccac76..56d80b7fa 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -203,8 +203,7 @@ function idx_getPageWords($page){ list($page,$body) = $data; - $body = strtr($body, "\r\n\t", ' '); - $tokens = explode(' ', $body); + $tokens = idx_tokenizer($body, $stopwords); $tokens = array_count_values($tokens); // count the frequency of each token // ensure the deaccented or romanised page names of internal links are added to the token array @@ -225,16 +224,12 @@ function idx_getPageWords($page){ } $words = array(); - foreach ($tokens as $word => $count) { - $arr = idx_tokenizer($word,$stopwords); - $arr = array_count_values($arr); - foreach ($arr as $w => $c) { - $l = wordlen($w); - if(isset($words[$l])){ - $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0); - }else{ - $words[$l] = array($w => $c * $count); - } + foreach ($tokens as $w => $c) { + $l = wordlen($w); + if(isset($words[$l])){ + $words[$l][$w] = $c + (isset($words[$l][$w]) ? 
$words[$l][$w] : 0); + }else{ + $words[$l] = array($w => $c); } } @@ -655,33 +650,51 @@ function idx_parseIndexLine(&$page_idx,$line){ * Tokenizes a string into an array of search words * * Uses the same algorithm as idx_getPageWords() + * Takes an arbitrarily complex string and returns a list of words + * suitable for indexing. The string may include spaces and line + * breaks * * @param string $string the query as given by the user * @param arrayref $stopwords array of stopwords * @param boolean $wc are wildcards allowed? + * @return array list of indexable words + * @author Tom N Harris + * @author Andreas Gohr */ function idx_tokenizer($string,&$stopwords,$wc=false){ $words = array(); $wc = ($wc) ? '' : $wc = '\*'; - if(preg_match('/[^0-9A-Za-z]/u', $string)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); - if(!is_null($asia)) $string = $asia; //recover from regexp failure - - $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc)); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if($stopwords && is_int(array_search("$w\n",$stopwords))) continue; + if (!$stopwords) + $sw = array(); + else + $sw =& $stopwords; + + $string = strtr($string, "\r\n\t", ' '); + if(preg_match('/[^0-9A-Za-z ]/u', $string)) + $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); + + $wordlist = explode(' ', $string); + foreach ($wordlist as $word) { + if(preg_match('/[^0-9A-Za-z]/u', $word)){ + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word); + if(!is_null($asia)) $word = $asia; //recover from regexp failure + + $arr = explode(' ', $word); + foreach ($arr as $w) { + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; + $w = utf8_strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) continue; + $words[] = $w; + } + 
}else{ + $w = $word; + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; + $w = strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) return $words; $words[] = $w; } - }else{ - $w = $string; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; - $words[] = $w; } return $words; -- cgit v1.2.3 From 5bcab0c47360e5b31237885cff4583e0eba479f8 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 15 Nov 2010 15:48:31 -0500 Subject: tokenizer was returning prematurely --- inc/indexer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 56d80b7fa..b3e10a548 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -690,9 +690,9 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ } }else{ $w = $word; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words; + if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) return $words; + if(is_int(array_search("$w\n",$stopwords))) continue; $words[] = $w; } } -- cgit v1.2.3 From 06af2d035180c4fb746a9b88c11178c516c88092 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Mon, 15 Nov 2010 22:08:06 +0100 Subject: Indexer speed improvement: joined array vs. single lines From my experience with a benchmark of the indexer it is faster to first join the array of all index entries and then write them back together instead of writing every single entry. This might increase memory usage, but I couldn't see a significant increase and this function is also only used for the small index files, not for the large pagewords index. 
--- inc/indexer.php | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index f5330040a..0a7e2265e 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -67,9 +67,7 @@ function idx_saveIndex($pre, $wlen, &$idx){ $fn = $conf['indexdir'].'/'.$pre.$wlen; $fh = @fopen($fn.'.tmp','w'); if(!$fh) return false; - foreach ($idx as $line) { - fwrite($fh,$line); - } + fwrite($fh,join('', $idx)); fclose($fh); if(isset($conf['fperm'])) chmod($fn.'.tmp', $conf['fperm']); io_rename($fn.'.tmp', $fn.'.idx'); -- cgit v1.2.3 From 037b55733d384c194f7554c832f95a5e566c5884 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Mon, 15 Nov 2010 22:13:36 +0100 Subject: Indexer improvement: replace _freadline by fgets In PHP versions newer than 4.3.0 fgets reads a whole line regardless of its length when no length is given. Thus the loop in _freadline isn't needed. This increases the speed significantly as _freadline was called very often. --- inc/indexer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 0a7e2265e..a07c3b89a 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -149,7 +149,7 @@ function idx_saveIndexLine($pre, $wlen, $idx, $line){ $ih = @fopen($fn.'.idx','r'); if ($ih) { $ln = -1; - while (($curline = _freadline($ih)) !== false) { + while (($curline = fgets($ih)) !== false) { if (++$ln == $idx) { fwrite($fh, $line); } else { @@ -181,7 +181,7 @@ function idx_getIndexLine($pre, $wlen, $idx){ $fh = @fopen($fn,'r'); if(!$fh) return ''; $ln = -1; - while (($line = _freadline($fh)) !== false) { + while (($line = fgets($fh)) !== false) { if (++$ln == $idx) break; } fclose($fh); -- cgit v1.2.3 From e5e503830f067ce7305e22eac58c78c2f4a007d2 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Mon, 15 Nov 2010 22:16:33 +0100 Subject: Indexer improvement: Only write the words index when needed This adds a simple boolean variable that tracks if new words have been added. 
When editing a page in many cases all words have already been used somewhere else or just one or two words are new. Until this change all words indexes read were always written, now only the changed ones are written. The overhead of the new boolean variable should be low. --- inc/indexer.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/inc/indexer.php b/inc/indexer.php index a07c3b89a..8174f73d0 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -252,6 +252,8 @@ function idx_getPageWords($page){ // arrive here with $words = array(wordlen => array(word => frequency)) + $word_idx_modified = false; + $index = array(); //resulting index foreach (array_keys($words) as $wlen){ $word_idx = idx_getIndex('w',$wlen); @@ -260,6 +262,7 @@ function idx_getPageWords($page){ if(!is_int($wid)){ $wid = count($word_idx); $word_idx[] = "$word\n"; + $word_idx_modified = true; } if(!isset($index[$wlen])) $index[$wlen] = array(); @@ -267,7 +270,7 @@ function idx_getPageWords($page){ } // save back word index - if(!idx_saveIndex('w',$wlen,$word_idx)){ + if($word_idx_modified && !idx_saveIndex('w',$wlen,$word_idx)){ trigger_error("Failed to write word index", E_USER_ERROR); return false; } -- cgit v1.2.3 From 4753bcc0e2fd9417e885e128e8c9ab4bfc566c32 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Mon, 15 Nov 2010 22:36:26 +0100 Subject: Indexer improvement: regex instead of arrays for lines When updating a single line that line was split into an array and in a loop over that array one entry was removed and afterwards a new one added. Tests have shown that using a regex for doing that is much faster which can be easily explained as that regex is very simple to match while a loop over an array isn't that fast. As that update function is called for every word in a page the impact of this change is significant. 
--- inc/indexer.php | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 8174f73d0..954512673 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -391,26 +391,19 @@ function idx_writeIndexLine($fh,$line,$pid,$count){ * @author Andreas Gohr */ function idx_updateIndexLine($line,$pid,$count){ - $line = trim($line); - $updated = array(); - if($line != ''){ - $parts = explode(':',$line); - // remove doc from given line - foreach($parts as $part){ - if($part == '') continue; - list($doc,$cnt) = explode('*',$part); - if($doc != $pid){ - $updated[] = $part; - } - } + if ($line == ''){ + $newLine = "\n"; + }else{ + $newLine = preg_replace('/(^|:)'.preg_quote($pid, '/').'\*\d*/', '', $line); } - - // add doc if ($count){ - $updated[] = "$pid*$count"; + if (strlen($newLine) > 1){ + return "$pid*$count:".$newLine; + }else{ + return "$pid*$count".$newLine; + } } - - return join(':',$updated)."\n"; + return $newLine; } /** -- cgit v1.2.3 From 6c528220aaf62f4ba5890483797d6661352500bb Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 16 Nov 2010 17:58:28 -0500 Subject: Repurpose io_runcmd for pipes --- inc/io.php | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/inc/io.php b/inc/io.php index 1d69dabc9..9b797ebf2 100644 --- a/inc/io.php +++ b/inc/io.php @@ -533,17 +533,20 @@ function io_rename($from,$to){ * * @author Harry Brueckner * @author Andreas Gohr - * @deprecated */ -function io_runcmd($cmd){ - $fh = popen($cmd, "r"); - if(!$fh) return false; - $ret = ''; - while (!feof($fh)) { - $ret .= fread($fh, 8192); - } - pclose($fh); - return $ret; +function io_runcmd($cmd, $input, &$output){ + $descspec = array( + 0=>array("pipe","r"), + 1=>array("pipe","w"), + 2=>array("pipe","w")); + $ph = proc_open($cmd, $descspec, $pipes); + if(!$ph) return -1; + fclose($pipes[2]); // ignore stderr + fwrite($pipes[0], $input); + fclose($pipes[0]); + $output = 
stream_get_contents($pipes[1]); + fclose($pipes[1]); + return proc_close($ph); } /** -- cgit v1.2.3 From 1c07b9e622d139fa815c955c89569f96342475fb Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 16 Nov 2010 18:09:53 -0500 Subject: Use external program to split pages into words An external tokenizer inserts extra spaces to mark words in the input text. The text is sent through STDIN and STDOUT file handles. A good choice for Chinese and Japanese is MeCab. http://sourceforge.net/projects/mecab/ With the command line 'mecab -O wakati' --- conf/dokuwiki.php | 2 ++ inc/indexer.php | 32 ++++++++++++------------- lib/plugins/config/lang/en/lang.php | 2 ++ lib/plugins/config/settings/config.metadata.php | 2 ++ 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php index 2405494e0..f10c70e58 100644 --- a/conf/dokuwiki.php +++ b/conf/dokuwiki.php @@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abor $conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard) $conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation $conf['rememberme'] = 1; //Enable/disable remember me on login +$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing +$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati'; //Set target to use when creating links - leave empty for same window $conf['target']['wiki'] = ''; diff --git a/inc/indexer.php b/inc/indexer.php index b3e10a548..1c955a99d 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -662,6 +662,7 @@ function idx_parseIndexLine(&$page_idx,$line){ * @author Andreas Gohr */ function idx_tokenizer($string,&$stopwords,$wc=false){ + global $conf; $words = array(); $wc = ($wc) ? 
'' : $wc = '\*'; @@ -670,6 +671,16 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ else $sw =& $stopwords; + if ($conf['external_tokenizer']) { + if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output)) + $string = $output; + } else { + if(preg_match('/[^0-9A-Za-z ]/u', $string)) { + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); + if(!is_null($asia)) $string = $asia; //recover from regexp failure + } + } $string = strtr($string, "\r\n\t", ' '); if(preg_match('/[^0-9A-Za-z ]/u', $string)) $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); @@ -677,24 +688,13 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ $wordlist = explode(' ', $string); foreach ($wordlist as $word) { if(preg_match('/[^0-9A-Za-z]/u', $word)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word); - if(!is_null($asia)) $word = $asia; //recover from regexp failure - - $arr = explode(' ', $word); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; - } + $word = utf8_strtolower($word); }else{ - $w = $word; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; + $word = strtolower($word); } + if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; + if(is_int(array_search("$word\n",$stopwords))) continue; + $words[] = $word; } return $words; diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php index a944d6bd7..85214bf98 100644 --- a/lib/plugins/config/lang/en/lang.php +++ b/lib/plugins/config/lang/en/lang.php @@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output'; 
$lang['renderer__core'] = '%s (dokuwiki core)'; $lang['renderer__plugin'] = '%s (plugin)'; $lang['rememberme'] = 'Allow permanent login cookies (remember me)'; +$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing'; +$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer'; $lang['rss_type'] = 'XML feed type'; $lang['rss_linkto'] = 'XML feed links to'; diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php index edba65262..331da5ab8 100644 --- a/lib/plugins/config/settings/config.metadata.php +++ b/lib/plugins/config/settings/config.metadata.php @@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff'); $meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3)); $meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml')); $meta['readdircache'] = array('numeric'); +$meta['external_tokenizer'] = array('onoff'); +$meta['tokenizer_cmd'] = array('string'); $meta['_network'] = array('fieldset'); $meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i'); -- cgit v1.2.3 From 7c2ef4e8d524fb9262c5a08831220f9fb2dc11fe Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 17 Nov 2010 17:02:31 -0500 Subject: Use a different indexer version when external tokenizer is enabled --- bin/indexer.php | 8 ++------ inc/indexer.php | 17 +++++++++++++++++ lib/exe/indexer.php | 7 ++----- lib/exe/xmlrpc.php | 4 ++-- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/bin/indexer.php b/bin/indexer.php index 48e98b571..497c6146a 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -13,10 +13,6 @@ require_once(DOKU_INC.'inc/auth.php'); require_once(DOKU_INC.'inc/cliopts.php'); session_write_close(); -// Version tag used to force rebuild on upgrade -// Need to keep in sync with lib/exe/indexer.php -if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2); - // handle options $short_opts = 'hcuq'; 
$long_opts = array('help', 'clear', 'update', 'quiet'); @@ -88,7 +84,7 @@ function _index($id){ if(!$CLEAR){ $idxtag = metaFN($id,'.indexed'); if(@file_exists($idxtag)){ - if(io_readFile($idxtag) >= INDEXER_VERSION){ + if(io_readFile($idxtag) == idx_get_version()){ $last = @filemtime(metaFN($id,'.indexed')); if($last > @filemtime(wikiFN($id))) return; } @@ -98,7 +94,7 @@ function _index($id){ _lock(); _quietecho("$id... "); idx_addPage($id); - io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION); + io_saveFile(metaFN($id,'.indexed'), idx_get_version()); _quietecho("done.\n"); _unlock(); } diff --git a/inc/indexer.php b/inc/indexer.php index 1c955a99d..4914c9fc6 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -8,6 +8,9 @@ if(!defined('DOKU_INC')) die('meh.'); +// Version tag used to force rebuild on upgrade +define('INDEXER_VERSION', 2); + // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens) if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2); @@ -42,6 +45,20 @@ define('IDX_ASIAN3','['. // Hiragana/Katakana (can be two charact ']?'); define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')'); +/** + * Version of the indexer taking into consideration the external tokenizer. + * The indexer is only compatible with data written by the same version. + * + * @author Tom N Harris + */ +function idx_get_version(){ + global $conf; + if($conf['external_tokenizer']) + return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); + else + return INDEXER_VERSION; +} + /** * Measure the length of a string. * Differs from strlen in handling of asian characters. 
diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 4a6f74ba4..55d860296 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -11,9 +11,6 @@ require_once(DOKU_INC.'inc/init.php'); session_write_close(); //close session if(!defined('NL')) define('NL',"\n"); -// Version tag used to force rebuild on upgrade -define('INDEXER_VERSION', 2); - // keep running after browser closes connection @ignore_user_abort(true); @@ -140,7 +137,7 @@ function runIndexer(){ // check if indexing needed $idxtag = metaFN($ID,'.indexed'); if(@file_exists($idxtag)){ - if(trim(io_readFile($idxtag)) == INDEXER_VERSION){ + if(trim(io_readFile($idxtag)) == idx_get_version()){ $last = @filemtime($idxtag); if($last > @filemtime(wikiFN($ID))){ print "runIndexer(): index for $ID up to date".NL; @@ -168,7 +165,7 @@ function runIndexer(){ idx_addPage($ID); // we're finished - save and free lock - io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION); + io_saveFile(metaFN($ID,'.indexed'), idx_get_version()); @rmdir($lock); print "runIndexer(): finished".NL; return true; diff --git a/lib/exe/xmlrpc.php b/lib/exe/xmlrpc.php index f06792361..410d4f6ba 100644 --- a/lib/exe/xmlrpc.php +++ b/lib/exe/xmlrpc.php @@ -1,7 +1,7 @@ Date: Thu, 18 Nov 2010 13:55:55 -0500 Subject: Restore io_runcmd, use io_exec for exec with pipes --- inc/indexer.php | 4 ++-- inc/io.php | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 4914c9fc6..32fbf4a1a 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -484,7 +484,7 @@ function idx_indexLengths(&$filter){ } else { $lengths = idx_listIndexLengths(); foreach ( $lengths as $key => $length) { - // we keep all the values equal or superior + // we keep all the values equal or superior if ((int)$length >= (int)$filter) { $idx[] = $length; } @@ -689,7 +689,7 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ $sw =& $stopwords; if ($conf['external_tokenizer']) { - if (0 == 
io_runcmd($conf['tokenizer_cmd'], $string, $output)) + if (0 == io_exec($conf['tokenizer_cmd'], $string, $output)) $string = $output; } else { if(preg_match('/[^0-9A-Za-z ]/u', $string)) { diff --git a/inc/io.php b/inc/io.php index 9b797ebf2..a0be00da3 100644 --- a/inc/io.php +++ b/inc/io.php @@ -529,12 +529,30 @@ function io_rename($from,$to){ /** - * Runs an external command and returns it's output as string + * Runs an external command and returns its output as string * * @author Harry Brueckner * @author Andreas Gohr + * @deprecated */ -function io_runcmd($cmd, $input, &$output){ +function io_runcmd($cmd){ + $fh = popen($cmd, "r"); + if(!$fh) return false; + $ret = ''; + while (!feof($fh)) { + $ret .= fread($fh, 8192); + } + pclose($fh); + return $ret; +} + +/** + * Runs an external command with input and output pipes. + * Returns the exit code from the process. + * + * @author Tom N Harris + */ +function io_exec($cmd, $input, &$output){ $descspec = array( 0=>array("pipe","r"), 1=>array("pipe","w"), -- cgit v1.2.3 From 00803e562833be06ab5a869541581314b9b84d58 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 27 Dec 2010 20:30:46 -0500 Subject: Indexer v3 Rewrite part one (unstable) The indexer functions have been converted to a class interface. Use the Doku_Indexer class to access the indexer with these public methods: addPageWords addMetaKeys deletePage tokenizer lookup lookupKey getPages histogram These functions are provided for general use: idx_get_version idx_get_indexer idx_get_stopwords idx_addPage idx_lookup idx_tokenizer These functions are still available, but are deprecated: idx_getIndex idx_indexLengths All other old idx_ functions are unsupported and have been removed. 
--- inc/indexer.php | 1288 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 786 insertions(+), 502 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index d4432026e..099b7e9fc 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -4,12 +4,13 @@ * * @license GPL 2 (http://www.gnu.org/licenses/gpl.html) * @author Andreas Gohr + * @author Tom N Harris */ if(!defined('DOKU_INC')) die('meh.'); // Version tag used to force rebuild on upgrade -define('INDEXER_VERSION', 2); +define('INDEXER_VERSION', 3); // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens) if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2); @@ -77,335 +78,827 @@ function wordlen($w){ } /** - * Write a list of strings to an index file. + * Class that encapsulates operations on the indexer database. * * @author Tom N Harris */ -function idx_saveIndex($pre, $wlen, &$idx){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen; - $fh = @fopen($fn.'.tmp','w'); - if(!$fh) return false; - fwrite($fh,join('', $idx)); - fclose($fh); - if(isset($conf['fperm'])) chmod($fn.'.tmp', $conf['fperm']); - io_rename($fn.'.tmp', $fn.'.idx'); - return true; -} - -/** - * Append a given line to an index file. - * - * @author Andreas Gohr - */ -function idx_appendIndex($pre, $wlen, $line){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen; - $fh = @fopen($fn.'.idx','a'); - if(!$fh) return false; - fwrite($fh,$line); - fclose($fh); - return true; -} +class Doku_Indexer { + + /** + * Adds the contents of a page to the fulltext index + * + * The added text replaces previous words for the same page. + * An empty value erases the page. 
+ * + * @param string $page a page name + * @param string $text the body of the page + * @return boolean the function completed successfully + * @author Tom N Harris + * @author Andreas Gohr + */ + public function addPageWords($page, $text) { + $this->_lock(); + + // load known documents + $page_idx = $this->_addIndexKey('page', '', $page); + if ($page_idx === false) { + $this->_unlock(); + return false; + } -/** - * Read the list of words in an index (if it exists). - * - * @author Tom N Harris - */ -function idx_getIndex($pre, $wlen){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; - if(!@file_exists($fn)) return array(); - return file($fn); -} + $pagewords = array(); + // get word usage in page + $words = $this->_getPageWords($text); + if ($words === false) { + $this->_unlock(); + return false; + } -/** - * Read a line ending with \n. - * Returns false on EOF. - * - * @author Tom N Harris - */ -function _freadline($fh) { - if (feof($fh)) return false; - $ln = ''; - while (($buf = fgets($fh,4096)) !== false) { - $ln .= $buf; - if (substr($buf,-1) == "\n") break; - } - if ($ln === '') return false; - if (substr($ln,-1) != "\n") $ln .= "\n"; - return $ln; -} + if (!empty($words)) { + foreach (array_keys($words) as $wlen) { + $index = $this->_getIndex('i', $wlen); + foreach ($words[$wlen] as $wid => $freq) { + $idx = ($wid<count($index)) ? $index[$wid] : ''; + $index[$wid] = $this->_updateTuple($idx, $pid, $freq); + $pagewords[] = "$wlen*$wid"; + } + if (!$this->_saveIndex('i', $wlen, $index)) { + $this->_unlock(); + return false; + } + } + } -/** - * Write a line to an index file. 
- * - * @author Tom N Harris - */ -function idx_saveIndexLine($pre, $wlen, $idx, $line){ - global $conf; - if(substr($line,-1) != "\n") $line .= "\n"; - $fn = $conf['indexdir'].'/'.$pre.$wlen; - $fh = @fopen($fn.'.tmp','w'); - if(!$fh) return false; - $ih = @fopen($fn.'.idx','r'); - if ($ih) { - $ln = -1; - while (($curline = fgets($ih)) !== false) { - if (++$ln == $idx) { - fwrite($fh, $line); - } else { - fwrite($fh, $curline); + // Remove obsolete index entries + $pageword_idx = $this->_getIndexKey('pageword', '', $pid); + if ($pageword_idx !== '') { + $oldwords = explode(':',$pageword_idx); + $delwords = array_diff($oldwords, $pagewords); + $upwords = array(); + foreach ($delwords as $word) { + if ($word != '') { + list($wlen,$wid) = explode('*', $word); + $wid = (int)$wid; + $upwords[$wlen][] = $wid; + } + } + foreach ($upwords as $wlen => $widx) { + $index = $this->_getIndex('i', $wlen); + foreach ($widx as $wid) { + $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0); + } + $this->_saveIndex('i', $wlen, $index); } } - if ($idx > $ln) { - fwrite($fh,$line); + // Save the reverse index + $pageword_idx = join(':', $pagewords); + if (!$this->_saveIndexKey('pageword', '', $pid, $pageword_idx)) { + $this->_unlock(); + return false; } - fclose($ih); - } else { - fwrite($fh,$line); - } - fclose($fh); - if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']); - io_rename($fn.'.tmp', $fn.'.idx'); - return true; -} -/** - * Read a single line from an index (if it exists). - * - * @author Tom N Harris - */ -function idx_getIndexLine($pre, $wlen, $idx){ - global $conf; - $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; - if(!@file_exists($fn)) return ''; - $fh = @fopen($fn,'r'); - if(!$fh) return ''; - $ln = -1; - while (($line = fgets($fh)) !== false) { - if (++$ln == $idx) break; + $this->_unlock(); + return true; } - fclose($fh); - return "$line"; -} -/** - * Split a page into words - * - * Returns an array of word counts, false if an error occurred. 
- * Array is keyed on the word length, then the word index. - * - * @author Andreas Gohr - * @author Christopher Smith - */ -function idx_getPageWords($page){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - if(@file_exists($swfile)){ - $stopwords = file($swfile); - }else{ - $stopwords = array(); - } + /** + * Split the words in a page and add them to the index. + * + * @author Andreas Gohr + * @author Christopher Smith + * @author Tom N Harris + */ + private function _getPageWords($text) { + global $conf; + + $tokens = $this->tokenizer($text); + $tokens = array_count_values($tokens); // count the frequency of each token + + $words = array(); + foreach ($tokens as $w=>$c) { + $l = wordlen($w); + if (isset($words[$l])){ + $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0); + }else{ + $words[$l] = array($w => $c); + } + } - $body = ''; - $data = array($page, $body); - $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); - if ($evt->advise_before()) $data[1] .= rawWiki($page); - $evt->advise_after(); - unset($evt); + // arrive here with $words = array(wordlen => array(word => frequency)) + $word_idx_modified = false; + $index = array(); //resulting index + foreach (array_keys($words) as $wlen) { + $word_idx = $this->_getIndex('w', $wlen); + foreach ($words[$wlen] as $word => $freq) { + $wid = array_search($word, $word_idx); + if ($wid === false) { + $wid = count($word_idx); + $word_idx[] = $word; + $word_idx_modified = true; + } + if (!isset($index[$wlen])) + $index[$wlen] = array(); + $index[$wlen][$wid] = $freq; + } + // save back the word index + if ($word_idx_modified && !$this->_saveIndex('w', $wlen, $word_idx)) + return false; + } - list($page,$body) = $data; + return $index; + } - $tokens = idx_tokenizer($body, $stopwords); - $tokens = array_count_values($tokens); // count the frequency of each token + /** + * Add keys to the metadata index. + * + * Adding new keys does not remove other keys for the page. 
+ * An empty value will erase the key. + * The $key parameter can be an array to add multiple keys. $value will + * not be used if $key is an array. + * + * @param string $page a page name + * @param mixed $key a key string or array of key=>value pairs + * @param mixed $value the value or list of values + * @return boolean the function completed successfully + * @author Tom N Harris + */ + public function addMetaKeys($page, $key, $value=null) { + if (!is_array($key)) { + $key = array($key => $value); + } elseif (!is_null($value)) { + // $key is array, but $value is not null + trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING); + } - // ensure the deaccented or romanised page names of internal links are added to the token array - // (this is necessary for the backlink function -- there maybe a better way!) - if ($conf['deaccent']) { - $links = p_get_metadata($page,'relation references'); + $this->_lock(); - if (!empty($links)) { - $tmp = join(' ',array_keys($links)); // make a single string - $tmp = strtr($tmp, ':', ' '); // replace namespace separator with a space - $link_tokens = array_unique(explode(' ', $tmp)); // break into tokens + // load known documents + $pid = $this->_addIndexKey('page', '', $page); + if ($pid === false) { + $this->_unlock(); + return false; + } - foreach ($link_tokens as $link_token) { - if (isset($tokens[$link_token])) continue; - $tokens[$link_token] = 1; + foreach ($key as $name => $values) { + $metaname = idx_cleanName($name); + $metaidx = $this->_getIndex($metaname, '_i'); + $metawords = $this->_getIndex($metaname, '_w'); + $addwords = false; + $update = array(); + if (!is_array($val)) $values = array($values); + foreach ($values as $val) { + $val = (string)$val; + if ($val !== "") { + $id = array_search($val, $metawords); + if ($id === false) { + $id = count($metawords); + $metawords[$id] = $val; + $addwords = true; + } + $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 1); + $update[$id] = 1; 
+ } else { + $id = array_search($val, $metawords); + if ($id !== false) { + $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 0); + $update[$id] = 0; + } + } + } + if (!empty($update)) { + if ($addwords) + $this->_saveIndex($metaname.'_w', '', $metawords); + $this->_saveIndex($metaname.'_i', '', $metaidx); + $val_idx = $this->_getIndexKey($metaname, '_p', $pid); + $val_idx = array_flip(explode(':', $val_idx)); + foreach ($update as $id => $add) { + if ($add) $val_idx[$id] = 1; + else unset($val_idx[$id]); + } + $val_idx = array_keys($val_idx); + $this->_saveIndexKey($metaname.'_p', '', $pid, $val_idx); } + unset($metaidx); + unset($metawords); } + return true; } - $words = array(); - foreach ($tokens as $w => $c) { - $l = wordlen($w); - if(isset($words[$l])){ - $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0); - }else{ - $words[$l] = array($w => $c); + /** + * Remove a page from the index + * + * Erases entries in all known indexes. + * + * @param string $page a page name + * @return boolean the function completed successfully + * @author Tom N Harris + */ + public function deletePage($page) { + } + + /** + * Split the text into words for fulltext search + * + * TODO: does this also need &$stopwords ? + * + * @param string $text plain text + * @param boolean $wc are wildcards allowed? + * @return array list of words in the text + * @author Tom N Harris + * @author Andreas Gohr + */ + public function tokenizer($text, $wc=false) { + global $conf; + $words = array(); + $wc = ($wc) ? 
'' : '\*'; + $stopwords =& idx_get_stopwords(); + + if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') { + if (0 == io_exec($conf['tokenizer_cmd'], $text, $output)) + $text = $output; + } else { + if (preg_match('/[^0-9A-Za-z ]/u', $text)) { + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text); + if (!is_null($asia)) $text = $asia; // recover from regexp falure + } + } + $text = strtr($text, "\r\n\t", ' '); + if (preg_match('/[^0-9A-Za-z ]/u', $text)) + $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc); + + $wordlist = explode(' ', $text); + foreach ($wordlist as $word) { + $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ? + utf8_strtolower($word) : strtolower($word); + if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; + if (array_search($word, $stopwords) !== false) continue; + $words[] = $word; } + return $words; } - // arrive here with $words = array(wordlen => array(word => frequency)) + /** + * Find pages in the fulltext index containing the words, + * + * The search words must be pre-tokenized, meaning only letters and + * numbers with an optional wildcard + * + * The returned array will have the original tokens as key. The values + * in the returned list is an array with the page names as keys and the + * number of times that token appeas on the page as value. 
+ * + * @param array $tokens list of words to search for + * @return array list of page names with usage counts + * @author Tom N Harris + * @author Andreas Gohr + */ + public function lookup($tokens) { + $result = array(); + $wids = $this->_getIndexWords($tokens, $result); + if (empty($wids)) return array(); + // load known words and documents + $page_idx = $this->_getIndex('page', ''); + $docs = array(); + foreach (array_keys($wids) as $wlen) { + $wids[$wlen] = array_unique($wids[$wlen]); + $index = $this->_getIndex('i', $wlen); + foreach($wids[$wlen] as $ixid) { + if ($ixid < count($index)) + $docs["$wlen*$ixid"] = $this->_parseTuples($page_idx, $index[$ixid]); + } + } + // merge found pages into final result array + $final = array(); + foreach ($result as $word => $res) { + $final[$word] = array(); + foreach ($res as $wid) { + $hits = &$docs[$wid]; + foreach ($hits as $hitkey => $hitcnt) { + // make sure the document still exists + if (!page_exists($hitkey, '', false)) continue; + if (!isset($final[$word][$hitkey])) + $final[$word][$hitkey] = $hitcnt; + else + $final[$word][$hitkey] += $hitcnt; + } + } + } + return $final; + } - $word_idx_modified = false; + /** + * Find pages containing a metadata key. + * + * The metadata values are compared as case-sensitive strings. 
Pass a + * callback function that returns true or false to use a different + * comparison function + * + * @param string $key name of the metadata key to look for + * @param string $value search term to look for + * @param callback $func comparison function + * @return array list with page names + * @author Tom N Harris + */ + public function lookupKey($key, $value, $func=null) { + } - $index = array(); //resulting index - foreach (array_keys($words) as $wlen){ - $word_idx = idx_getIndex('w',$wlen); - foreach ($words[$wlen] as $word => $freq) { - $wid = array_search("$word\n",$word_idx); - if(!is_int($wid)){ - $wid = count($word_idx); - $word_idx[] = "$word\n"; - $word_idx_modified = true; + /** + * Find the index ID of each search term. + * + * The query terms should only contain valid characters, with a '*' at + * either the beginning or end of the word (or both). + * The $result parameter can be used to merge the index locations with + * the appropriate query term. + * + * @param array $words The query terms. + * @param arrayref $result Set to word => array("length*id" ...) + * @return array Set to length => array(id ...) 
+ * @author Tom N Harris + */ + private function _getIndexWords($words, &$result) { + $tokens = array(); + $tokenlength = array(); + $tokenwild = array(); + foreach ($words as $word) { + $result[$word] = array(); + $caret = false; + $dollar = false; + $xword = $word; + $wlen = wordlen($word); + + // check for wildcards + if (substr($xword, 0, 1) == '*') { + $xword = substr($xword, 1); + $caret = true; + $wlen -= 1; + } + if (substr($xword, -1, 1) == '*') { + $xword = substr($xword, 0, -1); + $dollar = true; + $wlen -= 1; + } + if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword)) + continue; + if (!isset($tokens[$xword])) + $tokenlength[$wlen][] = $xword; + if ($caret || $dollar) { + $re = preg_quote($xword, '/'); + if ($caret) $re = '^'.$re; + if ($dollar) $re = $re.'$'; + $tokens[$xword][] = array($word, '/'.$re.'/'); + if (!isset($tokenwild[$xword])) + $tokenwild[$xword] = $wlen; + } else { + $tokens[$xword][] = array($word, null); } - if(!isset($index[$wlen])) - $index[$wlen] = array(); - $index[$wlen][$wid] = $freq; } + asort($tokenwild); + // $tokens = array( base word => array( [ query term , regexp ] ... ) ... ) + // $tokenlength = array( base word length => base word ... ) + // $tokenwild = array( base word => base word length ... ) + $length_filter = empty($tokenwild) ? 
$tokenlength : min(array_keys($tokenlength)); + $indexes_known = $this->_indexLengths($length_filter); + if (!empty($tokenwild)) sort($indexes_known); + // get word IDs + $wids = array(); + foreach ($indexes_known as $ixlen) { + $word_idx = $this->_getIndex('w', $ixlen); + // handle exact search + if (isset($tokenlength[$ixlen])) { + foreach ($tokenlength[$ixlen] as $xword) { + $wid = array_search($xword, $word_idx); + if ($wid !== false) { + $wids[$ixlen][] = $wid; + foreach ($tokens[$xword] as $w) + $result[$w[0]][] = "$ixlen*$wid"; + } + } + } + // handle wildcard search + foreach ($tokenwild as $xword => $wlen) { + if ($wlen >= $ixlen) break; + foreach ($tokens[$xword] as $w) { + if (is_null($w[1])) continue; + foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) { + $wids[$ixlen][] = $wid; + $result[$w[0]][] = "$ixlen*$wid"; + } + } + } + } + return $wids; + } - // save back word index - if($word_idx_modified && !idx_saveIndex('w',$wlen,$word_idx)){ - trigger_error("Failed to write word index", E_USER_ERROR); - return false; + /** + * Return a list of all pages + * + * @param string $key list only pages containing the metadata key (optional) + * @return array list of page names + * @author Tom N Harris + */ + public function getPages($key=null) { + $page_idx = $this->_getIndex('page', ''); + if (is_null($key)) return $page_idx; + } + + /** + * Return a list of words sorted by number of times used + * + * @param int $min bottom frequency threshold + * @param int $max upper frequency limit. No limit if $max<$min + * @param string $key metadata key to list. Uses the fulltext index if not given + * @return array list of words as the keys and frequency as values + * @author Tom N Harris + */ + public function histogram($min=1, $max=0, $key=null) { + } + + /** + * Lock the indexer. 
+ * + * @author Tom N Harris + */ + private function _lock() { + global $conf; + $status = true; + $lock = $conf['lockdir'].'/_indexer.lock'; + while (!@mkdir($lock, $conf['dmode'])) { + usleep(50); + if (time() - @filemtime($lock) > 60*5) { + // looks like a stale lock, remove it + @rmdir($lock); + $status = "stale lock removed"; + } else { + return false; + } } + if ($conf['dperm']) + chmod($lock, $conf['dperm']); + return $status; } - return $index; -} + /** + * Release the indexer lock. + * + * @author Tom N Harris + */ + private function _unlock() { + global $conf; + @rmdir($conf['lockdir'].'/_indexer.lock'); + return true; + } -/** - * Adds/updates the search for the given page - * - * This is the core function of the indexer which does most - * of the work. This function needs to be called with proper - * locking! - * - * @author Andreas Gohr - */ -function idx_addPage($page){ - global $conf; + /** + * Retrieve the entire index. + * + * @author Tom N Harris + */ + private function _getIndex($idx, $suffix) { + global $conf; + $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; + if (!@file_exists($fn, FILE_IGNORE_NEW_LINES)) return array(); + return file($fn); + } - // load known documents - $page_idx = idx_getIndex('page',''); + /** + * Replace the contents of the index with an array. 
+ * + * @author Tom N Harris + */ + private function _saveIndex($idx, $suffix, &$lines) { + global $conf; + $fn = $conf['indexdir'].'/'.$idx.$suffix; + $fh = @fopen($fn.'.tmp', 'w'); + if (!$fh) return false; + fwrite($fh, join("\n", $lines)); + fclose($fh); + if (isset($conf['fperm'])) + chmod($fn.'.tmp', $conf['fperm']); + io_rename($fn.'.tmp', $fn.'.idx'); + if ($suffix !== '') + $this->_cacheIndexDir($idx, $suffix, empty($lines)); + return true; + } - // get page id (this is the linenumber in page.idx) - $pid = array_search("$page\n",$page_idx); - if(!is_int($pid)){ - $pid = count($page_idx); - // page was new - write back - if (!idx_appendIndex('page','',"$page\n")){ - trigger_error("Failed to write page index", E_USER_ERROR); - return false; + /** + * Retrieve a line from the index. + * + * @author Tom N Harris + */ + private function _getIndexKey($idx, $suffix, $id) { + global $conf; + $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; + if (!@file_exists($fn)) return ''; + $fh = @fopen($fn, 'r'); + if (!$fh) return ''; + $ln = -1; + while (($line = fgets($fh)) !== false) { + if (++$ln == $id) break; } + fclose($fh); + return rtrim((string)$line); } - unset($page_idx); // free memory - - idx_saveIndexLine('title', '', $pid, p_get_first_heading($page, false)); - - $pagewords = array(); - // get word usage in page - $words = idx_getPageWords($page); - if($words === false) return false; - - if(!empty($words)) { - foreach(array_keys($words) as $wlen){ - $index = idx_getIndex('i',$wlen); - foreach($words[$wlen] as $wid => $freq){ - if($wid + */ + private function _saveIndexKey($idx, $suffix, $id, $line) { + global $conf; + if (substr($line, -1) != "\n") + $line .= "\n"; + $fn = $conf['indexdir'].'/'.$idx.$suffix; + $fh = @fopen($fn.'.tmp', 'w'); + if (!fh) return false; + $ih = @fopen($fn.'.idx', 'r'); + if ($ih) { + $ln = -1; + while (($curline = fgets($ih)) !== false) { + fwrite($fh, (++$ln == $id) ? 
$line : $curline); } - // save back word index - if(!idx_saveIndex('i',$wlen,$index)){ - trigger_error("Failed to write index", E_USER_ERROR); + if ($id > $ln) + fwrite($fh, $line); + fclose($ih); + } else { + fwrite($fh, $line); + } + fclose($fh); + if (isset($conf['fperm'])) + chmod($fn.'.tmp', $conf['fperm']); + io_rename($fn.'.tmp', $fn.'.idx'); + if ($suffix !== '') + $this->_cacheIndexDir($idx, $suffix); + return true; + } + + /** + * Retrieve or insert a value in the index. + * + * @author Tom N Harris + */ + private function _addIndexKey($idx, $suffix, $value) { + $index = $this->_getIndex($idx, $suffix); + $id = array_search($value, $index); + if ($id === false) { + $id = count($index); + $index[$id] = $value; + if (!$this->_saveIndex($idx, $suffix, $index)) { + trigger_error("Failed to write $idx index", E_USER_ERROR); return false; } } + return $id; + } + + private function _cacheIndexDir($idx, $suffix, $delete=false) { + global $conf; + if ($idx == 'i') + $cachename = $conf['indexdir'].'/lengths'; + else + $cachename = $conf['indexdir'].'/'.$idx.'lengths'; + $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if ($lengths === false) $lengths = array(); + $old = array_search((string)$suffix, $lengths); + if (empty($lines)) { + if ($old === false) return; + unset($lengths[$old]); + } else { + if ($old !== false) return; + $lengths[] = $suffix; + sort($lengths); + } + $fh = @fopen($cachename.'.tmp', 'w'); + if (!$fh) { + trigger_error("Failed to write index cache", E_USER_ERROR); + return; + } + @fwrite($fh, implode("\n", $lengths)); + @fclose($fh); + if (isset($conf['fperm'])) + chmod($cachename.'.tmp', $conf['fperm']); + io_rename($cachename.'.tmp', $cachename.'.idx'); + } + + /** + * Get the list of lengths indexed in the wiki. + * + * Read the index directory or a cache file and returns + * a sorted array of lengths of the words used in the wiki. 
+ * + * @author YoBoY + */ + private function _listIndexLengths() { + global $conf; + $cachename = $conf['indexdir'].'/lengths'; + clearstatcache(); + if (@file_exists($cachename.'.idx')) { + $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if ($lengths !== false) { + $idx = array(); + foreach ($lengths as $length) + $idx[] = (int)$length; + return $idx; + } + } + + $dir = @opendir($conf['indexdir']); + if ($dir === false) + return array(); + $lengths[] = array(); + while (($f = readdir($dir)) !== false) { + if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') { + $i = substr($f, 1, -4); + if (is_numeric($i)) + $lengths[] = (int)$i; + } + } + closedir($dir); + sort($lengths); + // save this in a file + $fh = @fopen($cachename.'.tmp', 'w'); + if (!$fh) { + trigger_error("Failed to write index cache", E_USER_ERROR); + return; + } + @fwrite($fh, implode("\n", $lengths)); + @fclose($fh); + if (isset($conf['fperm'])) + chmod($cachename.'.tmp', $conf['fperm']); + io_rename($cachename.'.tmp', $cachename.'.idx'); + + return $lengths; } - // Remove obsolete index entries - $pageword_idx = trim(idx_getIndexLine('pageword','',$pid)); - if ($pageword_idx !== '') { - $oldwords = explode(':',$pageword_idx); - $delwords = array_diff($oldwords, $pagewords); - $upwords = array(); - foreach ($delwords as $word) { - if($word=='') continue; - list($wlen,$wid) = explode('*',$word); - $wid = (int)$wid; - $upwords[$wlen][] = $wid; - } - foreach ($upwords as $wlen => $widx) { - $index = idx_getIndex('i',$wlen); - foreach ($widx as $wid) { - $index[$wid] = idx_updateIndexLine($index[$wid],$pid,0); + /** + * Get the word lengths that have been indexed. + * + * Reads the index directory and returns an array of lengths + * that there are indices for. 
+ * + * @author YoBoY + */ + private function _indexLengths($filter) { + global $conf; + $idx = array(); + if (is_array($filter)) { + // testing if index files exist only + $path = $conf['indexdir']."/i"; + foreach ($filter as $key => $value) { + if (@file_exists($path.$key.'.idx')) + $idx[] = $key; + } + } else { + $lengths = idx_listIndexLengths(); + foreach ($lengths as $key => $length) { + // keep all the values equal or superior + if ((int)$length >= (int)$filter) + $idx[] = $length; } - idx_saveIndex('i',$wlen,$index); } + return $idx; } - // Save the reverse index - $pageword_idx = join(':',$pagewords)."\n"; - if(!idx_saveIndexLine('pageword','',$pid,$pageword_idx)){ - trigger_error("Failed to write word index", E_USER_ERROR); - return false; + + /** + * Insert or replace a tuple in a line. + * + * @author Tom N Harris + */ + private function _updateTuple($line, $id, $count) { + $newLine = $line; + if ($newLine !== '') + $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine); + $newLine = trim($newLine, ':'); + if ($count) { + if ($strlen($newLine) > 0) + return "$id*$count:".$newLine; + else + return "$id*$count".$newLine; + } + return $newLine; } - return true; + /** + * Split a line into an array of tuples. + * + * @author Tom N Harris + * @author Andreas Gohr + */ + private function _parseTuples(&$keys, $line) { + $result = array(); + if ($line == '') return $result; + $parts = explode(':', $line); + foreach ($parts as $tuple) { + if ($tuple == '') continue; + list($key, $cnt) = explode('*', $tuple); + if (!$cnd) continue; + $key = $keys[$key]; + if (!$key) continue; + $result[$key] = $cnt; + } + return $result; + } } /** - * Write a new index line to the filehandle - * - * This function writes an line for the index file to the - * given filehandle. It removes the given document from - * the given line and readds it when $count is >0. + * Create an instance of the indexer. 
* - * @deprecated - see idx_updateIndexLine - * @author Andreas Gohr + * @return object a Doku_Indexer + * @author Tom N Harris */ -function idx_writeIndexLine($fh,$line,$pid,$count){ - fwrite($fh,idx_updateIndexLine($line,$pid,$count)); +function & idx_get_indexer() { + static $Indexer = null; + if (is_null($Indexer)) { + $Indexer = new Doku_Indexer(); + } + return $Indexer; } /** - * Modify an index line with new information - * - * This returns a line of the index. It removes the - * given document from the line and readds it if - * $count is >0. + * Returns words that will be ignored. * + * @return array list of stop words * @author Tom N Harris - * @author Andreas Gohr */ -function idx_updateIndexLine($line,$pid,$count){ - if ($line == ''){ - $newLine = "\n"; - }else{ - $newLine = preg_replace('/(^|:)'.preg_quote($pid, '/').'\*\d*/', '', $line); - } - if ($count){ - if (strlen($newLine) > 1){ - return "$pid*$count:".$newLine; +function & idx_get_stopwords() { + static $stopwords = null; + if (is_null($stopwords)) { + global $conf; + $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; + if(@file_exists($swfile)){ + $stopwords = file($swfile, FILE_IGNORE_NEW_LINES); }else{ - return "$pid*$count".$newLine; + $stopwords = array(); } } - return $newLine; + return $stopwords; +} + +/** + * Adds/updates the search index for the given page + * + * Locking is handled internally. + * + * @param string $page name of the page to index + * @return boolean the function completed successfully + * @author Tom N Harris + */ +function idx_addPage($page) { + $body = ''; + $data = array($page, $body); + $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); + if ($evt->advise_before()) $data[1] = $data[1] . " " . 
rawWiki($page); + $evt->advise_after(); + unset($evt); + list($page,$body) = $data; + + $Indexer =& idx_get_indexer(); + return $Indexer->addPageWords($page, $body); +} + +/** + * Find tokens in the fulltext index + * + * Takes an array of words and will return a list of matching + * pages for each one. + * + * Important: No ACL checking is done here! All results are + * returned, regardless of permissions + * + * @param array $words list of words to search for + * @return array list of pages found, associated with the search terms + */ +function idx_lookup($words) { + $Indexer =& idx_get_indexer(); + return $Indexer->lookup($words); +} + +/** + * Split a string into tokens + * + */ +function idx_tokenizer($string, $wc=false) { + $Indexer =& idx_get_indexer(); + return $Indexer->tokenizer($string, $wc); } +/* For compatibility */ + /** - * Get the list of lenghts indexed in the wiki + * Read the list of words in an index (if it exists). + * + * @author Tom N Harris + */ +function idx_getIndex($idx, $suffix) { + global $conf; + $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; + if (!@file_exists($fn)) return array(); + return file($fn); +} + +/** + * Get the list of lengths indexed in the wiki. * * Read the index directory or a cache file and returns * a sorted array of lengths of the words used in the wiki. 
@@ -419,10 +912,11 @@ function idx_listIndexLengths() { $docache = false; } else { clearstatcache(); - if (@file_exists($conf['indexdir'].'/lengths.idx') and (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) { - if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) ) !== false) { + if (@file_exists($conf['indexdir'].'/lengths.idx') + && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) { + if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) !== false) { $idx = array(); - foreach ( $lengths as $length) { + foreach ($lengths as $length) { $idx[] = (int)$length; } return $idx; @@ -431,24 +925,24 @@ function idx_listIndexLengths() { $docache = true; } - if ($conf['readdircache'] == 0 or $docache ) { + if ($conf['readdircache'] == 0 || $docache) { $dir = @opendir($conf['indexdir']); - if($dir===false) + if ($dir === false) return array(); $idx[] = array(); while (($f = readdir($dir)) !== false) { - if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){ - $i = substr($f,1,-4); + if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') { + $i = substr($f, 1, -4); if (is_numeric($i)) $idx[] = (int)$i; } } closedir($dir); sort($idx); - // we save this in a file. 
- if ($docache === true) { - $handle = @fopen($conf['indexdir'].'/lengths.idx','w'); - @fwrite($handle, implode("\n",$idx)); + // save this in a file + if ($docache) { + $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w'); + @fwrite($handle, implode("\n", $idx)); @fclose($handle); } return $idx; @@ -465,250 +959,40 @@ function idx_listIndexLengths() { * * @author YoBoY */ -function idx_indexLengths(&$filter){ +function idx_indexLengths($filter) { global $conf; $idx = array(); - if (is_array($filter)){ - // testing if index files exists only + if (is_array($filter)) { + // testing if index files exist only + $path = $conf['indexdir']."/i"; foreach ($filter as $key => $value) { - if (@file_exists($conf['indexdir']."/i$key.idx")) { + if (@file_exists($path.$key.'.idx')) $idx[] = $key; - } } } else { $lengths = idx_listIndexLengths(); - foreach ( $lengths as $key => $length) { - // we keep all the values equal or superior - if ((int)$length >= (int)$filter) { + foreach ($lengths as $key => $length) { + // keep all the values equal or superior + if ((int)$length >= (int)$filter) $idx[] = $length; - } } } return $idx; } /** - * Find the the index number of each search term. + * Clean a name of a key for use as a file name. * - * This will group together words that appear in the same index. - * So it should perform better, because it only opens each index once. - * Actually, it's not that great. (in my experience) Probably because of the disk cache. - * And the sorted function does more work, making it slightly slower in some cases. - * - * @param array $words The query terms. Words should only contain valid characters, - * with a '*' at either the beginning or end of the word (or both) - * @param arrayref $result Set to word => array("length*id" ...), use this to merge the - * index locations with the appropriate query term. - * @return array Set to length => array(id ...) 
+ * Romanizes non-latin characters, then strips away anything that's + * not a letter, number, or underscore. * * @author Tom N Harris */ -function idx_getIndexWordsSorted($words,&$result){ - // parse and sort tokens - $tokens = array(); - $tokenlength = array(); - $tokenwild = array(); - foreach($words as $word){ - $result[$word] = array(); - $wild = 0; - $xword = $word; - $wlen = wordlen($word); - - // check for wildcards - if(substr($xword,0,1) == '*'){ - $xword = substr($xword,1); - $wild |= 1; - $wlen -= 1; - } - if(substr($xword,-1,1) == '*'){ - $xword = substr($xword,0,-1); - $wild |= 2; - $wlen -= 1; - } - if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue; - if(!isset($tokens[$xword])){ - $tokenlength[$wlen][] = $xword; - } - if($wild){ - $ptn = preg_quote($xword,'/'); - if(($wild&1) == 0) $ptn = '^'.$ptn; - if(($wild&2) == 0) $ptn = $ptn.'$'; - $tokens[$xword][] = array($word, '/'.$ptn.'/'); - if(!isset($tokenwild[$xword])) $tokenwild[$xword] = $wlen; - }else - $tokens[$xword][] = array($word, null); - } - asort($tokenwild); - // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... ) - // $tokenlength = array( base word length => base word ... ) - // $tokenwild = array( base word => base word length ... ) - - $length_filter = empty($tokenwild) ? 
$tokenlength : min(array_keys($tokenlength)); - $indexes_known = idx_indexLengths($length_filter); - if(!empty($tokenwild)) sort($indexes_known); - // get word IDs - $wids = array(); - foreach($indexes_known as $ixlen){ - $word_idx = idx_getIndex('w',$ixlen); - // handle exact search - if(isset($tokenlength[$ixlen])){ - foreach($tokenlength[$ixlen] as $xword){ - $wid = array_search("$xword\n",$word_idx); - if(is_int($wid)){ - $wids[$ixlen][] = $wid; - foreach($tokens[$xword] as $w) - $result[$w[0]][] = "$ixlen*$wid"; - } - } - } - // handle wildcard search - foreach($tokenwild as $xword => $wlen){ - if($wlen >= $ixlen) break; - foreach($tokens[$xword] as $w){ - if(is_null($w[1])) continue; - foreach(array_keys(preg_grep($w[1],$word_idx)) as $wid){ - $wids[$ixlen][] = $wid; - $result[$w[0]][] = "$ixlen*$wid"; - } - } - } - } - return $wids; -} - -/** - * Lookup words in index - * - * Takes an array of word and will return a list of matching - * documents for each one. - * - * Important: No ACL checking is done here! 
All results are - * returned, regardless of permissions - * - * @author Andreas Gohr - */ -function idx_lookup($words){ - global $conf; - - $result = array(); - - $wids = idx_getIndexWordsSorted($words, $result); - if(empty($wids)) return array(); - - // load known words and documents - $page_idx = idx_getIndex('page',''); - - $docs = array(); // hold docs found - foreach(array_keys($wids) as $wlen){ - $wids[$wlen] = array_unique($wids[$wlen]); - $index = idx_getIndex('i',$wlen); - foreach($wids[$wlen] as $ixid){ - if($ixid < count($index)) - $docs["$wlen*$ixid"] = idx_parseIndexLine($page_idx,$index[$ixid]); - } - } - - // merge found pages into final result array - $final = array(); - foreach($result as $word => $res){ - $final[$word] = array(); - foreach($res as $wid){ - $hits = &$docs[$wid]; - foreach ($hits as $hitkey => $hitcnt) { - if (!isset($final[$word][$hitkey])) { - $final[$word][$hitkey] = $hitcnt; - } else { - $final[$word][$hitkey] += $hitcnt; - } - } - } - } - return $final; -} - -/** - * Returns a list of documents and counts from a index line - * - * It omits docs with a count of 0 and pages that no longer - * exist. - * - * @param array $page_idx The list of known pages - * @param string $line A line from the main index - * @author Andreas Gohr - */ -function idx_parseIndexLine(&$page_idx,$line){ - $result = array(); - - $line = trim($line); - if($line == '') return $result; - - $parts = explode(':',$line); - foreach($parts as $part){ - if($part == '') continue; - list($doc,$cnt) = explode('*',$part); - if(!$cnt) continue; - $doc = trim($page_idx[$doc]); - if(!$doc) continue; - // make sure the document still exists - if(!page_exists($doc,'',false)) continue; - - $result[$doc] = $cnt; - } - return $result; -} - -/** - * Tokenizes a string into an array of search words - * - * Uses the same algorithm as idx_getPageWords() - * Takes an arbitrarily complex string and returns a list of words - * suitable for indexing. 
The string may include spaces and line - * breaks - * - * @param string $string the query as given by the user - * @param arrayref $stopwords array of stopwords - * @param boolean $wc are wildcards allowed? - * @return array list of indexable words - * @author Tom N Harris - * @author Andreas Gohr - */ -function idx_tokenizer($string,&$stopwords,$wc=false){ - global $conf; - $words = array(); - $wc = ($wc) ? '' : $wc = '\*'; - - if (!$stopwords) - $sw = array(); - else - $sw =& $stopwords; - - if ($conf['external_tokenizer']) { - if (0 == io_exec($conf['tokenizer_cmd'], $string, $output)) - $string = $output; - } else { - if(preg_match('/[^0-9A-Za-z ]/u', $string)) { - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); - if(!is_null($asia)) $string = $asia; //recover from regexp failure - } - } - $string = strtr($string, "\r\n\t", ' '); - if(preg_match('/[^0-9A-Za-z ]/u', $string)) - $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); - - $wordlist = explode(' ', $string); - foreach ($wordlist as $word) { - if(preg_match('/[^0-9A-Za-z]/u', $word)){ - $word = utf8_strtolower($word); - }else{ - $word = strtolower($word); - } - if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; - if(is_int(array_search("$word\n",$stopwords))) continue; - $words[] = $word; - } - - return $words; +function idx_cleanName($name) { + $name = utf8_romanize(trim((string)$name)); + $name = preg_replace('#[ \./\\:-]+#', '_', $name); + $name = preg_replace('/[^A-Za-z0-9_]/', '', $name); + return strtolower($name); } -//Setup VIM: ex: et ts=4 enc=utf-8 : +//Setup VIM: ex: et ts=4 : -- cgit v1.2.3 From 9b41be2446ea725a496f34b28ac4db84bece57c9 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 29 Dec 2010 03:50:05 -0500 Subject: Indexer v3 Rewrite part two, update uses of indexer --- bin/indexer.php | 39 +++++++++++++++++++++++----- inc/Sitemapper.php | 2 +- inc/fulltext.php | 73 
+++++++++++++++++++++++------------------------------ inc/indexer.php | 54 +++++++++++++++++++++++++++++---------- inc/init.php | 2 ++ lib/exe/indexer.php | 35 +------------------------ lib/exe/xmlrpc.php | 27 +++----------------- 7 files changed, 111 insertions(+), 121 deletions(-) diff --git a/bin/indexer.php b/bin/indexer.php index 497c6146a..0d523df6e 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -24,6 +24,7 @@ if ( $OPTS->isError() ) { } $CLEAR = false; $QUIET = false; +$INDEXER = null; foreach ($OPTS->options as $key => $val) { switch ($key) { case 'h': @@ -66,6 +67,9 @@ function _usage() { function _update(){ global $conf; + global $INDEXER; + + $INDEXER = idx_get_indexer(); $data = array(); _quietecho("Searching pages... "); @@ -78,25 +82,47 @@ function _update(){ } function _index($id){ + global $INDEXER; global $CLEAR; + global $QUIET; // if not cleared only update changed and new files if(!$CLEAR){ $idxtag = metaFN($id,'.indexed'); if(@file_exists($idxtag)){ if(io_readFile($idxtag) == idx_get_version()){ - $last = @filemtime(metaFN($id,'.indexed')); + $last = @filemtime($idxtag); if($last > @filemtime(wikiFN($id))) return; } } } - _lock(); _quietecho("$id... "); - idx_addPage($id); - io_saveFile(metaFN($id,'.indexed'), idx_get_version()); + $body = ''; + $data = array($id, $body); + $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); + if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($id); + $evt->advise_after(); + unset($evt); + list($id,$body) = $data; + $said = false; + while(true) { + $result = $INDEXER->addPageWords($id, $body); + if ($result == "locked") { + if($said){ + _quietecho("."); + }else{ + _quietecho("Waiting for lockfile (max. 5 min)"); + $said = true; + } + sleep(15); + } else { + break; + } + } + if ($result) + io_saveFile(metaFN($id,'.indexed'), idx_get_version()); _quietecho("done.\n"); - _unlock(); } /** @@ -141,7 +167,7 @@ function _clearindex(){ _lock(); _quietecho("Clearing index... 
"); io_saveFile($conf['indexdir'].'/page.idx',''); - io_saveFile($conf['indexdir'].'/title.idx',''); + //io_saveFile($conf['indexdir'].'/title.idx',''); $dir = @opendir($conf['indexdir']); if($dir!==false){ while(($f = readdir($dir)) !== false){ @@ -150,6 +176,7 @@ function _clearindex(){ @unlink($conf['indexdir']."/$f"); } } + @unlink($conf['indexdir'].'/lengths.idx'); _quietecho("done.\n"); _unlock(); } diff --git a/inc/Sitemapper.php b/inc/Sitemapper.php index 47a3fedb5..bbe1caf26 100644 --- a/inc/Sitemapper.php +++ b/inc/Sitemapper.php @@ -45,7 +45,7 @@ class Sitemapper { dbglog("Sitemapper::generate(): using $sitemap"); // FIXME: Only in debug mode - $pages = idx_getIndex('page', ''); + $pages = idx_get_indexer()->getPages(); dbglog('Sitemapper::generate(): creating sitemap using '.count($pages).' pages'); $items = array(); diff --git a/inc/fulltext.php b/inc/fulltext.php index 7ace3a724..0411b9f99 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){ * @author Kazutaka Miyasaka */ function _ft_pageSearch(&$data) { + $Indexer = idx_get_indexer(); + // parse the given query - $q = ft_queryParser($data['query']); + $q = ft_queryParser($Indexer, $data['query']); $data['highlight'] = $q['highlight']; if (empty($q['parsed_ary'])) return array(); // lookup all words found in the query - $lookup = idx_lookup($q['words']); + $lookup = $Indexer->lookup($q['words']); // get all pages in this dokuwiki site (!: includes nonexistent pages) $pages_all = array(); - foreach (idx_getIndex('page', '') as $id) { - $pages_all[trim($id)] = 0; // base: 0 hit + foreach ($Indexer->getPages() as $id) { + $pages_all[$id] = 0; // base: 0 hit } // process the query @@ -126,15 +128,12 @@ function _ft_pageSearch(&$data) { * evaluates the instructions of the found pages */ function ft_backlinks($id){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? 
file($swfile) : array(); - $result = array(); // quick lookup of the pagename + // FIXME use metadata key lookup $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .) + $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) $docs = array_keys(ft_resultCombine(array_values($matches))); $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages if(!count($docs)) return $result; @@ -168,17 +167,14 @@ function ft_backlinks($id){ * Aborts after $max found results */ function ft_mediause($id,$max){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - if(!$max) $max = 1; // need to find at least one $result = array(); // quick lookup of the mediafile + // FIXME use metadata key lookup $media = noNS($id); - $matches = idx_lookup(idx_tokenizer($media,$stopwords)); + $matches = idx_lookup(idx_tokenizer($media)); $docs = array_keys(ft_resultCombine(array_values($matches))); if(!count($docs)) return $result; @@ -229,7 +225,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){ } function _ft_pageLookup(&$data){ - global $conf; // split out original parameters $id = $data['id']; if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) { @@ -239,29 +234,27 @@ function _ft_pageLookup(&$data){ $in_ns = $data['in_ns']; $in_title = $data['in_title']; + $cleaned = cleanID($id); - $pages = array_map('rtrim', idx_getIndex('page', '')); - $titles = array_map('rtrim', idx_getIndex('title', '')); - // check for corrupt title index #FS2076 - if(count($pages) != count($titles)){ - $titles = array_fill(0,count($pages),''); - @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php - } - $pages = array_combine($pages, $titles); + $Indexer = idx_get_indexer(); + $page_idx = $Indexer->getPages(); - $cleaned = cleanID($id); + $pages = array(); if ($id !== '' && $cleaned !== 
'') { - foreach ($pages as $p_id => $p_title) { - if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) && - (!$in_title || (stripos($p_title, $id) === false)) ) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } + //if ($in_title) + // $titles = $Indexer->lookupKey('title', "*$id*"); } if (isset($ns)) { - foreach (array_keys($pages) as $p_id) { - if (strpos($p_id, $ns) !== 0) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if (strpos($p_id, $ns) === 0) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } } @@ -494,11 +487,7 @@ function ft_resultComplement($args) { * @author Andreas Gohr * @author Kazutaka Miyasaka */ -function ft_queryParser($query){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - +function ft_queryParser($Indexer, $query){ /** * parse a search query and transform it into intermediate representation * @@ -544,7 +533,7 @@ function ft_queryParser($query){ if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { // phrase-include and phrase-exclude $not = $matches[1] ? 
'NOT' : ''; - $parsed = $not.ft_termParser($matches[2], $stopwords, false, true); + $parsed = $not.ft_termParser($Indexer, $matches[2], false, true); } else { // fix incomplete phrase $term = str_replace('"', ' ', $term); @@ -591,10 +580,10 @@ function ft_queryParser($query){ $parsed .= '(N+:'.$matches[1].')'; } elseif (preg_match('/^-(.+)$/', $token, $matches)) { // word-exclude - $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')'; + $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')'; } else { // word-include - $parsed .= ft_termParser($token, $stopwords); + $parsed .= ft_termParser($Indexer, $token); } } } @@ -728,18 +717,18 @@ function ft_queryParser($query){ * * @author Kazutaka Miyasaka */ -function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) { +function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) { $parsed = ''; if ($consider_asian) { // successive asian characters need to be searched as a phrase $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); foreach ($words as $word) { if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true; - $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode); + $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode); } } else { $term_noparen = str_replace(array('(', ')'), ' ', $term); - $words = idx_tokenizer($term_noparen, $stopwords, true); + $words = $Indexer->tokenizer($term_noparen, true); // W_: no need to highlight if (empty($words)) { diff --git a/inc/indexer.php b/inc/indexer.php index 099b7e9fc..a61f3772a 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -97,7 +97,8 @@ class Doku_Indexer { * @author Andreas Gohr */ public function addPageWords($page, $text) { - $this->_lock(); + if (!$this->_lock()) + return "locked"; // load known documents $page_idx = $this->_addIndexKey('page', '', $page); @@ -348,12 +349,12 @@ class Doku_Indexer { * in the returned list 
is an array with the page names as keys and the * number of times that token appeas on the page as value. * - * @param array $tokens list of words to search for + * @param arrayref $tokens list of words to search for * @return array list of page names with usage counts * @author Tom N Harris * @author Andreas Gohr */ - public function lookup($tokens) { + public function lookup(&$tokens) { $result = array(); $wids = $this->_getIndexWords($tokens, $result); if (empty($wids)) return array(); @@ -397,10 +398,11 @@ class Doku_Indexer { * @param string $key name of the metadata key to look for * @param string $value search term to look for * @param callback $func comparison function - * @return array list with page names + * @return array list with page names, keys are query values if more than one given * @author Tom N Harris */ public function lookupKey($key, $value, $func=null) { + return array(); } /** @@ -411,12 +413,12 @@ class Doku_Indexer { * The $result parameter can be used to merge the index locations with * the appropriate query term. * - * @param array $words The query terms. + * @param arrayref $words The query terms. * @param arrayref $result Set to word => array("length*id" ...) * @return array Set to length => array(id ...) * @author Tom N Harris */ - private function _getIndexWords($words, &$result) { + private function _getIndexWords(&$words, &$result) { $tokens = array(); $tokenlength = array(); $tokenwild = array(); @@ -807,7 +809,7 @@ class Doku_Indexer { * @return object a Doku_Indexer * @author Tom N Harris */ -function & idx_get_indexer() { +function idx_get_indexer() { static $Indexer = null; if (is_null($Indexer)) { $Indexer = new Doku_Indexer(); @@ -841,10 +843,23 @@ function & idx_get_stopwords() { * Locking is handled internally. 
* * @param string $page name of the page to index + * @param boolean $verbose print status messages * @return boolean the function completed successfully * @author Tom N Harris */ -function idx_addPage($page) { +function idx_addPage($page, $verbose=false) { + // check if indexing needed + $idxtag = metaFN($page,'.indexed'); + if(@file_exists($idxtag)){ + if(trim(io_readFile($idxtag)) == idx_get_version()){ + $last = @filemtime($idxtag); + if($last > @filemtime(wikiFN($ID))){ + if ($verbose) print("Indexer: index for $page up to date".DOKU_LF); + return false; + } + } + } + $body = ''; $data = array($page, $body); $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); @@ -853,8 +868,19 @@ function idx_addPage($page) { unset($evt); list($page,$body) = $data; - $Indexer =& idx_get_indexer(); - return $Indexer->addPageWords($page, $body); + $Indexer = idx_get_indexer(); + $result = $Indexer->addPageWords($page, $body); + if ($result == "locked") { + if ($verbose) print("Indexer: locked".DOKU_LF); + return false; + } + if ($result) + io_saveFile(metaFN($page,'.indexed'), idx_get_version()); + if ($verbose) { + print("Indexer: finished".DOKU_LF); + return true; + } + return $result; } /** @@ -866,11 +892,11 @@ function idx_addPage($page) { * Important: No ACL checking is done here! 
All results are * returned, regardless of permissions * - * @param array $words list of words to search for + * @param arrayref $words list of words to search for * @return array list of pages found, associated with the search terms */ -function idx_lookup($words) { - $Indexer =& idx_get_indexer(); +function idx_lookup(&$words) { + $Indexer = idx_get_indexer(); return $Indexer->lookup($words); } @@ -879,7 +905,7 @@ function idx_lookup($words) { * */ function idx_tokenizer($string, $wc=false) { - $Indexer =& idx_get_indexer(); + $Indexer = idx_get_indexer(); return $Indexer->tokenizer($string, $wc); } diff --git a/inc/init.php b/inc/init.php index ed4409729..1dc31a31f 100644 --- a/inc/init.php +++ b/inc/init.php @@ -276,6 +276,7 @@ function init_files(){ } # create title index (needs to have same length as page.idx) + /* $file = $conf['indexdir'].'/title.idx'; if(!@file_exists($file)){ $pages = file($conf['indexdir'].'/page.idx'); @@ -290,6 +291,7 @@ function init_files(){ nice_die("$file is not writable. 
Check your permissions settings!"); } } + */ } /** diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 55d860296..a5a7d6b2a 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -134,41 +134,8 @@ function runIndexer(){ if(!$ID) return false; - // check if indexing needed - $idxtag = metaFN($ID,'.indexed'); - if(@file_exists($idxtag)){ - if(trim(io_readFile($idxtag)) == idx_get_version()){ - $last = @filemtime($idxtag); - if($last > @filemtime(wikiFN($ID))){ - print "runIndexer(): index for $ID up to date".NL; - return false; - } - } - } - - // try to aquire a lock - $lock = $conf['lockdir'].'/_indexer.lock'; - while(!@mkdir($lock,$conf['dmode'])){ - usleep(50); - if(time()-@filemtime($lock) > 60*5){ - // looks like a stale lock - remove it - @rmdir($lock); - print "runIndexer(): stale lock removed".NL; - }else{ - print "runIndexer(): indexer locked".NL; - return false; - } - } - if($conf['dperm']) chmod($lock, $conf['dperm']); - // do the work - idx_addPage($ID); - - // we're finished - save and free lock - io_saveFile(metaFN($ID,'.indexed'), idx_get_version()); - @rmdir($lock); - print "runIndexer(): finished".NL; - return true; + return idx_addPage($ID, true); } /** diff --git a/lib/exe/xmlrpc.php b/lib/exe/xmlrpc.php index 410d4f6ba..84068f96e 100644 --- a/lib/exe/xmlrpc.php +++ b/lib/exe/xmlrpc.php @@ -355,9 +355,8 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer { */ function listPages(){ $list = array(); - $pages = array_filter(array_filter(idx_getIndex('page', ''), - 'isVisiblePage'), - 'page_exists'); + $pages = idx_get_indexer()->getPages(); + $pages = array_filter(array_filter($pages,'isVisiblePage'),'page_exists'); foreach(array_keys($pages) as $idx) { $perm = auth_quickaclcheck($pages[$idx]); @@ -552,27 +551,7 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer { unlock($id); // run the indexer if page wasn't indexed yet - if(!@file_exists(metaFN($id, '.indexed'))) { - // try to aquire a lock - $lock = 
$conf['lockdir'].'/_indexer.lock'; - while(!@mkdir($lock,$conf['dmode'])){ - usleep(50); - if(time()-@filemtime($lock) > 60*5){ - // looks like a stale lock - remove it - @rmdir($lock); - }else{ - return false; - } - } - if($conf['dperm']) chmod($lock, $conf['dperm']); - - // do the work - idx_addPage($id); - - // we're finished - save and free lock - io_saveFile(metaFN($id,'.indexed'), idx_get_version()); - @rmdir($lock); - } + idx_addPage($id); return 0; } -- cgit v1.2.3 From d64516f5d992bb47a765949743506d8433a07d55 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sat, 22 Jan 2011 23:01:56 +0100 Subject: Indexer v3 Rewrite: fix obvious typos and type errors --- inc/indexer.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index a61f3772a..087113587 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -277,7 +277,7 @@ class Doku_Indexer { else unset($val_idx[$id]); } $val_idx = array_keys($val_idx); - $this->_saveIndexKey($metaname.'_p', '', $pid, $val_idx); + $this->_saveIndexKey($metaname.'_p', '', $pid, implode(':', $val_idx)); } unset($metaidx); unset($metawords); @@ -559,8 +559,8 @@ class Doku_Indexer { private function _getIndex($idx, $suffix) { global $conf; $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; - if (!@file_exists($fn, FILE_IGNORE_NEW_LINES)) return array(); - return file($fn); + if (!@file_exists($fn)) return array(); + return file($fn, FILE_IGNORE_NEW_LINES); } /** @@ -773,7 +773,7 @@ class Doku_Indexer { $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine); $newLine = trim($newLine, ':'); if ($count) { - if ($strlen($newLine) > 0) + if (strlen($newLine) > 0) return "$id*$count:".$newLine; else return "$id*$count".$newLine; @@ -794,7 +794,7 @@ class Doku_Indexer { foreach ($parts as $tuple) { if ($tuple == '') continue; list($key, $cnt) = explode('*', $tuple); - if (!$cnd) continue; + if (!$cnt) continue; $key = $keys[$key]; if (!$key) continue; 
$result[$key] = $cnt; -- cgit v1.2.3 From 4373c7b59390347515bcf9615f4e9133a5b88aee Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sat, 22 Jan 2011 23:06:00 +0100 Subject: Indexer v3 Rewrite: _saveIndexKey now really writes on the desired line Now _saveIndexKey inserts empty lines when the index isn't long enough. This is necessary because the page ids are taken from the global page index, but there is not every page in the metadata key specific index so e.g. line 10 might be the first entry in the index. --- inc/indexer.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/inc/indexer.php b/inc/indexer.php index 087113587..34ce0cdd0 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -620,10 +620,16 @@ class Doku_Indexer { while (($curline = fgets($ih)) !== false) { fwrite($fh, (++$ln == $id) ? $line : $curline); } - if ($id > $ln) + if ($id > $ln) { + while ($id > ++$ln) + fwrite($fh, "\n"); fwrite($fh, $line); + } fclose($ih); } else { + $ln = -1; + while ($id > ++$ln) + fwrite($fh, "\n"); fwrite($fh, $line); } fclose($fh); -- cgit v1.2.3 From cd763a5b1584197f6e9adf3bbb4f982b6bbaca05 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sat, 22 Jan 2011 23:10:01 +0100 Subject: Indexer v3 Rewrite: implement lookupKey() Saving and looking up metadata key/value pairs seems to work now at least with some basic tests. 
--- inc/indexer.php | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/inc/indexer.php b/inc/indexer.php index 34ce0cdd0..4219dbe75 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -400,9 +400,44 @@ class Doku_Indexer { * @param callback $func comparison function * @return array list with page names, keys are query values if more than one given * @author Tom N Harris + * @author Michael Hamann */ public function lookupKey($key, $value, $func=null) { - return array(); + $metaname = idx_cleanName($key); + + // get all words in order to search the matching ids + $words = $this->_getIndex($metaname, '_w'); + + // the matching ids for the provided value(s) + $value_ids = array(); + + if (!is_array($value)) $value = array($value); + + foreach ($value as $val) { + if (is_null($func)) { + if (($i = array_search($val, $words)) !== false) + $value_ids[$i] = $val; + } else { + foreach ($words as $i => $word) { + if (call_user_func_array($func, array($word, $value))) + $value_ids[$i] = $val; + } + } + } + + unset($words); // free the used memory + + // load all lines and pages so the used lines can be taken and matched with the pages + $lines = $this->_getIndex($metaname, '_i'); + $page_idx = $this->_getIndex('page', ''); + + $result = array(); + foreach ($value_ids as $value_id => $val) { + // parse the tuples of the form page_id*1:page2_id*1 and so on, return value + // is an array with page_id => 1, page2_id => 1 etc. so take the keys only + $result[$val] = array_keys($this->_parseTuples($page_idx, $lines[$value_id])); + } + return $result; } /** -- cgit v1.2.3 From e1e1a7e012189660a2cfd7631e82234b5ae92f69 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sun, 23 Jan 2011 01:52:31 +0100 Subject: Indexer v3 Rewrite: fix addMetaKeys and locking This fixes addMetaKeys so it actually removes values. 
This also changes the functionality of the function: It now updates the key for the page with the current value instead of adding new values as this will be the default use case. A new parameter could be added to restore the "old" behavior when needed. addMetaKeys now only saves the index when the content has really been changed. Furthermore no empty number is added anymore to the reverse index when it has been empty previously. addMetaKeys now releases the lock again and really fails when the lock can't be gained. --- inc/indexer.php | 68 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 4219dbe75..d3d05ecd8 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -210,7 +210,7 @@ class Doku_Indexer { } /** - * Add keys to the metadata index. + * Add/update keys to/of the metadata index. * * Adding new keys does not remove other keys for the page. * An empty value will erase the key. 
@@ -222,6 +222,7 @@ class Doku_Indexer { * @param mixed $value the value or list of values * @return boolean the function completed successfully * @author Tom N Harris + * @author Michael Hamann */ public function addMetaKeys($page, $key, $value=null) { if (!is_array($key)) { @@ -231,7 +232,8 @@ class Doku_Indexer { trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING); } - $this->_lock(); + if (!$this->_lock()) + return "locked"; // load known documents $pid = $this->_addIndexKey('page', '', $page); @@ -245,8 +247,19 @@ class Doku_Indexer { $metaidx = $this->_getIndex($metaname, '_i'); $metawords = $this->_getIndex($metaname, '_w'); $addwords = false; - $update = array(); - if (!is_array($val)) $values = array($values); + + if (!is_array($values)) $values = array($values); + + $val_idx = $this->_getIndexKey($metaname, '_p', $pid); + if ($val_idx != '') { + $val_idx = explode(':', $val_idx); + // -1 means remove, 0 keep, 1 add + $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1)); + } else { + $val_idx = array(); + } + + foreach ($values as $val) { $val = (string)$val; if ($val !== "") { @@ -256,32 +269,39 @@ class Doku_Indexer { $metawords[$id] = $val; $addwords = true; } + // test if value is already in the index + if (isset($val_idx[$id]) && $val_idx[$id] <= 0) + $val_idx[$id] = 0; + else // else add it + $val_idx[$id] = 1; + } + } + + if ($addwords) + $this->_saveIndex($metaname.'_w', '', $metawords); + $vals_changed = false; + foreach ($val_idx as $id => $action) { + if ($action == -1) { + $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 0); + $vals_changed = true; + unset($val_idx[$id]); + } elseif ($action == 1) { $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 1); - $update[$id] = 1; - } else { - $id = array_search($val, $metawords); - if ($id !== false) { - $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 0); - $update[$id] = 0; - } + $vals_changed = true; } } - if (!empty($update)) { 
- if ($addwords) - $this->_saveIndex($metaname.'_w', '', $metawords); + + if ($vals_changed) { $this->_saveIndex($metaname.'_i', '', $metaidx); - $val_idx = $this->_getIndexKey($metaname, '_p', $pid); - $val_idx = array_flip(explode(':', $val_idx)); - foreach ($update as $id => $add) { - if ($add) $val_idx[$id] = 1; - else unset($val_idx[$id]); - } - $val_idx = array_keys($val_idx); - $this->_saveIndexKey($metaname.'_p', '', $pid, implode(':', $val_idx)); + $val_idx = implode(':', array_keys($val_idx)); + $this->_saveIndexKey($metaname.'_p', '', $pid, $val_idx); } + unset($metaidx); unset($metawords); } + + $this->_unlock(); return true; } @@ -398,7 +418,7 @@ class Doku_Indexer { * @param string $key name of the metadata key to look for * @param string $value search term to look for * @param callback $func comparison function - * @return array list with page names, keys are query values if more than one given + * @return array lists with page names, keys are query values * @author Tom N Harris * @author Michael Hamann */ @@ -911,7 +931,7 @@ function idx_addPage($page, $verbose=false) { $Indexer = idx_get_indexer(); $result = $Indexer->addPageWords($page, $body); - if ($result == "locked") { + if ($result === "locked") { if ($verbose) print("Indexer: locked".DOKU_LF); return false; } -- cgit v1.2.3 From 320f489ae6a653f52f9d489b84b9bdd26f4241ac Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sun, 23 Jan 2011 02:00:32 +0100 Subject: Indexer v3 Rewrite: Use the metadata index for backlinks; add INDEXER_METADATA_INDEX event This new event allows plugins to add or modify the metadata that will be indexed. Collecting this metadata in an event allows plugins to see if other plugins have already added the metadata they need and leads to just one single indexer call thus fewer files are read and written. Plugins could also replace/prevent the metadata indexer call using this event. 
--- inc/fulltext.php | 19 +++---------------- inc/indexer.php | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/inc/fulltext.php b/inc/fulltext.php index 0411b9f99..35ee4ba34 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -124,26 +124,13 @@ function _ft_pageSearch(&$data) { /** * Returns the backlinks for a given page * - * Does a quick lookup with the fulltext index, then - * evaluates the instructions of the found pages + * Uses the metadata index. */ function ft_backlinks($id){ $result = array(); - // quick lookup of the pagename - // FIXME use metadata key lookup - $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) - $docs = array_keys(ft_resultCombine(array_values($matches))); - $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages - if(!count($docs)) return $result; - - // check metadata for matching links - foreach($docs as $match){ - // metadata relation reference links are already resolved - $links = p_get_metadata($match,'relation references'); - if (isset($links[$id])) $result[] = $match; - } + $result = idx_get_indexer()->lookupKey('relation_references', $id); + $result = $result[$id]; if(!count($result)) return $result; diff --git a/inc/indexer.php b/inc/indexer.php index d3d05ecd8..8859ada33 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -935,6 +935,25 @@ function idx_addPage($page, $verbose=false) { if ($verbose) print("Indexer: locked".DOKU_LF); return false; } + + if ($result) { + $data = array('page' => $page, 'metadata' => array()); + + if (($references = p_get_metadata($page, 'relation references')) !== null) + $data['metadata']['relation_references'] = array_keys($references); + + $evt = new Doku_Event('INDEXER_METADATA_INDEX', $data); + if ($evt->advise_before()) { + $result = $Indexer->addMetaKeys($page, $data['metadata']); + if ($result === "locked") { + if ($verbose) print("Indexer: locked".DOKU_LF); + return false; 
+ } + } + $evt->advise_after(); + unset($evt); + } + if ($result) io_saveFile(metaFN($page,'.indexed'), idx_get_version()); if ($verbose) { -- cgit v1.2.3 From 8605afb1b4e2a6a9e11e21a7bf0775bbb0d5af03 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sun, 23 Jan 2011 20:23:26 +0100 Subject: Add INDEXER_VERSION_GET event so plugins can add their version This allows plugins to add their own version strings like plugin_tag=1 so pages can be reindexed when plugins update their index content. --- inc/indexer.php | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 8859ada33..91d6842e4 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -51,13 +51,18 @@ define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')'); * The indexer is only compatible with data written by the same version. * * @author Tom N Harris + * @author Michael Hamann */ function idx_get_version(){ global $conf; if($conf['external_tokenizer']) - return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); + $version = INDEXER_VERSION . '+' . 
trim($conf['tokenizer_cmd']); else - return INDEXER_VERSION; + $version = INDEXER_VERSION; + + $data = array($version); + trigger_event('INDEXER_VERSION_GET', $data, null, false); + return implode('+', $data); } /** -- cgit v1.2.3 From bbc85ee4bc98fadf89707309f923f8ae2c16f727 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 24 Jan 2011 02:52:10 -0500 Subject: Indexer v3 Rewrite: streamline indexing of deleted or disabled pages --- inc/indexer.php | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/inc/indexer.php b/inc/indexer.php index 8859ada33..5f37ec46c 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -315,6 +315,46 @@ class Doku_Indexer { * @author Tom N Harris */ public function deletePage($page) { + if (!$this->_lock()) + return "locked"; + + // load known documents + $page_idx = $this->_getIndexKey('page', '', $page); + if ($page_idx === false) { + $this->_unlock(); + return false; + } + + // Remove obsolete index entries + $pageword_idx = $this->_getIndexKey('pageword', '', $pid); + if ($pageword_idx !== '') { + $delwords = explode(':',$pageword_idx); + $upwords = array(); + foreach ($delwords as $word) { + if ($word != '') { + list($wlen,$wid) = explode('*', $word); + $wid = (int)$wid; + $upwords[$wlen][] = $wid; + } + } + foreach ($upwords as $wlen => $widx) { + $index = $this->_getIndex('i', $wlen); + foreach ($widx as $wid) { + $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0); + } + $this->_saveIndex('i', $wlen, $index); + } + } + // Save the reverse index + if (!$this->_saveIndexKey('pageword', '', $pid, "")) { + $this->_unlock(); + return false; + } + + // XXX TODO: delete meta keys + + $this->_unlock(); + return true; } /** @@ -921,6 +961,36 @@ function idx_addPage($page, $verbose=false) { } } + if (!page_exists($page)) { + if (!@file_exists($idxtag)) { + if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF); + return false; + } + $Indexer = 
idx_get_indexer(); + $result = $Indexer->deletePage($page); + if ($result === "locked") { + if ($verbose) print("Indexer: locked".DOKU_LF); + return false; + } + @unlink($idxtag); + return $result; + } + $indexenabled = p_get_metadata($page, 'internal index', false); + if ($indexenabled === false) { + $result = false; + if (@file_exists($idxtag)) { + $Indexer = idx_get_indexer(); + $result = $Indexer->deletePage($page); + if ($result === "locked") { + if ($verbose) print("Indexer: locked".DOKU_LF); + return false; + } + @unlink($idxtag); + } + if ($verbose) print("Indexer: index disabled for $page".DOKU_LF); + return $result; + } + $body = ''; $data = array($page, $body); $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); @@ -939,7 +1009,8 @@ function idx_addPage($page, $verbose=false) { if ($result) { $data = array('page' => $page, 'metadata' => array()); - if (($references = p_get_metadata($page, 'relation references')) !== null) + $data['metadata']['title'] = p_get_metadata($page, 'title', false); + if (($references = p_get_metadata($page, 'relation references', false)) !== null) $data['metadata']['relation_references'] = array_keys($references); $evt = new Doku_Event('INDEXER_METADATA_INDEX', $data); -- cgit v1.2.3 From f078bb0088870b4b68b348d546afa30a80a07e87 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Mon, 24 Jan 2011 03:46:11 -0500 Subject: Indexer Rewrite v3: wildcards in lookupKey and automatically unwrap single result --- inc/fulltext.php | 9 ++++++--- inc/indexer.php | 48 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/inc/fulltext.php b/inc/fulltext.php index 35ee4ba34..f477e826e 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -130,7 +130,6 @@ function ft_backlinks($id){ $result = array(); $result = idx_get_indexer()->lookupKey('relation_references', $id); - $result = $result[$id]; if(!count($result)) return $result; @@ -234,8 +233,12 @@ function _ft_pageLookup(&$data){ 
$pages[$p_id] = p_get_first_heading($p_id, false); } } - //if ($in_title) - // $titles = $Indexer->lookupKey('title', "*$id*"); + if ($in_title) { + foreach ($Indexer->lookupKey('title', "*$id*") as $p_id) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); + } + } } if (isset($ns)) { foreach ($page_idx as $p_id) { diff --git a/inc/indexer.php b/inc/indexer.php index 5f37ec46c..6af2de15d 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -458,7 +458,7 @@ class Doku_Indexer { * @param string $key name of the metadata key to look for * @param string $value search term to look for * @param callback $func comparison function - * @return array lists with page names, keys are query values + * @return array lists with page names, keys are query values if $key is array * @author Tom N Harris * @author Michael Hamann */ @@ -471,15 +471,38 @@ class Doku_Indexer { // the matching ids for the provided value(s) $value_ids = array(); - if (!is_array($value)) $value = array($value); + if (!is_array($value)) + $value_array = array($value); + else + $value_array =& $value; - foreach ($value as $val) { - if (is_null($func)) { - if (($i = array_search($val, $words)) !== false) - $value_ids[$i] = $val; - } else { + if (!is_null($func)) { + foreach ($value_array as $val) { foreach ($words as $i => $word) { - if (call_user_func_array($func, array($word, $value))) + if (call_user_func_array($func, array($word, $val))) + $value_ids[$i] = $val; + } + } + } else { + foreach ($value_array as $val) { + $xval = $val; + $caret = false; + $dollar = false; + // check for wildcards + if (substr($xval, 0, 1) == '*') { + $xval = substr($xval, 1); + $caret = '^'; + } + if (substr($xval, -1, 1) == '*') { + $xval = substr($xval, 0, -1); + $dollar = '$'; + } + if ($caret || $dollar) { + $re = $caret.preg_quote($xval, '/').$dollar; + foreach(array_keys(preg_grep('/'.$re.'/', $words)) as $i) + $value_ids[$i] = $val; + } else { + if (($i = array_search($val, $words)) !== 
false) $value_ids[$i] = $val; } } @@ -497,6 +520,7 @@ class Doku_Indexer { // is an array with page_id => 1, page2_id => 1 etc. so take the keys only $result[$val] = array_keys($this->_parseTuples($page_idx, $lines[$value_id])); } + if (!is_array($value)) $result = $result[$value]; return $result; } @@ -527,12 +551,12 @@ class Doku_Indexer { // check for wildcards if (substr($xword, 0, 1) == '*') { $xword = substr($xword, 1); - $caret = true; + $caret = '^'; $wlen -= 1; } if (substr($xword, -1, 1) == '*') { $xword = substr($xword, 0, -1); - $dollar = true; + $dollar = '$'; $wlen -= 1; } if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword)) @@ -540,9 +564,7 @@ class Doku_Indexer { if (!isset($tokens[$xword])) $tokenlength[$wlen][] = $xword; if ($caret || $dollar) { - $re = preg_quote($xword, '/'); - if ($caret) $re = '^'.$re; - if ($dollar) $re = $re.'$'; + $re = $caret.preg_quote($xword, '/').$dollar; $tokens[$xword][] = array($word, '/'.$re.'/'); if (!isset($tokenwild[$xword])) $tokenwild[$xword] = $wlen; -- cgit v1.2.3 From c1209673030dd03537a2ece21331203ff0a6bf34 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Fri, 18 Feb 2011 18:30:49 -0500 Subject: Special handling of title metadata index --- inc/indexer.php | 86 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 18 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 3a0331302..b6a586985 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -247,6 +247,15 @@ class Doku_Indexer { return false; } + // Special handling for titles so the index file is simpler + if (array_key_exists('title', $key)) { + $value = $key['title']; + if (is_array($value)) + $value = $value[0]; + $this->_saveIndexKey('title', '', $pid, $value); + unset($key['title']); + } + foreach ($key as $name => $values) { $metaname = idx_cleanName($name); $metaidx = $this->_getIndex($metaname, '_i'); @@ -357,6 +366,7 @@ class Doku_Indexer { } // XXX TODO: delete meta keys 
+ $this->_saveIndexKey('title', '', $pid, ""); $this->_unlock(); return true; @@ -468,24 +478,28 @@ class Doku_Indexer { * @author Michael Hamann */ public function lookupKey($key, $value, $func=null) { - $metaname = idx_cleanName($key); - - // get all words in order to search the matching ids - $words = $this->_getIndex($metaname, '_w'); - - // the matching ids for the provided value(s) - $value_ids = array(); - if (!is_array($value)) $value_array = array($value); else $value_array =& $value; + // the matching ids for the provided value(s) + $value_ids = array(); + + $metaname = idx_cleanName($key); + + // get all words in order to search the matching ids + if ($key == 'title') { + $words = $this->_getIndex('title', ''); + } else { + $words = $this->_getIndex($metaname, '_w'); + } + if (!is_null($func)) { foreach ($value_array as $val) { foreach ($words as $i => $word) { if (call_user_func_array($func, array($word, $val))) - $value_ids[$i] = $val; + $value_ids[$i][] = $val; } } } else { @@ -505,25 +519,42 @@ class Doku_Indexer { if ($caret || $dollar) { $re = $caret.preg_quote($xval, '/').$dollar; foreach(array_keys(preg_grep('/'.$re.'/', $words)) as $i) - $value_ids[$i] = $val; + $value_ids[$i][] = $val; } else { if (($i = array_search($val, $words)) !== false) - $value_ids[$i] = $val; + $value_ids[$i][] = $val; } } } unset($words); // free the used memory - // load all lines and pages so the used lines can be taken and matched with the pages - $lines = $this->_getIndex($metaname, '_i'); + $result = array(); $page_idx = $this->_getIndex('page', ''); - $result = array(); - foreach ($value_ids as $value_id => $val) { - // parse the tuples of the form page_id*1:page2_id*1 and so on, return value - // is an array with page_id => 1, page2_id => 1 etc. 
so take the keys only - $result[$val] = array_keys($this->_parseTuples($page_idx, $lines[$value_id])); + // Special handling for titles + if ($key == 'title') { + foreach ($value_ids as $pid => $val_list) { + $page = $page_idx[$pid]; + foreach ($val_list as $val) { + $result[$val][] = $page; + } + } + } else { + // load all lines and pages so the used lines can be taken and matched with the pages + $lines = $this->_getIndex($metaname, '_i'); + + foreach ($value_ids as $value_id => $val_list) { + // parse the tuples of the form page_id*1:page2_id*1 and so on, return value + // is an array with page_id => 1, page2_id => 1 etc. so take the keys only + $pages = array_keys($this->_parseTuples($page_idx, $lines[$value_id])); + foreach ($val_list as $val) { + if (!isset($result[$val])) + $result[$val] = $pages; + else + $result[$val] = array_merge($result[$val], $pages); + } + } } if (!is_array($value)) $result = $result[$value]; return $result; @@ -616,6 +647,7 @@ class Doku_Indexer { /** * Return a list of all pages + * Warning: pages may not exist! 
* * @param string $key list only pages containing the metadata key (optional) * @return array list of page names @@ -624,6 +656,24 @@ class Doku_Indexer { public function getPages($key=null) { $page_idx = $this->_getIndex('page', ''); if (is_null($key)) return $page_idx; + + $metaname = idx_cleanName($key); + + // Special handling for titles + if ($key == 'title') { + $title_idx = $this->_getIndex('title', ''); + array_splice($page_idx, count($title_idx)); + foreach ($title_idx as $i => $title) + if ($title === "") unset($page_idx[$i]); + return $page_idx; + } + + $pages = array(); + $lines = $this->_getIndex($metaname, '_i'); + foreach ($lines as $line) { + $pages = array_merge($pages, $this->_parseTuples($page_idx, $line)); + } + return array_keys($pages); } /** -- cgit v1.2.3 From b00bd361a6fb93d2ef2433f18f3f238a7498c041 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 22 Feb 2011 02:14:05 -0500 Subject: Indexer::lookupKey callback receives value reference as first arg --- inc/indexer.php | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index b6a586985..0e0340d40 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -422,7 +422,7 @@ class Doku_Indexer { * * The returned array will have the original tokens as key. The values * in the returned list is an array with the page names as keys and the - * number of times that token appeas on the page as value. + * number of times that token appears on the page as value. * * @param arrayref $tokens list of words to search for * @return array list of page names with usage counts @@ -468,16 +468,18 @@ class Doku_Indexer { * * The metadata values are compared as case-sensitive strings. Pass a * callback function that returns true or false to use a different - * comparison function + * comparison function. The function will be called with the $value being + * searched for as the first argument, and the word in the index as the + * second argument. 
* * @param string $key name of the metadata key to look for * @param string $value search term to look for * @param callback $func comparison function - * @return array lists with page names, keys are query values if $key is array + * @return array lists with page names, keys are query values if $value is array * @author Tom N Harris * @author Michael Hamann */ - public function lookupKey($key, $value, $func=null) { + public function lookupKey($key, &$value, $func=null) { if (!is_array($value)) $value_array = array($value); else @@ -496,9 +498,9 @@ class Doku_Indexer { } if (!is_null($func)) { - foreach ($value_array as $val) { + foreach ($value_array as &$val) { foreach ($words as $i => $word) { - if (call_user_func_array($func, array($word, $val))) + if (call_user_func_array($func, array(&$val, $word))) $value_ids[$i][] = $val; } } -- cgit v1.2.3 From 0604da3403f907227d3dfe13e6e58d2e78d0c855 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 22 Feb 2011 02:31:20 -0500 Subject: Removing a page from the index deletes related metadata. Cache key names in index. 
--- inc/indexer.php | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/inc/indexer.php b/inc/indexer.php index 0e0340d40..1809b1c8f 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -258,6 +258,7 @@ class Doku_Indexer { foreach ($key as $name => $values) { $metaname = idx_cleanName($name); + $this->_addIndexKey('metadata', '', $metaname); $metaidx = $this->_getIndex($metaname, '_i'); $metawords = $this->_getIndex($metaname, '_w'); $addwords = false; @@ -365,8 +366,17 @@ class Doku_Indexer { return false; } - // XXX TODO: delete meta keys $this->_saveIndexKey('title', '', $pid, ""); + $keyidx = $this->_getIndex('metadata', ''); + foreach ($keyidx as $metaname) { + $val_idx = explode(':', $this->_getIndexKey($metaname.'_p', '', $pid)); + $meta_idx = $this->_getIndex($metaname.'_i', ''); + foreach ($val_idx as $id) { + $meta_idx[$id] = $this->_updateTuple($meta_idx[$id], $pid, 0); + } + $this->_saveIndex($metaname.'_i', '', $meta_idx); + $this->_saveIndexKey($metaname.'_p', '', $pid, ''); + } $this->_unlock(); return true; -- cgit v1.2.3 From d0d6fe1be56ef474d844b3556af7ba2a5961d798 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 22 Feb 2011 02:53:20 -0500 Subject: Indexer version tag should include plugin names --- inc/indexer.php | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 1809b1c8f..bcda2a9b9 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -50,19 +50,33 @@ define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')'); * Version of the indexer taking into consideration the external tokenizer. * The indexer is only compatible with data written by the same version. 
* + * Triggers INDEXER_VERSION_GET + * Plugins that modify what gets indexed should hook this event and + * add their version info to the event data like so: + * $data[$plugin_name] = $plugin_version; + * * @author Tom N Harris * @author Michael Hamann */ function idx_get_version(){ - global $conf; - if($conf['external_tokenizer']) - $version = INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); - else - $version = INDEXER_VERSION; - - $data = array($version); - trigger_event('INDEXER_VERSION_GET', $data, null, false); - return implode('+', $data); + static $indexer_version = null; + if ($indexer_version == null) { + global $conf; + if($conf['external_tokenizer']) + $version = INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); + else + $version = INDEXER_VERSION; + + // DokuWiki version is included for the convenience of plugins + $data = array('dokuwiki'=>$version); + trigger_event('INDEXER_VERSION_GET', $data, null, false); + unset($data['dokuwiki']); // this needs to be first + ksort($data); + foreach ($data as $plugin=>$vers) + $version .= '+'.$plugin.'='.$vers; + $indexer_version = $version; + } + return $indexer_version; } /** -- cgit v1.2.3 From 175193d28ead34dd3a45395407813c080f1b2f25 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 22 Feb 2011 03:38:22 -0500 Subject: Implement histogram method of indexer --- inc/indexer.php | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/inc/indexer.php b/inc/indexer.php index bcda2a9b9..c28f24f75 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -712,6 +712,54 @@ class Doku_Indexer { * @author Tom N Harris */ public function histogram($min=1, $max=0, $key=null) { + if ($min < 1) + $min = 1; + if ($max < $min) + $max = 0; + + $result = array(); + + if ($key == 'title') { + $index = $this->_getIndex('title', ''); + $index = array_count_values($index); + foreach ($index as $val => $cnt) { + if ($cnt >= $min && (!$max || $cnt <= $max)) + $result[$val] 
= $cnt; + } + } + elseif (!is_null($key)) { + $metaname = idx_cleanName($key); + $index = $this->_getIndex($metaname.'_i', ''); + $val_idx = array(); + foreach ($index as $wid => $line) { + $freq = $this->_countTuples($line); + if ($freq >= $min && (!$max || $freq <= $max)) + $val_idx[$wid] = $freq; + } + if (!empty($val_idx)) { + $words = $this->_getIndex($metaname.'_w', ''); + foreach ($val_idx as $wid => $freq) + $result[$words[$wid]] = $freq; + } + } + else { + $lengths = idx_listIndexLengths(); + foreach ($lengths as $length) { + $index = $this->_getIndex('i', $length); + $words = null; + foreach ($index as $wid => $line) { + $freq = $this->_countTuples($line); + if ($freq >= $min && (!$max || $freq <= $max)) { + if ($words === null) + $words = $this->_getIndex('w', $length); + $result[$words[$wid]] = $freq; + } + } + } + } + + arsort($result); + return $result; } /** @@ -1005,6 +1053,22 @@ class Doku_Indexer { } return $result; } + + /** + * Sum the counts in a list of tuples. + * + * @author Tom N Harris + */ + private function _countTuples($line) { + $freq = 0; + $parts = explode(':', $line); + foreach ($parts as $tuple) { + if ($tuple == '') continue; + list($pid, $cnt) = explode('*', $tuple); + $freq += (int)$cnt; + } + return $freq; + } } /** -- cgit v1.2.3 From 5981eb09ea0468a670c1cdb238962bf54180c599 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Tue, 22 Feb 2011 23:02:46 -0500 Subject: Fix variable name type in indexer --- inc/indexer.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index c28f24f75..5ab0ec002 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -120,8 +120,8 @@ class Doku_Indexer { return "locked"; // load known documents - $page_idx = $this->_addIndexKey('page', '', $page); - if ($page_idx === false) { + $pid = $this->_addIndexKey('page', '', $page); + if ($pid === false) { $this->_unlock(); return false; } @@ -348,8 +348,8 @@ class Doku_Indexer { return "locked"; // 
load known documents - $page_idx = $this->_getIndexKey('page', '', $page); - if ($page_idx === false) { + $pid = $this->_getIndexKey('page', '', $page); + if ($pid === false) { $this->_unlock(); return false; } -- cgit v1.2.3 From 287bc2877f03f25914816a266c305c6cf6c8c772 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 23 Feb 2011 14:32:32 -0500 Subject: Increase version tag for new indexer --- inc/indexer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/indexer.php b/inc/indexer.php index 5ab0ec002..2e36b6ed7 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -10,7 +10,7 @@ if(!defined('DOKU_INC')) die('meh.'); // Version tag used to force rebuild on upgrade -define('INDEXER_VERSION', 3); +define('INDEXER_VERSION', 4); // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens) if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2); -- cgit v1.2.3 From b8c040db1fdc0eee80963e57d95a15fd3813912d Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 23 Feb 2011 15:01:10 -0500 Subject: Add minimum length option to index histogram --- inc/indexer.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 2e36b6ed7..6b21797af 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -707,11 +707,12 @@ class Doku_Indexer { * * @param int $min bottom frequency threshold * @param int $max upper frequency limit. No limit if $max<$min + * @param int $length minimum length of words to count * @param string $key metadata key to list. 
Uses the fulltext index if not given * @return array list of words as the keys and frequency as values * @author Tom N Harris */ - public function histogram($min=1, $max=0, $key=null) { + public function histogram($min=1, $max=0, $minlen=3, $key=null) { if ($min < 1) $min = 1; if ($max < $min) @@ -723,7 +724,7 @@ class Doku_Indexer { $index = $this->_getIndex('title', ''); $index = array_count_values($index); foreach ($index as $val => $cnt) { - if ($cnt >= $min && (!$max || $cnt <= $max)) + if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen) $result[$val] = $cnt; } } @@ -733,7 +734,7 @@ class Doku_Indexer { $val_idx = array(); foreach ($index as $wid => $line) { $freq = $this->_countTuples($line); - if ($freq >= $min && (!$max || $freq <= $max)) + if ($freq >= $min && (!$max || $freq <= $max) && strlen($val) >= $minlen) $val_idx[$wid] = $freq; } if (!empty($val_idx)) { @@ -745,6 +746,7 @@ class Doku_Indexer { else { $lengths = idx_listIndexLengths(); foreach ($lengths as $length) { + if ($length < $minlen) continue; $index = $this->_getIndex('i', $length); $words = null; foreach ($index as $wid => $line) { -- cgit v1.2.3 From 7233c152c0a107c0f12dbc09f5493022b264dddb Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Thu, 24 Feb 2011 23:53:51 +0100 Subject: Fix pass by reference error, always return an array in lookupKey() --- inc/fulltext.php | 3 ++- inc/indexer.php | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/inc/fulltext.php b/inc/fulltext.php index 891558f96..8155325ee 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -234,7 +234,8 @@ function _ft_pageLookup(&$data){ } } if ($in_title) { - foreach ($Indexer->lookupKey('title', "*$id*") as $p_id) { + $wildcard_id = "*$id*"; + foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) { if (!isset($pages[$p_id])) $pages[$p_id] = p_get_first_heading($p_id, false); } diff --git a/inc/indexer.php b/inc/indexer.php index 5aa321d46..eaab7736a 100644 --- 
a/inc/indexer.php +++ b/inc/indexer.php @@ -566,7 +566,12 @@ class Doku_Indexer { unset($words); // free the used memory + // initialize the result so it won't be null $result = array(); + foreach ($value_array as $val) { + $result[$val] = array(); + } + $page_idx = $this->_getIndex('page', ''); // Special handling for titles -- cgit v1.2.3 From 675bf41fb9fe7d43646c3ff2de5a1e701818ed2c Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Fri, 25 Feb 2011 17:56:21 -0500 Subject: Reduce memory footprint of tokenizer; make returned arrays use contiguous keys --- inc/indexer.php | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index eaab7736a..6913dd4e3 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -439,14 +439,14 @@ class Doku_Indexer { $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc); $wordlist = explode(' ', $text); - foreach ($wordlist as $word) { + foreach ($wordlist as $i => &$word) { $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ? 
utf8_strtolower($word) : strtolower($word); - if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; - if (array_search($word, $stopwords) !== false) continue; - $words[] = $word; + if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) + || array_search($word, $stopwords) !== false) + unset($wordlist[$i]); } - return $words; + return array_values($wordlist); } /** @@ -707,7 +707,7 @@ class Doku_Indexer { array_splice($page_idx, count($title_idx)); foreach ($title_idx as $i => $title) if ($title === "") unset($page_idx[$i]); - return $page_idx; + return array_values($page_idx); } $pages = array(); @@ -1068,7 +1068,7 @@ class Doku_Indexer { if ($line == '') return $result; $parts = explode(':', $line); foreach ($parts as $tuple) { - if ($tuple == '') continue; + if ($tuple === '') continue; list($key, $cnt) = explode('*', $tuple); if (!$cnt) continue; $key = $keys[$key]; @@ -1087,7 +1087,7 @@ class Doku_Indexer { $freq = 0; $parts = explode(':', $line); foreach ($parts as $tuple) { - if ($tuple == '') continue; + if ($tuple === '') continue; list($pid, $cnt) = explode('*', $tuple); $freq += (int)$cnt; } -- cgit v1.2.3 From 1538718db8939adf4ce057f2b7fb6d2eea309757 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Fri, 25 Feb 2011 18:23:47 -0500 Subject: Restrict metadata values in indexer to string; skip unnecessary test --- inc/indexer.php | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 6913dd4e3..fc7813ba1 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -505,10 +505,11 @@ class Doku_Indexer { * callback function that returns true or false to use a different * comparison function. The function will be called with the $value being * searched for as the first argument, and the word in the index as the - * second argument. + * second argument. The function preg_match can be used directly if the + * values are regexes. 
* * @param string $key name of the metadata key to look for - * @param string $value search term to look for + * @param string $value search term to look for, must be a string or array of strings * @param callback $func comparison function * @return array lists with page names, keys are query values if $value is array * @author Tom N Harris @@ -533,9 +534,9 @@ class Doku_Indexer { } if (!is_null($func)) { - foreach ($value_array as &$val) { + foreach ($value_array as $val) { foreach ($words as $i => $word) { - if (call_user_func_array($func, array(&$val, $word))) + if (call_user_func_array($func, array($val, $word))) $value_ids[$i][] = $val; } } @@ -591,10 +592,7 @@ class Doku_Indexer { // is an array with page_id => 1, page2_id => 1 etc. so take the keys only $pages = array_keys($this->_parseTuples($page_idx, $lines[$value_id])); foreach ($val_list as $val) { - if (!isset($result[$val])) - $result[$val] = $pages; - else - $result[$val] = array_merge($result[$val], $pages); + $result[$val] = array_merge($result[$val], $pages); } } } -- cgit v1.2.3 From b6d540bdf1d129168ec20fb4c54956edb07c189b Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 27 Feb 2011 20:36:15 -0500 Subject: Fix wildcard search --- inc/indexer.php | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index fc7813ba1..270f717b5 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -543,18 +543,18 @@ class Doku_Indexer { } else { foreach ($value_array as $val) { $xval = $val; - $caret = false; - $dollar = false; + $caret = '^'; + $dollar = '$'; // check for wildcards if (substr($xval, 0, 1) == '*') { $xval = substr($xval, 1); - $caret = '^'; + $caret = ''; } if (substr($xval, -1, 1) == '*') { $xval = substr($xval, 0, -1); - $dollar = '$'; + $dollar = ''; } - if ($caret || $dollar) { + if (!$caret || !$dollar) { $re = $caret.preg_quote($xval, '/').$dollar; foreach(array_keys(preg_grep('/'.$re.'/', $words)) as $i) 
$value_ids[$i][] = $val; @@ -619,27 +619,27 @@ class Doku_Indexer { $tokenwild = array(); foreach ($words as $word) { $result[$word] = array(); - $caret = false; - $dollar = false; + $caret = '^'; + $dollar = '$'; $xword = $word; $wlen = wordlen($word); // check for wildcards if (substr($xword, 0, 1) == '*') { $xword = substr($xword, 1); - $caret = '^'; + $caret = ''; $wlen -= 1; } if (substr($xword, -1, 1) == '*') { $xword = substr($xword, 0, -1); - $dollar = '$'; + $dollar = ''; $wlen -= 1; } - if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword)) + if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword)) continue; if (!isset($tokens[$xword])) $tokenlength[$wlen][] = $xword; - if ($caret || $dollar) { + if (!$caret || !$dollar) { $re = $caret.preg_quote($xword, '/').$dollar; $tokens[$xword][] = array($word, '/'.$re.'/'); if (!isset($tokenwild[$xword])) -- cgit v1.2.3 From 39d6fd3051102c9f2fb5436c7bcaf44d6068fde8 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sun, 6 Mar 2011 13:17:15 +0100 Subject: Merge the two indexer events and use string keys This merges the INDEXER_PAGE_ADD and INDEXER_METADATA_INDEX events and introduces the new string keys 'page', 'body' and 'metadata' in the event data. All plugins that use INDEXER_PAGE_ADD need to be adjusted to use the key 'page' instead of 0 and 'body' instead of 1. 
--- inc/indexer.php | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/inc/indexer.php b/inc/indexer.php index 270f717b5..f0d951230 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -1181,12 +1181,16 @@ function idx_addPage($page, $verbose=false) { } $body = ''; - $data = array($page, $body); + $metadata = array(); + $metadata['title'] = p_get_metadata($page, 'title', false); + if (($references = p_get_metadata($page, 'relation references', false)) !== null) + $metadata['relation_references'] = array_keys($references); + $data = compact('page', 'body', 'metadata'); $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); - if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($page); + if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page); $evt->advise_after(); unset($evt); - list($page,$body) = $data; + extract($data); $Indexer = idx_get_indexer(); $result = $Indexer->addPageWords($page, $body); @@ -1196,22 +1200,11 @@ function idx_addPage($page, $verbose=false) { } if ($result) { - $data = array('page' => $page, 'metadata' => array()); - - $data['metadata']['title'] = p_get_metadata($page, 'title', false); - if (($references = p_get_metadata($page, 'relation references', false)) !== null) - $data['metadata']['relation_references'] = array_keys($references); - - $evt = new Doku_Event('INDEXER_METADATA_INDEX', $data); - if ($evt->advise_before()) { - $result = $Indexer->addMetaKeys($page, $data['metadata']); - if ($result === "locked") { - if ($verbose) print("Indexer: locked".DOKU_LF); - return false; - } + $result = $Indexer->addMetaKeys($page, $metadata); + if ($result === "locked") { + if ($verbose) print("Indexer: locked".DOKU_LF); + return false; } - $evt->advise_after(); - unset($evt); } if ($result) -- cgit v1.2.3 From ad79cb7c93a655f864c633433e743b03685b5719 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sun, 6 Mar 2011 14:48:58 +0100 Subject: Adjust bin/indexer.php for the 
new indexer Now the indexer is directly called instead of duplicating a large part of the indexer code. --- bin/indexer.php | 32 +++----------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/bin/indexer.php b/bin/indexer.php index 85e990bbe..6ee0a9e8d 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -87,41 +87,15 @@ function _index($id){ global $QUIET; // if not cleared only update changed and new files - if(!$CLEAR){ + if($CLEAR){ $idxtag = metaFN($id,'.indexed'); if(@file_exists($idxtag)){ - if(io_readFile($idxtag) == idx_get_version()){ - $last = @filemtime($idxtag); - if($last > @filemtime(wikiFN($id))) return; - } + @unlink($idxtag); } } _quietecho("$id... "); - $body = ''; - $data = array($id, $body); - $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); - if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($id); - $evt->advise_after(); - unset($evt); - list($id,$body) = $data; - $said = false; - while(true) { - $result = $INDEXER->addPageWords($id, $body); - if ($result == "locked") { - if($said){ - _quietecho("."); - }else{ - _quietecho("Waiting for lockfile (max. 5 min)"); - $said = true; - } - sleep(15); - } else { - break; - } - } - if ($result) - io_saveFile(metaFN($id,'.indexed'), idx_get_version()); + idx_addPage($id, !$QUIET); _quietecho("done.\n"); } -- cgit v1.2.3