diff options
author | Tom N Harris <tnharris@whoopdedo.org> | 2007-09-19 21:42:44 +0200 |
---|---|---|
committer | Tom N Harris <tnharris@whoopdedo.org> | 2007-09-19 21:42:44 +0200 |
commit | a0c5c34961149cd101a49729d5dc0b73ed8ae2a5 (patch) | |
tree | 0b0e2a1311cfd50ffb74ee033a17718181baf5a8 | |
parent | 11d9dfa5f39e8005a998d25bca1885847c2cb561 (diff) | |
download | rpg-a0c5c34961149cd101a49729d5dc0b73ed8ae2a5.tar.gz rpg-a0c5c34961149cd101a49729d5dc0b73ed8ae2a5.tar.bz2 |
Remove obsolete words from search index
Creates another index file, 'pageword.idx', that lists the words used in each page.
Words that are deleted from a page can then be removed from the word index.
The indexer version is incremented to force rebuilding of the index.
Also, a minor flaw in the regexp for Asian words is fixed.
darcs-hash:20070919194244-6942e-2e08157dcf4fdf166b35b36a0faf8a3dfb415ad9.gz
-rwxr-xr-x | bin/indexer.php | 10 | ||||
-rw-r--r-- | inc/indexer.php | 103 | ||||
-rw-r--r-- | lib/exe/indexer.php | 6 |
3 files changed, 101 insertions, 18 deletions
diff --git a/bin/indexer.php b/bin/indexer.php index d5f4c6446..c62f97882 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -13,7 +13,7 @@ session_write_close(); // Version tag used to force rebuild on upgrade // Need to keep in sync with lib/exe/indexer.php -if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 1); +if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2); // handle options $short_opts = 'hcuq'; @@ -68,6 +68,14 @@ function _usage() { function _update(){ global $conf; + + // upgrade to version 2 + if (!@file_exists($conf['indexdir'].'/pageword.idx')){ + _lock(); + idx_upgradePageWords(); + _unlock(); + } + $data = array(); _quietecho("Searching pages... "); search($data,$conf['datadir'],'search_allpages',array()); diff --git a/inc/indexer.php b/inc/indexer.php index 319e2b6e4..ca446827d 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -21,7 +21,7 @@ define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai define('IDX_ASIAN2','['. '\x{2E80}-\x{3040}'. // CJK -> Hangul '\x{309D}-\x{30A0}'. - '\x{30FB}-\x{31EF}\x{3200}-\x{D7AF}'. + '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'. '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs '\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms ']'); @@ -216,27 +216,62 @@ function idx_addPage($page){ } } + $pagewords = array(); // get word usage in page $words = idx_getPageWords($page); if($words === false) return false; - if(!count($words)) return true; - foreach(array_keys($words) as $wlen){ - $index = idx_getIndex('i',$wlen); - foreach($words[$wlen] as $wid => $freq){ - if($wid<count($index)){ - $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq); - }else{ - // New words **should** have been added in increasing order - // starting with the first unassigned index. - // If someone can show how this isn't true, then I'll need to sort - // or do something special. 
- $index[$wid] = idx_updateIndexLine('',$pid,$freq); + if(!empty($words)) { + foreach(array_keys($words) as $wlen){ + $index = idx_getIndex('i',$wlen); + foreach($words[$wlen] as $wid => $freq){ + if($wid<count($index)){ + $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq); + }else{ + // New words **should** have been added in increasing order + // starting with the first unassigned index. + // If someone can show how this isn't true, then I'll need to sort + // or do something special. + $index[$wid] = idx_updateIndexLine('',$pid,$freq); + } + $pagewords[] = "$wlen*$wid"; + } + // save back word index + if(!idx_saveIndex('i',$wlen,$index)){ + trigger_error("Failed to write index", E_USER_ERROR); + return false; } } - // save back word index - if(!idx_saveIndex('i',$wlen,$index)){ - trigger_error("Failed to write index", E_USER_ERROR); + } + + // Remove obsolete index entries + $pageword_idx = idx_getIndex('pageword',''); + if ($pid<count($pageword_idx)) { + $oldwords = explode(':',trim($pageword_idx[$pid])); + $delwords = array_diff($oldwords, $pagewords); + foreach ($delwords as $word) { + if($word=='') continue; + list($wlen,$wid) = explode('*',$word); + $wid = (int)$wid; + // make the disk cache work for its money + // $pagewords is sorted, so this shouldn't be a significant penalty + $index = idx_getIndex('i',$wlen); + $index[$wid] = idx_updateIndexLine($index[$wid],$pid,0); + idx_saveIndex('i',$wlen,$index); + } + if (!empty($delwords)) { + // Save the reverse index + $pageword_idx[$pid] = join(':',$pagewords)."\n"; + if(!idx_saveIndex('pageword','',$pageword_idx)){ + trigger_error("Failed to write word index", E_USER_ERROR); + return false; + } + } + } else { + // Save the reverse index + $pageword_idx[$pid] = join(':',$pagewords)."\n"; + if(!idx_saveIndex('pageword','',$pageword_idx)){ + trigger_error("Failed to write word index", E_USER_ERROR); return false; } } @@ -532,4 +567,40 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ return 
$words; } +/** + * Create a pagewords index from the existing index. + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function idx_upgradePageWords(){ + global $conf; + $page_idx = idx_getIndex('page',''); + if (empty($page_idx)) return; + $pagewords = array(); + for ($n=0;$n<count($page_idx);$n++) $pagewords[] = array(); + unset($page_idx); + + $n=0; + foreach (idx_indexLengths($n) as $wlen) { + $lines = idx_getIndex('i',$wlen); + for ($wid=0;$wid<count($lines);$wid++) { + $wkey = "$wlen*$wid"; + foreach (explode(':',trim($lines[$wid])) as $part) { + if($part == '') continue; + list($doc,$cnt) = explode('*',$part); + $pagewords[(int)$doc][] = $wkey; + } + } + } + + $pageword_idx = array(); + foreach ($pagewords as $line) + $pageword_idx[] = join(':',$line)."\n"; + if(!idx_saveIndex('pageword','',$pageword_idx)){ + trigger_error("Failed to write word index", E_USER_ERROR); + return false; + } + return true; +} + //Setup VIM: ex: et ts=4 enc=utf-8 : diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 04e46a6c8..282aa73ac 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -14,7 +14,7 @@ session_write_close(); //close session if(!defined('NL')) define('NL',"\n"); // Version tag used to force rebuild on upgrade -define('INDEXER_VERSION', 1); +define('INDEXER_VERSION', 2); // keep running after browser closes connection @ignore_user_abort(true); @@ -167,6 +167,10 @@ function runIndexer(){ require_once(DOKU_INC.'inc/indexer.php'); + // upgrade to version 2 + if (!@file_exists($conf['indexdir'].'/pageword.idx')) + idx_upgradePageWords(); + // do the work idx_addPage($ID); |