From a0c5c34961149cd101a49729d5dc0b73ed8ae2a5 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Wed, 19 Sep 2007 21:42:44 +0200 Subject: Remove obsolete words from search index Creates another index file 'pageword.idx' for the words in each page. Words that are deleted from a page can then be removed from the word index. The indexer version is incremented to force rebuilding of the index. Also, a minor flaw in the regexp for Asian words is fixed. darcs-hash:20070919194244-6942e-2e08157dcf4fdf166b35b36a0faf8a3dfb415ad9.gz --- bin/indexer.php | 10 ++++- inc/indexer.php | 103 ++++++++++++++++++++++++++++++++++++++++++++-------- lib/exe/indexer.php | 6 ++- 3 files changed, 101 insertions(+), 18 deletions(-) diff --git a/bin/indexer.php b/bin/indexer.php index d5f4c6446..c62f97882 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -13,7 +13,7 @@ session_write_close(); // Version tag used to force rebuild on upgrade // Need to keep in sync with lib/exe/indexer.php -if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 1); +if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2); // handle options $short_opts = 'hcuq'; @@ -68,6 +68,14 @@ function _usage() { function _update(){ global $conf; + + // upgrade to version 2 + if (!@file_exists($conf['indexdir'].'/pageword.idx')){ + _lock(); + idx_upgradePageWords(); + _unlock(); + } + $data = array(); _quietecho("Searching pages... "); search($data,$conf['datadir'],'search_allpages',array()); diff --git a/inc/indexer.php b/inc/indexer.php index 319e2b6e4..ca446827d 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -21,7 +21,7 @@ define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai define('IDX_ASIAN2','['. '\x{2E80}-\x{3040}'. // CJK -> Hangul '\x{309D}-\x{30A0}'. - '\x{30FB}-\x{31EF}\x{3200}-\x{D7AF}'. + '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'. '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs '\x{FE30}-\x{FE4F}'. 
// CJK Compatibility Forms ']'); @@ -216,27 +216,62 @@ function idx_addPage($page){ } } + $pagewords = array(); // get word usage in page $words = idx_getPageWords($page); if($words === false) return false; - if(!count($words)) return true; - foreach(array_keys($words) as $wlen){ - $index = idx_getIndex('i',$wlen); - foreach($words[$wlen] as $wid => $freq){ - if($wid $freq){ + if($wid + */ +function idx_upgradePageWords(){ + global $conf; + $page_idx = idx_getIndex('page',''); + if (empty($page_idx)) return; + $pagewords = array(); + for ($n=0;$n