From a0c5c34961149cd101a49729d5dc0b73ed8ae2a5 Mon Sep 17 00:00:00 2001
From: Tom N Harris
Date: Wed, 19 Sep 2007 21:42:44 +0200
Subject: Remove obsolete words from search index

Creates another index file 'pagewords.idx' for the words in each page.
Words that are deleted from a page can then be removed from the word index.
The indexer version is incremented to force rebuilding of the index.
Also, a minor flaw in the regexp for Asian words is fixed.

darcs-hash:20070919194244-6942e-2e08157dcf4fdf166b35b36a0faf8a3dfb415ad9.gz
---
 inc/indexer.php | 103 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 16 deletions(-)

(limited to 'inc')

diff --git a/inc/indexer.php b/inc/indexer.php
index 319e2b6e4..ca446827d 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -21,7 +21,7 @@ define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
 define('IDX_ASIAN2','['.
 '\x{2E80}-\x{3040}'. // CJK -> Hangul
 '\x{309D}-\x{30A0}'.
- '\x{30FB}-\x{31EF}\x{3200}-\x{D7AF}'.
+ '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
 '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs
 '\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms
 ']');
@@ -216,27 +216,62 @@ function idx_addPage($page){
 }
 }

+ $pagewords = array();
 // get word usage in page
 $words = idx_getPageWords($page);
 if($words === false) return false;
- if(!count($words)) return true;
- foreach(array_keys($words) as $wlen){
- $index = idx_getIndex('i',$wlen);
- foreach($words[$wlen] as $wid => $freq){
- if($wid $freq){
+ if($wid
+ */
+function idx_upgradePageWords(){
+ global $conf;
+ $page_idx = idx_getIndex('page','');
+ if (empty($page_idx)) return;
+ $pagewords = array();
+ for ($n=0;$n