summary refs log tree commit diff
diff options
context:
space:
mode:
author Tom N Harris <tnharris@whoopdedo.org> 2007-09-19 21:42:44 +0200
committer Tom N Harris <tnharris@whoopdedo.org> 2007-09-19 21:42:44 +0200
commit a0c5c34961149cd101a49729d5dc0b73ed8ae2a5 (patch)
tree 0b0e2a1311cfd50ffb74ee033a17718181baf5a8
parent 11d9dfa5f39e8005a998d25bca1885847c2cb561 (diff)
download rpg-a0c5c34961149cd101a49729d5dc0b73ed8ae2a5.tar.gz
rpg-a0c5c34961149cd101a49729d5dc0b73ed8ae2a5.tar.bz2
Remove obsolete words from search index
Creates another index file, 'pageword.idx', that records the words used in each page. Words that are deleted from a page can then be removed from the word index. The indexer version is incremented to force a rebuild of the index. Also, a minor flaw in the regexp for Asian words is fixed. darcs-hash:20070919194244-6942e-2e08157dcf4fdf166b35b36a0faf8a3dfb415ad9.gz
-rwxr-xr-x bin/indexer.php 10
-rw-r--r-- inc/indexer.php 103
-rw-r--r-- lib/exe/indexer.php 6
3 files changed, 101 insertions, 18 deletions
diff --git a/bin/indexer.php b/bin/indexer.php
index d5f4c6446..c62f97882 100755
--- a/bin/indexer.php
+++ b/bin/indexer.php
@@ -13,7 +13,7 @@ session_write_close();
// Version tag used to force rebuild on upgrade
// Need to keep in sync with lib/exe/indexer.php
-if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 1);
+if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2);
// handle options
$short_opts = 'hcuq';
@@ -68,6 +68,14 @@ function _usage() {
function _update(){
global $conf;
+
+ // upgrade to version 2
+ if (!@file_exists($conf['indexdir'].'/pageword.idx')){
+ _lock();
+ idx_upgradePageWords();
+ _unlock();
+ }
+
$data = array();
_quietecho("Searching pages... ");
search($data,$conf['datadir'],'search_allpages',array());
diff --git a/inc/indexer.php b/inc/indexer.php
index 319e2b6e4..ca446827d 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -21,7 +21,7 @@ define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
'\x{2E80}-\x{3040}'. // CJK -> Hangul
'\x{309D}-\x{30A0}'.
- '\x{30FB}-\x{31EF}\x{3200}-\x{D7AF}'.
+ '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
'\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs
'\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms
']');
@@ -216,27 +216,62 @@ function idx_addPage($page){
}
}
+ $pagewords = array();
// get word usage in page
$words = idx_getPageWords($page);
if($words === false) return false;
- if(!count($words)) return true;
- foreach(array_keys($words) as $wlen){
- $index = idx_getIndex('i',$wlen);
- foreach($words[$wlen] as $wid => $freq){
- if($wid<count($index)){
- $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq);
- }else{
- // New words **should** have been added in increasing order
- // starting with the first unassigned index.
- // If someone can show how this isn't true, then I'll need to sort
- // or do something special.
- $index[$wid] = idx_updateIndexLine('',$pid,$freq);
+ if(!empty($words)) {
+ foreach(array_keys($words) as $wlen){
+ $index = idx_getIndex('i',$wlen);
+ foreach($words[$wlen] as $wid => $freq){
+ if($wid<count($index)){
+ $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq);
+ }else{
+ // New words **should** have been added in increasing order
+ // starting with the first unassigned index.
+ // If someone can show how this isn't true, then I'll need to sort
+ // or do something special.
+ $index[$wid] = idx_updateIndexLine('',$pid,$freq);
+ }
+ $pagewords[] = "$wlen*$wid";
+ }
+ // save back word index
+ if(!idx_saveIndex('i',$wlen,$index)){
+ trigger_error("Failed to write index", E_USER_ERROR);
+ return false;
}
}
- // save back word index
- if(!idx_saveIndex('i',$wlen,$index)){
- trigger_error("Failed to write index", E_USER_ERROR);
+ }
+
+ // Remove obsolete index entries
+ $pageword_idx = idx_getIndex('pageword','');
+ if ($pid<count($pageword_idx)) {
+ $oldwords = explode(':',trim($pageword_idx[$pid]));
+ $delwords = array_diff($oldwords, $pagewords);
+ foreach ($delwords as $word) {
+ if($word=='') continue;
+ list($wlen,$wid) = explode('*',$word);
+ $wid = (int)$wid;
+ // make the disk cache work for its money
+ // $pagewords is sorted, so this shouldn't be a significant penalty
+ $index = idx_getIndex('i',$wlen);
+ $index[$wid] = idx_updateIndexLine($index[$wid],$pid,0);
+ idx_saveIndex('i',$wlen,$index);
+ }
+ if (!empty($delwords)) {
+ // Save the reverse index
+ $pageword_idx[$pid] = join(':',$pagewords)."\n";
+ if(!idx_saveIndex('pageword','',$pageword_idx)){
+ trigger_error("Failed to write word index", E_USER_ERROR);
+ return false;
+ }
+ }
+ } else {
+ // Save the reverse index
+ $pageword_idx[$pid] = join(':',$pagewords)."\n";
+ if(!idx_saveIndex('pageword','',$pageword_idx)){
+ trigger_error("Failed to write word index", E_USER_ERROR);
return false;
}
}
@@ -532,4 +567,40 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
return $words;
}
+/**
+ * Create a pagewords index from the existing index.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_upgradePageWords(){
+ global $conf;
+ $page_idx = idx_getIndex('page','');
+ if (empty($page_idx)) return;
+ $pagewords = array();
+ for ($n=0;$n<count($page_idx);$n++) $pagewords[] = array();
+ unset($page_idx);
+
+ $n=0;
+ foreach (idx_indexLengths($n) as $wlen) {
+ $lines = idx_getIndex('i',$wlen);
+ for ($wid=0;$wid<count($lines);$wid++) {
+ $wkey = "$wlen*$wid";
+ foreach (explode(':',trim($lines[$wid])) as $part) {
+ if($part == '') continue;
+ list($doc,$cnt) = explode('*',$part);
+ $pagewords[(int)$doc][] = $wkey;
+ }
+ }
+ }
+
+ $pageword_idx = array();
+ foreach ($pagewords as $line)
+ $pageword_idx[] = join(':',$line)."\n";
+ if(!idx_saveIndex('pageword','',$pageword_idx)){
+ trigger_error("Failed to write word index", E_USER_ERROR);
+ return false;
+ }
+ return true;
+}
+
//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php
index 04e46a6c8..282aa73ac 100644
--- a/lib/exe/indexer.php
+++ b/lib/exe/indexer.php
@@ -14,7 +14,7 @@ session_write_close(); //close session
if(!defined('NL')) define('NL',"\n");
// Version tag used to force rebuild on upgrade
-define('INDEXER_VERSION', 1);
+define('INDEXER_VERSION', 2);
// keep running after browser closes connection
@ignore_user_abort(true);
@@ -167,6 +167,10 @@ function runIndexer(){
require_once(DOKU_INC.'inc/indexer.php');
+ // upgrade to version 2
+ if (!@file_exists($conf['indexdir'].'/pageword.idx'))
+ idx_upgradePageWords();
+
// do the work
idx_addPage($ID);