summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTNHarris <telliamed@fastmail.us>2006-11-12 20:49:00 +0100
committerTNHarris <telliamed@fastmail.us>2006-11-12 20:49:00 +0100
commit579b0f7e8d80287b11fd441dfa68d15e9d4bb74c (patch)
treec2ba569b8493a7b724df57c90dcd07a89890c8a3
parentc64b282c718d5e728c9a6555cf3e29ebd1bf4524 (diff)
downloadrpg-579b0f7e8d80287b11fd441dfa68d15e9d4bb74c.tar.gz
rpg-579b0f7e8d80287b11fd441dfa68d15e9d4bb74c.tar.bz2
Word-Length Indexer
A modification to the indexer that sorts words based on length. This should make searching a little bit more efficient. After the patch is applied, your old index will be automatically converted to the new format (when you visit a page). The new index format is: 1. Index files are stored in savedir/index 2. Word lists are stored as wlen.idx. This used to be word.idx. 3. Word indexes are stored as ilen.idx. This used to be index.idx. 4. The page list, page.idx, is simply copied to the new location. Any plugins you have, such as the blog plugin, that read the index files need to be updated. darcs-hash:20061112194900-2b9f0-a975498ccf0a1d39c6df73b79bcd028d5e81c389.gz
-rw-r--r--data/index/_dummy0
-rw-r--r--inc/fulltext.php2
-rw-r--r--inc/indexer.php319
-rw-r--r--inc/init.php5
-rw-r--r--lib/exe/indexer.php14
-rw-r--r--lib/plugins/importoldindex/action.php58
6 files changed, 273 insertions, 125 deletions
diff --git a/data/index/_dummy b/data/index/_dummy
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/data/index/_dummy
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 448f72248..1534ec1a8 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -149,7 +149,7 @@ function ft_backlinks($id){
function ft_pageLookup($id,$pageonly=true){
global $conf;
$id = preg_quote($id,'/');
- $pages = file($conf['cachedir'].'/page.idx');
+ $pages = file($conf['indexdir'].'/page.idx');
$pages = array_values(preg_grep('/'.$id.'/',$pages));
$cnt = count($pages);
diff --git a/inc/indexer.php b/inc/indexer.php
index e6550c2e4..c90f2b179 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -26,16 +26,59 @@ define('IDX_ASIAN','['.
/**
+ * Write a list of strings to an index file.
+ *
+ * The entries are joined without a separator, so each one must carry its
+ * own line ending. The data is written to "<indexdir>/<pre><wlen>.tmp"
+ * first and only moved over "<indexdir>/<pre><wlen>.idx" once it has been
+ * written completely.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @param  string $pre  file name prefix (e.g. 'w' or 'page')
+ * @param  mixed  $wlen word length appended to the prefix ('' for none)
+ * @param  array  $idx  the index lines to store
+ * @return bool   false if the temp file could not be opened or fully written
+ */
+function idx_saveIndex($pre, $wlen, $idx){
+    global $conf;
+    $fn = $conf['indexdir'].'/'.$pre.$wlen;
+    $fh = @fopen($fn.'.tmp','w');
+    if(!$fh) return false;
+    $data = join('',$idx);
+    $written = fwrite($fh,$data);
+    fclose($fh);
+    // never replace a good index with a truncated one (disk full, quota, ...)
+    if($written === false || $written < strlen($data)){
+        @unlink($fn.'.tmp');
+        return false;
+    }
+    if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
+    io_rename($fn.'.tmp', $fn.'.idx');
+    return true;
+}
+
+/**
+ * Read the list of words in an index (if it exists).
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @param  string $pre  file name prefix (e.g. 'w' or 'page')
+ * @param  mixed  $wlen word length appended to the prefix ('' for none)
+ * @return array  the lines of "<indexdir>/<pre><wlen>.idx", line endings
+ *                included; an empty array if the file is missing or
+ *                cannot be read
+ */
+function idx_getIndex($pre, $wlen){
+    global $conf;
+    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
+    if(!@file_exists($fn)) return array();
+    $lines = file($fn);
+    // file() yields false on a read error; callers count() and
+    // array_search() the result, so always hand back an array
+    if($lines === false) return array();
+    return $lines;
+}
+
+/**
+ * Create an empty index file if it doesn't exist yet.
+ *
+ * An existing file is left untouched. A newly created file gets the
+ * permissions from $conf['fperm'] when that setting is non-empty.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @param  string $pre  file name prefix (e.g. 'i')
+ * @param  mixed  $wlen word length appended to the prefix
+ */
+function idx_touchIndex($pre, $wlen){
+    global $conf;
+    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
+    if(@file_exists($fn)) return;
+    // only adjust permissions when the file was actually created
+    if(touch($fn) && $conf['fperm']) chmod($fn, $conf['fperm']);
+}
+
+/**
* Split a page into words
*
- * Returns an array of of word counts, false if an error occured
+ * Returns an array of word counts, false if an error occurred.
+ * Array is keyed on the word length, then the word index.
*
* @author Andreas Gohr <andi@splitbrain.org>
* @author Christopher Smith <chris@jalakai.co.uk>
*/
function idx_getPageWords($page){
global $conf;
- $word_idx = file($conf['cachedir'].'/word.idx');
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
if(@file_exists($swfile)){
$stopwords = file($swfile);
@@ -65,47 +108,40 @@ function idx_getPageWords($page){
$words = array();
foreach ($tokens as $word => $count) {
- // simple filter to restrict use of utf8_stripspecials
- if (preg_match('/[^0-9A-Za-z]/u', $word)) {
- // handle asian chars as single words (may fail on older PHP version)
- $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
- if(!is_null($asia)) $word = $asia; //recover from regexp failure
- $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
- $arr = array_count_values($arr);
-
- foreach ($arr as $w => $c) {
- if (!is_numeric($w) && strlen($w) < 3) continue;
- $w = utf8_strtolower($w);
- $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
+ $arr = idx_tokenizer($word,$stopwords);
+ $arr = array_count_values($arr);
+ foreach ($arr as $w => $c) {
+ $l = strlen($w);
+            // create the bucket for this word length on first use
+            if(!isset($words[$l])) $words[$l] = array();
+            // the whole ternary must be parenthesised: ?: binds looser than +,
+            // otherwise the sum becomes the condition and nothing is accumulated
+            $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
- } else {
- if (!is_numeric($word) && strlen($word) < 3) continue;
- $word = strtolower($word);
- $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
}
}
- // arrive here with $words = array(word => frequency)
+ // arrive here with $words = array(wordlen => array(word => frequency))
$index = array(); //resulting index
- foreach ($words as $word => $freq) {
- if (is_int(array_search("$word\n",$stopwords))) continue;
- $wid = array_search("$word\n",$word_idx);
- if(!is_int($wid)){
- $word_idx[] = "$word\n";
- $wid = count($word_idx)-1;
+ foreach (array_keys($words) as $wlen){
+ $word_idx = idx_getIndex('w',$wlen);
+ foreach ($words[$wlen] as $word => $freq) {
+ $wid = array_search("$word\n",$word_idx);
+ if(!is_int($wid)){
+ $word_idx[] = "$word\n";
+ $wid = count($word_idx)-1;
+ }
+ if(!isset($index[$wlen]))
+ $index[$wlen] = array();
+ $index[$wlen][$wid] = $freq;
}
- $index[$wid] = $freq;
- }
- // save back word index
- $fh = fopen($conf['cachedir'].'/word.idx','w');
- if(!$fh){
- trigger_error("Failed to write word.idx", E_USER_ERROR);
- return false;
+ // save back word index
+ if(!idx_saveIndex('w',$wlen,$word_idx)){
+ trigger_error("Failed to write word index", E_USER_ERROR);
+ return false;
+ }
}
- fwrite($fh,join('',$word_idx));
- fclose($fh);
return $index;
}
@@ -123,7 +159,7 @@ function idx_addPage($page){
global $conf;
// load known documents
- $page_idx = file($conf['cachedir'].'/page.idx');
+ $page_idx = idx_getIndex('page','');
// get page id (this is the linenumber in page.idx)
$pid = array_search("$page\n",$page_idx);
@@ -131,10 +167,8 @@ function idx_addPage($page){
$page_idx[] = "$page\n";
$pid = count($page_idx)-1;
// page was new - write back
- $fh = fopen($conf['cachedir'].'/page.idx','w');
- if(!$fh) return false;
- fwrite($fh,join('',$page_idx));
- fclose($fh);
+ if (!idx_saveIndex('page','',$page_idx))
+ return false;
}
// get word usage in page
@@ -142,46 +176,51 @@ function idx_addPage($page){
if($words === false) return false;
if(!count($words)) return true;
- // Open index and temp file
- $idx = fopen($conf['cachedir'].'/index.idx','r');
- $tmp = fopen($conf['cachedir'].'/index.tmp','w');
- if(!$idx || !$tmp){
- trigger_error("Failed to open index files", E_USER_ERROR);
- return false;
- }
+ foreach(array_keys($words) as $wlen){
+ // Open index and temp file
+ $fn = $conf['indexdir']."/i$wlen";
+ idx_touchIndex('i',$wlen);
+ $idx = fopen($fn.'.idx','r');
+ $tmp = fopen($fn.'.tmp','w');
+ if(!$idx || !$tmp){
+ trigger_error("Failed to open index files", E_USER_ERROR);
+ return false;
+ }
- // copy from index to temp file, modifying were needed
- $lno = 0;
- $line = '';
- while (!feof($idx)) {
- // read full line
- $line .= fgets($idx, 4096);
- if(substr($line,-1) != "\n") continue;
+ // copy from index to temp file, modifying where needed
+ $lno = 0;
+ $line = '';
+ while (!feof($idx)) {
+ // read full line
+ $line .= fgets($idx, 4096);
+ if(substr($line,-1) != "\n") continue;
- // write a new Line to temp file
- idx_writeIndexLine($tmp,$line,$pid,$words[$lno]);
+ // write a new Line to temp file
+ idx_writeIndexLine($tmp,$line,$pid,$words[$wlen][$lno]);
- $line = ''; // reset line buffer
- $lno++; // increase linecounter
- }
- fclose($idx);
-
- // add missing lines (usually index and word should contain
- // the same number of lines, however if the page contained
- // new words the word file has some more lines which need to
- // be added here
- $word_idx = file($conf['cachedir'].'/word.idx');
- $wcnt = count($word_idx);
- for($lno; $lno<$wcnt; $lno++){
- idx_writeIndexLine($tmp,'',$pid,$words[$lno]);
+ $line = ''; // reset line buffer
+ $lno++; // increase linecounter
+ }
+ fclose($idx);
+
+ // add missing lines (usually index and word should contain
+ // the same number of lines, however if the page contained
+ // new words the word file has some more lines which need to
+ // be added here
+ $word_idx = idx_getIndex('w',$wlen);
+ $wcnt = count($word_idx);
+ for($lno; $lno<$wcnt; $lno++){
+ idx_writeIndexLine($tmp,'',$pid,$words[$wlen][$lno]);
+ }
+
+ // close the temp file and move it over to be the new one
+ fclose($tmp);
+ if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
+ // try rename first (fast) fallback to copy (slow)
+ io_rename($fn.'.tmp', $fn.'.idx');
}
- // close the temp file and move it over to be the new one
- fclose($tmp);
- // try rename first (fast) fallback to copy (slow)
- io_rename($conf['cachedir'].'/index.tmp',
- $conf['cachedir'].'/index.idx');
- return false;
+ return true;
}
/**
@@ -218,6 +257,34 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
}
/**
+ * Get the word lengths that have been indexed.
+ *
+ * Reads the index directory and returns an array of lengths
+ * that there are indices for. $minlen itself comes first when
+ * "i<minlen>.idx" exists, followed by every longer length found,
+ * in the order readdir() reports the files.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @param  int   $minlen shortest word length of interest
+ * @return array word lengths (>= $minlen) that have an "i<len>.idx" file;
+ *               empty if the index directory cannot be opened
+ */
+function idx_indexLengths($minlen){
+    global $conf;
+    $dir = @opendir($conf['indexdir']);
+    if($dir===false)
+        return array();
+    $idx = array();
+    // Exact match first.
+    if(@file_exists($conf['indexdir']."/i$minlen.idx"))
+        $idx[] = $minlen;
+    while (($f = readdir($dir)) !== false) {
+        // accept only "i<decimal digits>.idx"; is_numeric() would also let
+        // names such as "i1e3.idx" or "i3.5.idx" through as bogus lengths
+        if (preg_match('/^i(0|[1-9][0-9]*)\.idx$/', $f, $m) && (int)$m[1] > $minlen)
+            $idx[] = (int)$m[1];
+    }
+    closedir($dir);
+    return $idx;
+}
+
+/**
* Lookup words in index
*
* Takes an array of word and will return a list of matching
@@ -234,8 +301,7 @@ function idx_lookup($words){
$result = array();
// load known words and documents
- $page_idx = file($conf['cachedir'].'/page.idx');
- $word_idx = file($conf['cachedir'].'/word.idx');
+ $page_idx = idx_getIndex('page','');
// get word IDs
$wids = array();
@@ -243,80 +309,93 @@ function idx_lookup($words){
$result[$word] = array();
$wild = 0;
$xword = $word;
+ $wlen = strlen($word);
// check for wildcards
if(substr($xword,0,1) == '*'){
$xword = substr($xword,1);
$wild = 1;
$ptn = '/'.preg_quote($xword,'/').'$/';
+ $wlen -= 1;
# $l = -1*strlen($xword)-1;
}
if(substr($xword,-1,1) == '*'){
$xword = substr($xword,0,-1);
$wild += 2;
+ $wlen -= 1;
}
+ if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
// look for the ID(s) for the given word
if($wild){ // handle wildcard search
- $cnt = count($word_idx);
- for($wid=0; $wid<$cnt; $wid++){
- $iword = $word_idx[$wid];
- if( (($wild==3) && is_int(strpos($iword,$xword))) ||
-# (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
- (($wild==1) && preg_match($ptn,$iword)) ||
-# (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
- (($wild==2) && (0 === strpos($iword,$xword)))
-
- ){
- $wids[] = $wid;
- $result[$word][] = $wid;
+ foreach (idx_indexLengths($wlen) as $ixlen){
+ $word_idx = idx_getIndex('w',$ixlen);
+ $cnt = count($word_idx);
+ for($wid=0; $wid<$cnt; $wid++){
+ $iword = $word_idx[$wid];
+ if( (($wild==3) && is_int(strpos($iword,$xword))) ||
+# (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
+ (($wild==1) && preg_match($ptn,$iword)) ||
+# (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+ (($wild==2) && (0 === strpos($iword,$xword)))
+
+ ){
+ if(!isset($wids[$ixlen])) $wids[$ixlen] = array();
+ $wids[$ixlen][] = $wid;
+ $result[$word][] = "$ixlen*$wid";
+ }
}
}
}else{ // handle exact search
+ $word_idx = idx_getIndex('w',$wlen);
$wid = array_search("$word\n",$word_idx);
if(is_int($wid)){
- $wids[] = $wid;
- $result[$word][] = $wid;
+            // append instead of overwrite: several search terms (or earlier
+            // wildcard hits) may share the same word length
+            if(!isset($wids[$wlen])) $wids[$wlen] = array();
+            $wids[$wlen][] = $wid;
+            $result[$word][] = "$wlen*$wid";
}else{
$result[$word] = array();
}
}
}
- sort($wids);
- $wids = array_unique($wids);
-
- // Open index
- $idx = fopen($conf['cachedir'].'/index.idx','r');
- if(!$idx){
- msg("Failed to open index file",-1);
- return false;
- }
- // Walk the index til the lines are found
$docs = array(); // hold docs found
- $lno = 0;
- $line = '';
- $srch = array_shift($wids); // which word do we look for?
- while (!feof($idx)) {
- // read full line
- $line .= fgets($idx, 4096);
- if(substr($line,-1) != "\n") continue;
- if($lno > $srch) break; // shouldn't happen
-
-
- // do we want this line?
- if($lno == $srch){
- // add docs to list
- $docs[$srch] = idx_parseIndexLine($page_idx,$line);
-
- $srch = array_shift($wids); // next word to look up
- if($srch == null) break; // no more words
+ foreach(array_keys($wids) as $wlen){
+ sort($wids[$wlen]);
+ $wids[$wlen] = array_unique($wids[$wlen]);
+
+ // Open index
+ idx_touchIndex('i',$wlen);
+ $idx = fopen($conf['indexdir']."/i$wlen.idx",'r');
+ if(!$idx){
+ msg("Failed to open index file",-1);
+ return false;
}
- $line = ''; // reset line buffer
- $lno++; // increase linecounter
+ // Walk the index til the lines are found
+ $lno = 0;
+ $line = '';
+ $ixids =& $wids[$wlen];
+ $srch = array_shift($ixids); // which word do we look for?
+ while (!feof($idx)) {
+ // read full line
+ $line .= fgets($idx, 4096);
+ if(substr($line,-1) != "\n") continue;
+ if($lno > $srch) break; // shouldn't happen
+
+ // do we want this line?
+ if($lno == $srch){
+ // add docs to list
+ $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line);
+
+ $srch = array_shift($ixids); // next word to look up
+ if($srch == null) break; // no more words
+ }
+
+ $line = ''; // reset line buffer
+ $lno++; // increase linecounter
+ }
+ fclose($idx);
}
- fclose($idx);
// merge found pages into final result array
diff --git a/inc/init.php b/inc/init.php
index 552f98526..c097cd5bd 100644
--- a/inc/init.php
+++ b/inc/init.php
@@ -133,6 +133,7 @@ function init_paths(){
'mediadir' => 'media',
'metadir' => 'meta',
'cachedir' => 'cache',
+ 'indexdir' => 'index',
'lockdir' => 'locks');
foreach($paths as $c => $p){
@@ -157,9 +158,7 @@ function init_paths(){
function init_files(){
global $conf;
- $files = array( $conf['cachedir'].'/word.idx',
- $conf['cachedir'].'/page.idx',
- $conf['cachedir'].'/index.idx');
+ $files = array( $conf['indexdir'].'/page.idx');
foreach($files as $file){
if(!@file_exists($file)){
diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php
index 224c54311..12177406f 100644
--- a/lib/exe/indexer.php
+++ b/lib/exe/indexer.php
@@ -120,6 +120,18 @@ function runIndexer(){
global $conf;
print "runIndexer(): started".NL;
+ // Move index files (if needed)
+ // Uses the importoldindex plugin to upgrade the index automatically.
+ // FIXME: Remove this from runIndexer when it is no longer needed.
+ if (@file_exists($conf['cachedir'].'/page.idx') &&
+ (!@file_exists($conf['indexdir'].'/page.idx') ||
+ !filesize($conf['indexdir'].'/page.idx')) &&
+ !@file_exists($conf['indexdir'].'/index_importing')) {
+ echo "trigger TEMPORARY_INDEX_UPGRADE_EVENT\n";
+ $tmp = array(); // no event data
+ trigger_event('TEMPORARY_INDEX_UPGRADE_EVENT', $tmp);
+ }
+
$ID = cleanID($_REQUEST['id']);
if(!$ID) return false;
@@ -233,7 +245,7 @@ function runSitemapper(){
return false;
}
- $pages = file($conf['cachedir'].'/page.idx');
+ $pages = file($conf['indexdir'].'/page.idx');
print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;
// build the sitemap
diff --git a/lib/plugins/importoldindex/action.php b/lib/plugins/importoldindex/action.php
new file mode 100644
index 000000000..26b37664c
--- /dev/null
+++ b/lib/plugins/importoldindex/action.php
@@ -0,0 +1,58 @@
+<?php
+// must be run within Dokuwiki
+if(!defined('DOKU_INC')) die();
+
+if(!defined('DOKU_PLUGIN')) define('DOKU_PLUGIN',DOKU_INC.'lib/plugins/');
+require_once(DOKU_PLUGIN.'action.php');
+
+/**
+ * One-shot action plugin that converts the old single-file search index
+ * (cache/word.idx + cache/index.idx) into per-word-length files
+ * (index/w<len>.idx + index/i<len>.idx) and copies page.idx along.
+ */
+class action_plugin_importoldindex extends DokuWiki_Action_Plugin {
+
+    /**
+     * Return the plugin meta data.
+     */
+    function getInfo(){
+        return array(
+            'author' => 'Tom N Harris',
+            'email'  => 'tnharris@whoopdedo.org',
+            'date'   => '2006-11-09',
+            'name'   => 'Import Old Index',
+            'desc'   => 'Moves old index files to a new location, sorted by string length.',
+            'url'    => 'http://whoopdedo.org/doku/wiki'
+            );
+    }
+
+    /**
+     * Hook run_import() in before TEMPORARY_INDEX_UPGRADE_EVENT.
+     */
+    function register(&$controller) {
+        $controller->register_hook('TEMPORARY_INDEX_UPGRADE_EVENT', 'BEFORE', $this, 'run_import');
+    }
+
+    /**
+     * Read a file into an array of lines; a missing or unreadable file
+     * yields an empty array instead of false.
+     */
+    function _readLines($fn) {
+        if(!@file_exists($fn)) return array();
+        $lines = file($fn);
+        return ($lines === false) ? array() : $lines;
+    }
+
+    /**
+     * Split the old index by word length and store it in the index directory.
+     *
+     * While the import runs, an "index_importing" file in the index
+     * directory serves as a lock. The plugin disables itself afterwards.
+     *
+     * @param object $event the triggered event (not used)
+     * @param mixed  $args  hook parameters (not used)
+     */
+    function run_import(&$event, $args) {
+        global $conf;
+
+        touch($conf['indexdir'].'/index_importing'); // index importing lock
+        // load old index
+        $word_idx = $this->_readLines($conf['cachedir'].'/word.idx');
+        $idx      = $this->_readLines($conf['cachedir'].'/index.idx');
+
+        // group the old line numbers by the length of their word
+        $words = array();
+        $cnt   = count($word_idx);
+        for ($lno=0; $lno<$cnt; $lno++){
+            $wlen = strlen($word_idx[$lno])-1; // do not count the trailing newline
+            //if($wlen<3) continue;
+            if(!isset($words[$wlen])) $words[$wlen] = array();
+            $words[$wlen][] = $lno;
+        }
+
+        // line N of w<len>.idx has to match line N of i<len>.idx
+        foreach (array_keys($words) as $wlen) {
+            $new_words = array();
+            $new_idx = array();
+            foreach ($words[$wlen] as $lno) {
+                $new_words[] = $word_idx[$lno];
+                // index.idx may be shorter than word.idx; use an empty index
+                // line then, so both new files keep the same number of lines
+                $new_idx[] = isset($idx[$lno]) ? $idx[$lno] : "\n";
+            }
+            io_saveFile($conf['indexdir']."/w$wlen.idx", implode('', $new_words));
+            io_saveFile($conf['indexdir']."/i$wlen.idx", implode('', $new_idx));
+        }
+
+        // the page list keeps its format, only the location changes
+        if(@copy($conf['cachedir'].'/page.idx', $conf['indexdir'].'/page.idx') && $conf['fperm']){
+            chmod($conf['indexdir'].'/page.idx', $conf['fperm']);
+        }
+        unlink($conf['indexdir'].'/index_importing'); // index importing unlock
+        plugin_disable('importoldindex'); // only needs to run once
+    }
+
+}
+