summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- data/index/_dummy 0
-rw-r--r-- inc/fulltext.php 2
-rw-r--r-- inc/indexer.php 319
-rw-r--r-- inc/init.php 5
-rw-r--r-- lib/exe/indexer.php 14
-rw-r--r-- lib/plugins/importoldindex/action.php 58
6 files changed, 273 insertions, 125 deletions
diff --git a/data/index/_dummy b/data/index/_dummy
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/data/index/_dummy
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 448f72248..1534ec1a8 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -149,7 +149,7 @@ function ft_backlinks($id){
function ft_pageLookup($id,$pageonly=true){
global $conf;
$id = preg_quote($id,'/');
- $pages = file($conf['cachedir'].'/page.idx');
+ $pages = file($conf['indexdir'].'/page.idx');
$pages = array_values(preg_grep('/'.$id.'/',$pages));
$cnt = count($pages);
diff --git a/inc/indexer.php b/inc/indexer.php
index e6550c2e4..c90f2b179 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -26,16 +26,59 @@ define('IDX_ASIAN','['.
/**
+ * Write a list of strings to an index file.
+ *
+ * The data is first written to "<indexdir>/<pre><wlen>.tmp" and only then
+ * moved over "<indexdir>/<pre><wlen>.idx", so the existing index is not
+ * touched until the new content has been written out completely.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string     $pre  index file name prefix
+ * @param int|string $wlen word length appended to the prefix ('' for none)
+ * @param array      $idx  index lines; they are joined without a separator,
+ *                         so every entry must carry its own trailing newline
+ * @return bool true when the index was saved, false otherwise
+ */
+function idx_saveIndex($pre, $wlen, $idx){
+    global $conf;
+    $fn  = $conf['indexdir'].'/'.$pre.$wlen;
+    $tmp = $fn.'.tmp';
+
+    $fh = @fopen($tmp,'w');
+    if(!$fh) return false;
+
+    $data    = join('',$idx);
+    $written = fwrite($fh,$data);
+    fclose($fh);
+
+    // a failed or short write (e.g. disk full) must never replace a good index
+    if($written === false || $written < strlen($data)){
+        @unlink($tmp);
+        return false;
+    }
+
+    if($conf['fperm']) chmod($tmp, $conf['fperm']);
+
+    // io_rename() is defined outside this file; only an explicit false is
+    // treated as a failure so a helper without a return value keeps working
+    if(io_rename($tmp, $fn.'.idx') === false){
+        @unlink($tmp);
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Load the lines of an index file.
+ *
+ * The file looked up is "<indexdir>/<pre><wlen>.idx".
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string     $pre  index file name prefix
+ * @param int|string $wlen word length appended to the prefix ('' for none)
+ * @return array|false whatever file() returns for an existing index (one
+ *                     entry per line, line endings kept); an empty array
+ *                     when the index file does not exist
+ */
+function idx_getIndex($pre, $wlen){
+    global $conf;
+    $path = "{$conf['indexdir']}/{$pre}{$wlen}.idx";
+    return @file_exists($path) ? file($path) : array();
+}
+
+/**
+ * Make sure an index file exists, creating an empty one when necessary.
+ *
+ * An index that is already present is left completely untouched. A newly
+ * created file gets the configured file permissions, if any are set.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string     $pre  index file name prefix
+ * @param int|string $wlen word length appended to the prefix ('' for none)
+ */
+function idx_touchIndex($pre, $wlen){
+    global $conf;
+    $path = "{$conf['indexdir']}/{$pre}{$wlen}.idx";
+    if(@file_exists($path)) return; // nothing to do
+
+    touch($path);
+    if($conf['fperm']){
+        chmod($path, $conf['fperm']);
+    }
+}
+
+/**
* Split a page into words
*
- * Returns an array of of word counts, false if an error occured
+ * Returns an array of word counts, false if an error occurred.
+ * Array is keyed on the word length, then the word index.
*
* @author Andreas Gohr <andi@splitbrain.org>
* @author Christopher Smith <chris@jalakai.co.uk>
*/
function idx_getPageWords($page){
global $conf;
- $word_idx = file($conf['cachedir'].'/word.idx');
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
if(@file_exists($swfile)){
$stopwords = file($swfile);
@@ -65,47 +108,40 @@ function idx_getPageWords($page){
$words = array();
foreach ($tokens as $word => $count) {
- // simple filter to restrict use of utf8_stripspecials
- if (preg_match('/[^0-9A-Za-z]/u', $word)) {
- // handle asian chars as single words (may fail on older PHP version)
- $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
- if(!is_null($asia)) $word = $asia; //recover from regexp failure
- $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
- $arr = array_count_values($arr);
-
- foreach ($arr as $w => $c) {
- if (!is_numeric($w) && strlen($w) < 3) continue;
- $w = utf8_strtolower($w);
- $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
+ $arr = idx_tokenizer($word,$stopwords);
+ $arr = array_count_values($arr);
+ foreach ($arr as $w => $c) {
+ $l = strlen($w);
+ if(isset($words[$l])){
+ $words[$l][$w] = $c * $count + (isset($words[$l][$w])) ? $words[$l][$w] : 0;
+ }else{
+ $words[$l] = array($w => $c * $count);
}
- } else {
- if (!is_numeric($word) && strlen($word) < 3) continue;
- $word = strtolower($word);
- $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
}
}
- // arrive here with $words = array(word => frequency)
+ // arrive here with $words = array(wordlen => array(word => frequency))
$index = array(); //resulting index
- foreach ($words as $word => $freq) {
- if (is_int(array_search("$word\n",$stopwords))) continue;
- $wid = array_search("$word\n",$word_idx);
- if(!is_int($wid)){
- $word_idx[] = "$word\n";
- $wid = count($word_idx)-1;
+ foreach (array_keys($words) as $wlen){
+ $word_idx = idx_getIndex('w',$wlen);
+ foreach ($words[$wlen] as $word => $freq) {
+ $wid = array_search("$word\n",$word_idx);
+ if(!is_int($wid)){
+ $word_idx[] = "$word\n";
+ $wid = count($word_idx)-1;
+ }
+ if(!isset($index[$wlen]))
+ $index[$wlen] = array();
+ $index[$wlen][$wid] = $freq;
}
- $index[$wid] = $freq;
- }
- // save back word index
- $fh = fopen($conf['cachedir'].'/word.idx','w');
- if(!$fh){
- trigger_error("Failed to write word.idx", E_USER_ERROR);
- return false;
+ // save back word index
+ if(!idx_saveIndex('w',$wlen,$word_idx)){
+ trigger_error("Failed to write word index", E_USER_ERROR);
+ return false;
+ }
}
- fwrite($fh,join('',$word_idx));
- fclose($fh);
return $index;
}
@@ -123,7 +159,7 @@ function idx_addPage($page){
global $conf;
// load known documents
- $page_idx = file($conf['cachedir'].'/page.idx');
+ $page_idx = idx_getIndex('page','');
// get page id (this is the linenumber in page.idx)
$pid = array_search("$page\n",$page_idx);
@@ -131,10 +167,8 @@ function idx_addPage($page){
$page_idx[] = "$page\n";
$pid = count($page_idx)-1;
// page was new - write back
- $fh = fopen($conf['cachedir'].'/page.idx','w');
- if(!$fh) return false;
- fwrite($fh,join('',$page_idx));
- fclose($fh);
+ if (!idx_saveIndex('page','',$page_idx))
+ return false;
}
// get word usage in page
@@ -142,46 +176,51 @@ function idx_addPage($page){
if($words === false) return false;
if(!count($words)) return true;
- // Open index and temp file
- $idx = fopen($conf['cachedir'].'/index.idx','r');
- $tmp = fopen($conf['cachedir'].'/index.tmp','w');
- if(!$idx || !$tmp){
- trigger_error("Failed to open index files", E_USER_ERROR);
- return false;
- }
+ foreach(array_keys($words) as $wlen){
+ // Open index and temp file
+ $fn = $conf['indexdir']."/i$wlen";
+ idx_touchIndex('i',$wlen);
+ $idx = fopen($fn.'.idx','r');
+ $tmp = fopen($fn.'.tmp','w');
+ if(!$idx || !$tmp){
+ trigger_error("Failed to open index files", E_USER_ERROR);
+ return false;
+ }
- // copy from index to temp file, modifying were needed
- $lno = 0;
- $line = '';
- while (!feof($idx)) {
- // read full line
- $line .= fgets($idx, 4096);
- if(substr($line,-1) != "\n") continue;
+ // copy from index to temp file, modifying where needed
+ $lno = 0;
+ $line = '';
+ while (!feof($idx)) {
+ // read full line
+ $line .= fgets($idx, 4096);
+ if(substr($line,-1) != "\n") continue;
- // write a new Line to temp file
- idx_writeIndexLine($tmp,$line,$pid,$words[$lno]);
+ // write a new Line to temp file
+ idx_writeIndexLine($tmp,$line,$pid,$words[$wlen][$lno]);
- $line = ''; // reset line buffer
- $lno++; // increase linecounter
- }
- fclose($idx);
-
- // add missing lines (usually index and word should contain
- // the same number of lines, however if the page contained
- // new words the word file has some more lines which need to
- // be added here
- $word_idx = file($conf['cachedir'].'/word.idx');
- $wcnt = count($word_idx);
- for($lno; $lno<$wcnt; $lno++){
- idx_writeIndexLine($tmp,'',$pid,$words[$lno]);
+ $line = ''; // reset line buffer
+ $lno++; // increase linecounter
+ }
+ fclose($idx);
+
+ // add missing lines (usually index and word should contain
+ // the same number of lines, however if the page contained
+ // new words the word file has some more lines which need to
+ // be added here
+ $word_idx = idx_getIndex('w',$wlen);
+ $wcnt = count($word_idx);
+ for($lno; $lno<$wcnt; $lno++){
+ idx_writeIndexLine($tmp,'',$pid,$words[$wlen][$lno]);
+ }
+
+ // close the temp file and move it over to be the new one
+ fclose($tmp);
+ if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
+ // try rename first (fast) fallback to copy (slow)
+ io_rename($fn.'.tmp', $fn.'.idx');
}
- // close the temp file and move it over to be the new one
- fclose($tmp);
- // try rename first (fast) fallback to copy (slow)
- io_rename($conf['cachedir'].'/index.tmp',
- $conf['cachedir'].'/index.idx');
- return false;
+ return true;
}
/**
@@ -218,6 +257,34 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
}
/**
+ * List the word lengths for which an index file is available.
+ *
+ * Scans the index directory for files named "i<length>.idx". The result
+ * starts with $minlen itself when "i<minlen>.idx" exists, followed by every
+ * numeric length greater than $minlen in the order the directory yields
+ * them (no sorting is applied).
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param int $minlen smallest word length of interest
+ * @return array matching lengths; lengths read from file names are strings.
+ *               Empty when the index directory cannot be opened.
+ */
+function idx_indexLengths($minlen){
+    global $conf;
+    $base   = $conf['indexdir'];
+    $handle = @opendir($base);
+    if($handle === false) return array();
+
+    // the exact length goes first, provided an index for it exists
+    $lengths = @file_exists("$base/i$minlen.idx") ? array($minlen) : array();
+
+    for($entry = readdir($handle); $entry !== false; $entry = readdir($handle)){
+        if(substr($entry,0,1) != 'i') continue;
+        if(substr($entry,-4) != '.idx') continue;
+        $len = substr($entry,1,-4);
+        if(!is_numeric($len)) continue;
+        if($len > $minlen) $lengths[] = $len;
+    }
+    closedir($handle);
+    return $lengths;
+}
+
+/**
* Lookup words in index
*
* Takes an array of word and will return a list of matching
@@ -234,8 +301,7 @@ function idx_lookup($words){
$result = array();
// load known words and documents
- $page_idx = file($conf['cachedir'].'/page.idx');
- $word_idx = file($conf['cachedir'].'/word.idx');
+ $page_idx = idx_getIndex('page','');
// get word IDs
$wids = array();
@@ -243,80 +309,93 @@ function idx_lookup($words){
$result[$word] = array();
$wild = 0;
$xword = $word;
+ $wlen = strlen($word);
// check for wildcards
if(substr($xword,0,1) == '*'){
$xword = substr($xword,1);
$wild = 1;
$ptn = '/'.preg_quote($xword,'/').'$/';
+ $wlen -= 1;
# $l = -1*strlen($xword)-1;
}
if(substr($xword,-1,1) == '*'){
$xword = substr($xword,0,-1);
$wild += 2;
+ $wlen -= 1;
}
+ if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
// look for the ID(s) for the given word
if($wild){ // handle wildcard search
- $cnt = count($word_idx);
- for($wid=0; $wid<$cnt; $wid++){
- $iword = $word_idx[$wid];
- if( (($wild==3) && is_int(strpos($iword,$xword))) ||
-# (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
- (($wild==1) && preg_match($ptn,$iword)) ||
-# (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
- (($wild==2) && (0 === strpos($iword,$xword)))
-
- ){
- $wids[] = $wid;
- $result[$word][] = $wid;
+ foreach (idx_indexLengths($wlen) as $ixlen){
+ $word_idx = idx_getIndex('w',$ixlen);
+ $cnt = count($word_idx);
+ for($wid=0; $wid<$cnt; $wid++){
+ $iword = $word_idx[$wid];
+ if( (($wild==3) && is_int(strpos($iword,$xword))) ||
+# (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
+ (($wild==1) && preg_match($ptn,$iword)) ||
+# (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+ (($wild==2) && (0 === strpos($iword,$xword)))
+
+ ){
+ if(!isset($wids[$ixlen])) $wids[$ixlen] = array();
+ $wids[$ixlen][] = $wid;
+ $result[$word][] = "$ixlen*$wid";
+ }
}
}
}else{ // handle exact search
+ $word_idx = idx_getIndex('w',$wlen);
$wid = array_search("$word\n",$word_idx);
if(is_int($wid)){
- $wids[] = $wid;
- $result[$word][] = $wid;
+ $wids[$wlen] = array($wid);
+ $result[$word][] = "$wlen*$wid";
}else{
$result[$word] = array();
}
}
}
- sort($wids);
- $wids = array_unique($wids);
-
- // Open index
- $idx = fopen($conf['cachedir'].'/index.idx','r');
- if(!$idx){
- msg("Failed to open index file",-1);
- return false;
- }
- // Walk the index til the lines are found
$docs = array(); // hold docs found
- $lno = 0;
- $line = '';
- $srch = array_shift($wids); // which word do we look for?
- while (!feof($idx)) {
- // read full line
- $line .= fgets($idx, 4096);
- if(substr($line,-1) != "\n") continue;
- if($lno > $srch) break; // shouldn't happen
-
-
- // do we want this line?
- if($lno == $srch){
- // add docs to list
- $docs[$srch] = idx_parseIndexLine($page_idx,$line);
-
- $srch = array_shift($wids); // next word to look up
- if($srch == null) break; // no more words
+ foreach(array_keys($wids) as $wlen){
+ sort($wids[$wlen]);
+ $wids[$wlen] = array_unique($wids[$wlen]);
+
+ // Open index
+ idx_touchIndex('i',$wlen);
+ $idx = fopen($conf['indexdir']."/i$wlen.idx",'r');
+ if(!$idx){
+ msg("Failed to open index file",-1);
+ return false;
}
- $line = ''; // reset line buffer
- $lno++; // increase linecounter
+ // Walk the index til the lines are found
+ $lno = 0;
+ $line = '';
+ $ixids =& $wids[$wlen];
+ $srch = array_shift($ixids); // which word do we look for?
+ while (!feof($idx)) {
+ // read full line
+ $line .= fgets($idx, 4096);
+ if(substr($line,-1) != "\n") continue;
+ if($lno > $srch) break; // shouldn't happen
+
+ // do we want this line?
+ if($lno == $srch){
+ // add docs to list
+ $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line);
+
+ $srch = array_shift($ixids); // next word to look up
+ if($srch == null) break; // no more words
+ }
+
+ $line = ''; // reset line buffer
+ $lno++; // increase linecounter
+ }
+ fclose($idx);
}
- fclose($idx);
// merge found pages into final result array
diff --git a/inc/init.php b/inc/init.php
index 552f98526..c097cd5bd 100644
--- a/inc/init.php
+++ b/inc/init.php
@@ -133,6 +133,7 @@ function init_paths(){
'mediadir' => 'media',
'metadir' => 'meta',
'cachedir' => 'cache',
+ 'indexdir' => 'index',
'lockdir' => 'locks');
foreach($paths as $c => $p){
@@ -157,9 +158,7 @@ function init_paths(){
function init_files(){
global $conf;
- $files = array( $conf['cachedir'].'/word.idx',
- $conf['cachedir'].'/page.idx',
- $conf['cachedir'].'/index.idx');
+ $files = array( $conf['indexdir'].'/page.idx');
foreach($files as $file){
if(!@file_exists($file)){
diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php
index 224c54311..12177406f 100644
--- a/lib/exe/indexer.php
+++ b/lib/exe/indexer.php
@@ -120,6 +120,18 @@ function runIndexer(){
global $conf;
print "runIndexer(): started".NL;
+ // Move index files (if needed)
+ // Uses the importoldindex plugin to upgrade the index automatically.
+ // FIXME: Remove this from runIndexer when it is no longer needed.
+ if (@file_exists($conf['cachedir'].'/page.idx') &&
+ (!@file_exists($conf['indexdir'].'/page.idx') ||
+ !filesize($conf['indexdir'].'/page.idx')) &&
+ !@file_exists($conf['indexdir'].'/index_importing')) {
+ echo "trigger TEMPORARY_INDEX_UPGRADE_EVENT\n";
+ $tmp = array(); // no event data
+ trigger_event('TEMPORARY_INDEX_UPGRADE_EVENT', $tmp);
+ }
+
$ID = cleanID($_REQUEST['id']);
if(!$ID) return false;
@@ -233,7 +245,7 @@ function runSitemapper(){
return false;
}
- $pages = file($conf['cachedir'].'/page.idx');
+ $pages = file($conf['indexdir'].'/page.idx');
print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;
// build the sitemap
diff --git a/lib/plugins/importoldindex/action.php b/lib/plugins/importoldindex/action.php
new file mode 100644
index 000000000..26b37664c
--- /dev/null
+++ b/lib/plugins/importoldindex/action.php
@@ -0,0 +1,58 @@
+<?php
+// must be run within DokuWiki: stop right away when DOKU_INC has not been defined by the including script
+if(!defined('DOKU_INC')) die();
+
+if(!defined('DOKU_PLUGIN')) define('DOKU_PLUGIN',DOKU_INC.'lib/plugins/'); // default plugin directory unless already defined
+require_once(DOKU_PLUGIN.'action.php'); // presumably provides the DokuWiki_Action_Plugin base class - TODO confirm
+
+/**
+ * One-shot action plugin that converts the old single-file search index
+ * (word.idx / index.idx / page.idx in the cache directory) into the new
+ * layout in the index directory, where words are split by length into
+ * w<len>.idx and i<len>.idx files.
+ */
+class action_plugin_importoldindex extends DokuWiki_Action_Plugin {
+
+    /**
+     * Return descriptive information about this plugin.
+     *
+     * @return array author, email, date, name, desc and url entries
+     */
+    function getInfo(){
+        return array(
+            'author' => 'Tom N Harris',
+            'email'  => 'tnharris@whoopdedo.org',
+            'date'   => '2006-11-09',
+            'name'   => 'Import Old Index',
+            'desc'   => 'Moves old index files to a new location, sorted by string length.',
+            'url'    => 'http://whoopdedo.org/doku/wiki'
+        );
+    }
+
+    /**
+     * Attach run_import() to the temporary index upgrade event.
+     *
+     * @param object $controller event controller offering register_hook()
+     */
+    function register(&$controller) {
+        $controller->register_hook('TEMPORARY_INDEX_UPGRADE_EVENT', 'BEFORE', $this, 'run_import');
+    }
+
+    /**
+     * Split the old index by word length and write it to the index directory.
+     *
+     * Line N of the old word.idx and line N of the old index.idx belong
+     * together; both are moved to the files for that word's length so that
+     * w<len>.idx and i<len>.idx stay aligned line by line. Finally page.idx
+     * is copied over and the plugin disables itself.
+     *
+     * @param object $event the triggering event (not used)
+     * @param mixed  $args  handler parameter (not used)
+     */
+    function run_import(&$event, $args) {
+        global $conf;
+
+        $lock = $conf['indexdir'].'/index_importing';
+        touch($lock); // index importing lock
+
+        // load old index; a missing or unreadable file counts as empty
+        $word_idx = @file($conf['cachedir'].'/word.idx');
+        if(!is_array($word_idx)) $word_idx = array();
+        $idx = @file($conf['cachedir'].'/index.idx');
+        if(!is_array($idx)) $idx = array();
+
+        // group the old line numbers by the length of their word
+        $words = array();
+        foreach ($word_idx as $lno => $line) {
+            // strip the newline instead of subtracting one so that a final
+            // line without a line ending is measured correctly as well
+            $wlen = strlen(rtrim($line,"\n"));
+            //if($wlen<3) continue;
+            if(!isset($words[$wlen])) $words[$wlen] = array();
+            $words[$wlen][] = $lno;
+        }
+
+        foreach ($words as $wlen => $lnos) {
+            $new_words = array();
+            $new_idx   = array();
+            foreach ($lnos as $lno) {
+                // every entry is written with exactly one line ending
+                $new_words[] = rtrim($word_idx[$lno],"\n")."\n";
+                // index.idx may be shorter than word.idx; an empty line keeps
+                // the two new files aligned instead of shifting later entries
+                $new_idx[] = isset($idx[$lno]) ? rtrim($idx[$lno],"\n")."\n" : "\n";
+            }
+            io_saveFile($conf['indexdir']."/w$wlen.idx", implode('', $new_words));
+            io_saveFile($conf['indexdir']."/i$wlen.idx", implode('', $new_idx));
+        }
+
+        // only adjust permissions when the copy actually produced the file
+        if(@copy($conf['cachedir'].'/page.idx', $conf['indexdir'].'/page.idx')){
+            if($conf['fperm']) chmod($conf['indexdir'].'/page.idx', $conf['fperm']);
+        }
+        unlink($lock); // index importing unlock
+        plugin_disable('importoldindex'); // only needs to run once
+    }
+
+}
+