diff options
Diffstat (limited to 'inc')
-rw-r--r-- | inc/fulltext.php | 2 | ||||
-rw-r--r-- | inc/indexer.php | 319 | ||||
-rw-r--r-- | inc/init.php | 5 |
3 files changed, 202 insertions, 124 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index 448f72248..1534ec1a8 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -149,7 +149,7 @@ function ft_backlinks($id){ function ft_pageLookup($id,$pageonly=true){ global $conf; $id = preg_quote($id,'/'); - $pages = file($conf['cachedir'].'/page.idx'); + $pages = file($conf['indexdir'].'/page.idx'); $pages = array_values(preg_grep('/'.$id.'/',$pages)); $cnt = count($pages); diff --git a/inc/indexer.php b/inc/indexer.php index e6550c2e4..c90f2b179 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -26,16 +26,59 @@ define('IDX_ASIAN','['. /** + * Write a list of strings to an index file. + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function idx_saveIndex($pre, $wlen, $idx){ + global $conf; + $fn = $conf['indexdir'].'/'.$pre.$wlen; + $fh = @fopen($fn.'.tmp','w'); + if(!$fh) return false; + fwrite($fh,join('',$idx)); + fclose($fh); + if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']); + io_rename($fn.'.tmp', $fn.'.idx'); + return true; +} + +/** + * Read the list of words in an index (if it exists). + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function idx_getIndex($pre, $wlen){ + global $conf; + $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; + if(!@file_exists($fn)) return array(); + return file($fn); +} + +/** + * Create an empty index file if it doesn't exist yet. + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function idx_touchIndex($pre, $wlen){ + global $conf; + $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx'; + if(!@file_exists($fn)){ + touch($fn); + if($conf['fperm']) chmod($fn, $conf['fperm']); + } +} + +/** * Split a page into words * - * Returns an array of of word counts, false if an error occured + * Returns an array of word counts, false if an error occured. + * Array is keyed on the word length, then the word index. * * @author Andreas Gohr <andi@splitbrain.org> * @author Christopher Smith <chris@jalakai.co.uk> */ function idx_getPageWords($page){ global $conf; - $word_idx = file($conf['cachedir'].'/word.idx'); $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; if(@file_exists($swfile)){ $stopwords = file($swfile); @@ -65,47 +108,40 @@ function idx_getPageWords($page){ $words = array(); foreach ($tokens as $word => $count) { - // simple filter to restrict use of utf8_stripspecials - if (preg_match('/[^0-9A-Za-z]/u', $word)) { - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word); - if(!is_null($asia)) $word = $asia; //recover from regexp failure - $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*')); - $arr = array_count_values($arr); - - foreach ($arr as $w => $c) { - if (!is_numeric($w) && strlen($w) < 3) continue; - $w = utf8_strtolower($w); - $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0); + $arr = idx_tokenizer($word,$stopwords); + $arr = array_count_values($arr); + foreach ($arr as $w => $c) { + $l = strlen($w); + if(isset($words[$l])){ + $words[$l][$w] = $c * $count + (isset($words[$l][$w])) ? $words[$l][$w] : 0; + }else{ + $words[$l] = array($w => $c * $count); } - } else { - if (!is_numeric($word) && strlen($word) < 3) continue; - $word = strtolower($word); - $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0); } } - // arrive here with $words = array(word => frequency) + // arrive here with $words = array(wordlen => array(word => frequency)) $index = array(); //resulting index - foreach ($words as $word => $freq) { - if (is_int(array_search("$word\n",$stopwords))) continue; - $wid = array_search("$word\n",$word_idx); - if(!is_int($wid)){ - $word_idx[] = "$word\n"; - $wid = count($word_idx)-1; + foreach (array_keys($words) as $wlen){ + $word_idx = idx_getIndex('w',$wlen); + foreach ($words[$wlen] as $word => $freq) { + $wid = array_search("$word\n",$word_idx); + if(!is_int($wid)){ + $word_idx[] = "$word\n"; + $wid = count($word_idx)-1; + } + if(!isset($index[$wlen])) + $index[$wlen] = array(); + $index[$wlen][$wid] = $freq; } - $index[$wid] = $freq; - } - // save back word index - $fh = fopen($conf['cachedir'].'/word.idx','w'); - if(!$fh){ - trigger_error("Failed to write word.idx", E_USER_ERROR); - return false; + // save back word index + if(!idx_saveIndex('w',$wlen,$word_idx)){ + trigger_error("Failed to write word index", E_USER_ERROR); + return false; + } } - fwrite($fh,join('',$word_idx)); - fclose($fh); return $index; } @@ -123,7 +159,7 @@ function idx_addPage($page){ global $conf; // load known documents - $page_idx = file($conf['cachedir'].'/page.idx'); + $page_idx = idx_getIndex('page',''); // get page id (this is the linenumber in page.idx) $pid = array_search("$page\n",$page_idx); @@ -131,10 +167,8 @@ function idx_addPage($page){ $page_idx[] = "$page\n"; $pid = count($page_idx)-1; // page was new - write back - $fh = fopen($conf['cachedir'].'/page.idx','w'); - if(!$fh) return false; - fwrite($fh,join('',$page_idx)); - fclose($fh); + if (!idx_saveIndex('page','',$page_idx)) + return false; } // get word usage in page @@ -142,46 +176,51 @@ function idx_addPage($page){ if($words === false) return false; if(!count($words)) return true; - // Open index and temp file - $idx = fopen($conf['cachedir'].'/index.idx','r'); - $tmp = fopen($conf['cachedir'].'/index.tmp','w'); - if(!$idx || !$tmp){ - trigger_error("Failed to open index files", E_USER_ERROR); - return false; - } + foreach(array_keys($words) as $wlen){ + // Open index and temp file + $fn = $conf['indexdir']."/i$wlen"; + idx_touchIndex('i',$wlen); + $idx = fopen($fn.'.idx','r'); + $tmp = fopen($fn.'.tmp','w'); + if(!$idx || !$tmp){ + trigger_error("Failed to open index files", E_USER_ERROR); + return false; + } - // copy from index to temp file, modifying were needed - $lno = 0; - $line = ''; - while (!feof($idx)) { - // read full line - $line .= fgets($idx, 4096); - if(substr($line,-1) != "\n") continue; + // copy from index to temp file, modifying where needed + $lno = 0; + $line = ''; + while (!feof($idx)) { + // read full line + $line .= fgets($idx, 4096); + if(substr($line,-1) != "\n") continue; - // write a new Line to temp file - idx_writeIndexLine($tmp,$line,$pid,$words[$lno]); + // write a new Line to temp file + idx_writeIndexLine($tmp,$line,$pid,$words[$wlen][$lno]); - $line = ''; // reset line buffer - $lno++; // increase linecounter - } - fclose($idx); - - // add missing lines (usually index and word should contain - // the same number of lines, however if the page contained - // new words the word file has some more lines which need to - // be added here - $word_idx = file($conf['cachedir'].'/word.idx'); - $wcnt = count($word_idx); - for($lno; $lno<$wcnt; $lno++){ - idx_writeIndexLine($tmp,'',$pid,$words[$lno]); + $line = ''; // reset line buffer + $lno++; // increase linecounter + } + fclose($idx); + + // add missing lines (usually index and word should contain + // the same number of lines, however if the page contained + // new words the word file has some more lines which need to + // be added here + $word_idx = idx_getIndex('w',$wlen); + $wcnt = count($word_idx); + for($lno; $lno<$wcnt; $lno++){ + idx_writeIndexLine($tmp,'',$pid,$words[$wlen][$lno]); + } + + // close the temp file and move it over to be the new one + fclose($tmp); + if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']); + // try rename first (fast) fallback to copy (slow) + io_rename($fn.'.tmp', $fn.'.idx'); } - // close the temp file and move it over to be the new one - fclose($tmp); - // try rename first (fast) fallback to copy (slow) - io_rename($conf['cachedir'].'/index.tmp', - $conf['cachedir'].'/index.idx'); - return false; + return true; } /** @@ -218,6 +257,34 @@ function idx_writeIndexLine($fh,$line,$pid,$count){ } /** + * Get the word lengths that have been indexed. + * + * Reads the index directory and returns an array of lengths + * that there are indices for. + * + * @author Tom N Harris <tnharris@whoopdedo.org> + */ +function idx_indexLengths($minlen){ + global $conf; + $dir = @opendir($conf['indexdir']); + if($dir===false) + return array(); + $idx = array(); + // Exact match first. + if(@file_exists($conf['indexdir']."/i$minlen.idx")) + $idx[] = $minlen; + while (($f = readdir($dir)) !== false) { + if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){ + $i = substr($f,1,-4); + if (is_numeric($i) && $i > $minlen) + $idx[] = $i; + } + } + closedir($dir); + return $idx; +} + +/** * Lookup words in index * * Takes an array of word and will return a list of matching @@ -234,8 +301,7 @@ function idx_lookup($words){ $result = array(); // load known words and documents - $page_idx = file($conf['cachedir'].'/page.idx'); - $word_idx = file($conf['cachedir'].'/word.idx'); + $page_idx = idx_getIndex('page',''); // get word IDs $wids = array(); @@ -243,80 +309,93 @@ function idx_lookup($words){ $result[$word] = array(); $wild = 0; $xword = $word; + $wlen = strlen($word); // check for wildcards if(substr($xword,0,1) == '*'){ $xword = substr($xword,1); $wild = 1; $ptn = '/'.preg_quote($xword,'/').'$/'; + $wlen -= 1; # $l = -1*strlen($xword)-1; } if(substr($xword,-1,1) == '*'){ $xword = substr($xword,0,-1); $wild += 2; + $wlen -= 1; } + if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue; // look for the ID(s) for the given word if($wild){ // handle wildcard search - $cnt = count($word_idx); - for($wid=0; $wid<$cnt; $wid++){ - $iword = $word_idx[$wid]; - if( (($wild==3) && is_int(strpos($iword,$xword))) || -# (($wild==1) && ("$xword\n" == substr($iword,$l))) || - (($wild==1) && preg_match($ptn,$iword)) || -# (($wild==2) && ($xword == substr($iword,0,strlen($xword)))) - (($wild==2) && (0 === strpos($iword,$xword))) - - ){ - $wids[] = $wid; - $result[$word][] = $wid; + foreach (idx_indexLengths($wlen) as $ixlen){ + $word_idx = idx_getIndex('w',$ixlen); + $cnt = count($word_idx); + for($wid=0; $wid<$cnt; $wid++){ + $iword = $word_idx[$wid]; + if( (($wild==3) && is_int(strpos($iword,$xword))) || +# (($wild==1) && ("$xword\n" == substr($iword,$l))) || + (($wild==1) && preg_match($ptn,$iword)) || +# (($wild==2) && ($xword == substr($iword,0,strlen($xword)))) + (($wild==2) && (0 === strpos($iword,$xword))) + + ){ + if(!isset($wids[$ixlen])) $wids[$ixlen] = array(); + $wids[$ixlen][] = $wid; + $result[$word][] = "$ixlen*$wid"; + } } } }else{ // handle exact search + $word_idx = idx_getIndex('w',$wlen); $wid = array_search("$word\n",$word_idx); if(is_int($wid)){ - $wids[] = $wid; - $result[$word][] = $wid; + $wids[$wlen] = array($wid); + $result[$word][] = "$wlen*$wid"; }else{ $result[$word] = array(); } } } - sort($wids); - $wids = array_unique($wids); - - // Open index - $idx = fopen($conf['cachedir'].'/index.idx','r'); - if(!$idx){ - msg("Failed to open index file",-1); - return false; - } - // Walk the index til the lines are found $docs = array(); // hold docs found - $lno = 0; - $line = ''; - $srch = array_shift($wids); // which word do we look for? - while (!feof($idx)) { - // read full line - $line .= fgets($idx, 4096); - if(substr($line,-1) != "\n") continue; - if($lno > $srch) break; // shouldn't happen - - - // do we want this line? - if($lno == $srch){ - // add docs to list - $docs[$srch] = idx_parseIndexLine($page_idx,$line); - - $srch = array_shift($wids); // next word to look up - if($srch == null) break; // no more words + foreach(array_keys($wids) as $wlen){ + sort($wids[$wlen]); + $wids[$wlen] = array_unique($wids[$wlen]); + + // Open index + idx_touchIndex('i',$wlen); + $idx = fopen($conf['indexdir']."/i$wlen.idx",'r'); + if(!$idx){ + msg("Failed to open index file",-1); + return false; } - $line = ''; // reset line buffer - $lno++; // increase linecounter + // Walk the index til the lines are found + $lno = 0; + $line = ''; + $ixids =& $wids[$wlen]; + $srch = array_shift($ixids); // which word do we look for? + while (!feof($idx)) { + // read full line + $line .= fgets($idx, 4096); + if(substr($line,-1) != "\n") continue; + if($lno > $srch) break; // shouldn't happen + + // do we want this line? + if($lno == $srch){ + // add docs to list + $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line); + + $srch = array_shift($ixids); // next word to look up + if($srch == null) break; // no more words + } + + $line = ''; // reset line buffer + $lno++; // increase linecounter + } + fclose($idx); } - fclose($idx); // merge found pages into final result array diff --git a/inc/init.php b/inc/init.php index 552f98526..c097cd5bd 100644 --- a/inc/init.php +++ b/inc/init.php @@ -133,6 +133,7 @@ function init_paths(){ 'mediadir' => 'media', 'metadir' => 'meta', 'cachedir' => 'cache', + 'indexdir' => 'index', 'lockdir' => 'locks'); foreach($paths as $c => $p){ @@ -157,9 +158,7 @@ function init_paths(){ function init_files(){ global $conf; - $files = array( $conf['cachedir'].'/word.idx', - $conf['cachedir'].'/page.idx', - $conf['cachedir'].'/index.idx'); + $files = array( $conf['indexdir'].'/page.idx'); foreach($files as $file){ if(!@file_exists($file)){ |