diff options
author | Andreas Gohr <andi@splitbrain.org> | 2008-02-23 21:52:54 +0100 |
---|---|---|
committer | Andreas Gohr <andi@splitbrain.org> | 2008-02-23 21:52:54 +0100 |
commit | a05e297acbd41dc059369b143e2cadf281a581a1 (patch) | |
tree | ea599c6c71513567e9dddc23e6ac3be21888b22f /inc/fulltext.php | |
parent | b5742ced86ad0a0e0448556d81e6c97c12ae9d9f (diff) | |
download | rpg-a05e297acbd41dc059369b143e2cadf281a581a1.tar.gz rpg-a05e297acbd41dc059369b143e2cadf281a581a1.tar.bz2 |
use fulltext index to search for used media files FS#1336 FS#1275
This changes how DokuWiki looks for references to a media file which is
about to be deleted. Instead of doing a full grep through all pages it now
uses the fulltext index first, then does an exact match on the found
pages.
This speeds up the search significantly on larger wikis. However the
fulltext search limits now apply: images with names shorter than 3 characters
may not be found.
This needs extensive testing!
darcs-hash:20080223205254-7ad00-486de0a4125d51b4e7999827f710d1d9de8bc60d.gz
Diffstat (limited to 'inc/fulltext.php')
-rw-r--r-- | inc/fulltext.php | 50 |
1 files changed, 50 insertions, 0 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index b10cbde8e..a0be280bf 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -153,6 +153,56 @@ function ft_backlinks($id){ } /** + * Returns the pages that use a given media file + * + * Does a quick lookup with the fulltext index, then + * evaluates the instructions of the found pages + * + * Aborts after $max found results + */ +function ft_mediause($id,$max){ + global $conf; + $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; + $stopwords = @file_exists($swfile) ? file($swfile) : array(); + + if(!$max) $max = 1; // need to find at least one + + $result = array(); + + // quick lookup of the mediafile + $media = noNS($id); + $matches = idx_lookup(idx_tokenizer($media,$stopwords)); + $docs = array_keys(ft_resultCombine(array_values($matches))); + if(!count($docs)) return $result; + + // go through all found pages + $found = 0; + $pcre = preg_quote($media,'/'); + foreach($docs as $doc){ + $ns = getNS($doc); + preg_match_all('/\{\{([^|}]*'.$pcre.'[^|}]*)(|[^}]+)?\}\}/i',rawWiki($doc),$matches); + foreach($matches[1] as $img){ + $img = trim($img); + if(preg_match('/^https?:\/\//i',$img)) continue; // skip external images + list($img) = explode('?',$img); // remove any parameters + resolve_mediaid($ns,$img,$exists); // resolve the possibly relative img + + if($img == $id){ // we have a match + $result[] = $doc; + $found++; + break; + } + } + if($found >= $max) break; + } + + sort($result); + return $result; +} + + + +/** * Quicksearch for pagenames * * By default it only matches the pagename and ignores the |