diff options
author | Tom N Harris <tnharris@whoopdedo.org> | 2010-12-29 03:50:05 -0500 |
---|---|---|
committer | Tom N Harris <tnharris@whoopdedo.org> | 2010-12-29 03:50:05 -0500 |
commit | 9b41be2446ea725a496f34b28ac4db84bece57c9 (patch) | |
tree | bb002063560d6132c6cda8d85cd1d1d089084f27 | |
parent | 00803e562833be06ab5a869541581314b9b84d58 (diff) | |
download | rpg-9b41be2446ea725a496f34b28ac4db84bece57c9.tar.gz rpg-9b41be2446ea725a496f34b28ac4db84bece57c9.tar.bz2 |
Indexer v3 Rewrite part two, update uses of indexer
-rwxr-xr-x | bin/indexer.php | 39 | ||||
-rw-r--r-- | inc/Sitemapper.php | 2 | ||||
-rw-r--r-- | inc/fulltext.php | 73 | ||||
-rw-r--r-- | inc/indexer.php | 54 | ||||
-rw-r--r-- | inc/init.php | 2 | ||||
-rw-r--r-- | lib/exe/indexer.php | 35 | ||||
-rw-r--r-- | lib/exe/xmlrpc.php | 27 |
7 files changed, 111 insertions, 121 deletions
diff --git a/bin/indexer.php b/bin/indexer.php index 497c6146a..0d523df6e 100755 --- a/bin/indexer.php +++ b/bin/indexer.php @@ -24,6 +24,7 @@ if ( $OPTS->isError() ) { } $CLEAR = false; $QUIET = false; +$INDEXER = null; foreach ($OPTS->options as $key => $val) { switch ($key) { case 'h': @@ -66,6 +67,9 @@ function _usage() { function _update(){ global $conf; + global $INDEXER; + + $INDEXER = idx_get_indexer(); $data = array(); _quietecho("Searching pages... "); @@ -78,25 +82,47 @@ function _update(){ } function _index($id){ + global $INDEXER; global $CLEAR; + global $QUIET; // if not cleared only update changed and new files if(!$CLEAR){ $idxtag = metaFN($id,'.indexed'); if(@file_exists($idxtag)){ if(io_readFile($idxtag) == idx_get_version()){ - $last = @filemtime(metaFN($id,'.indexed')); + $last = @filemtime($idxtag); if($last > @filemtime(wikiFN($id))) return; } } } - _lock(); _quietecho("$id... "); - idx_addPage($id); - io_saveFile(metaFN($id,'.indexed'), idx_get_version()); + $body = ''; + $data = array($id, $body); + $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); + if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($id); + $evt->advise_after(); + unset($evt); + list($id,$body) = $data; + $said = false; + while(true) { + $result = $INDEXER->addPageWords($id, $body); + if ($result == "locked") { + if($said){ + _quietecho("."); + }else{ + _quietecho("Waiting for lockfile (max. 5 min)"); + $said = true; + } + sleep(15); + } else { + break; + } + } + if ($result) + io_saveFile(metaFN($id,'.indexed'), idx_get_version()); _quietecho("done.\n"); - _unlock(); } /** @@ -141,7 +167,7 @@ function _clearindex(){ _lock(); _quietecho("Clearing index... "); io_saveFile($conf['indexdir'].'/page.idx',''); - io_saveFile($conf['indexdir'].'/title.idx',''); + //io_saveFile($conf['indexdir'].'/title.idx',''); $dir = @opendir($conf['indexdir']); if($dir!==false){ while(($f = readdir($dir)) !== false){ @@ -150,6 +176,7 @@ function _clearindex(){ @unlink($conf['indexdir']."/$f"); } } + @unlink($conf['indexdir'].'/lengths.idx'); _quietecho("done.\n"); _unlock(); } diff --git a/inc/Sitemapper.php b/inc/Sitemapper.php index 47a3fedb5..bbe1caf26 100644 --- a/inc/Sitemapper.php +++ b/inc/Sitemapper.php @@ -45,7 +45,7 @@ class Sitemapper { dbglog("Sitemapper::generate(): using $sitemap"); // FIXME: Only in debug mode - $pages = idx_getIndex('page', ''); + $pages = idx_get_indexer()->getPages(); dbglog('Sitemapper::generate(): creating sitemap using '.count($pages).' pages'); $items = array(); diff --git a/inc/fulltext.php b/inc/fulltext.php index 7ace3a724..0411b9f99 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){ * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ function _ft_pageSearch(&$data) { + $Indexer = idx_get_indexer(); + // parse the given query - $q = ft_queryParser($data['query']); + $q = ft_queryParser($Indexer, $data['query']); $data['highlight'] = $q['highlight']; if (empty($q['parsed_ary'])) return array(); // lookup all words found in the query - $lookup = idx_lookup($q['words']); + $lookup = $Indexer->lookup($q['words']); // get all pages in this dokuwiki site (!: includes nonexistent pages) $pages_all = array(); - foreach (idx_getIndex('page', '') as $id) { - $pages_all[trim($id)] = 0; // base: 0 hit + foreach ($Indexer->getPages() as $id) { + $pages_all[$id] = 0; // base: 0 hit } // process the query @@ -126,15 +128,12 @@ function _ft_pageSearch(&$data) { * evaluates the instructions of the found pages */ function ft_backlinks($id){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - $result = array(); // quick lookup of the pagename + // FIXME use metadata key lookup $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .) + $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) $docs = array_keys(ft_resultCombine(array_values($matches))); $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages if(!count($docs)) return $result; @@ -168,17 +167,14 @@ function ft_backlinks($id){ * Aborts after $max found results */ function ft_mediause($id,$max){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - if(!$max) $max = 1; // need to find at least one $result = array(); // quick lookup of the mediafile + // FIXME use metadata key lookup $media = noNS($id); - $matches = idx_lookup(idx_tokenizer($media,$stopwords)); + $matches = idx_lookup(idx_tokenizer($media)); $docs = array_keys(ft_resultCombine(array_values($matches))); if(!count($docs)) return $result; @@ -229,7 +225,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){ } function _ft_pageLookup(&$data){ - global $conf; // split out original parameters $id = $data['id']; if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) { @@ -239,29 +234,27 @@ function _ft_pageLookup(&$data){ $in_ns = $data['in_ns']; $in_title = $data['in_title']; + $cleaned = cleanID($id); - $pages = array_map('rtrim', idx_getIndex('page', '')); - $titles = array_map('rtrim', idx_getIndex('title', '')); - // check for corrupt title index #FS2076 - if(count($pages) != count($titles)){ - $titles = array_fill(0,count($pages),''); - @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php - } - $pages = array_combine($pages, $titles); + $Indexer = idx_get_indexer(); + $page_idx = $Indexer->getPages(); - $cleaned = cleanID($id); + $pages = array(); if ($id !== '' && $cleaned !== '') { - foreach ($pages as $p_id => $p_title) { - if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) && - (!$in_title || (stripos($p_title, $id) === false)) ) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } + //if ($in_title) + // $titles = $Indexer->lookupKey('title', "*$id*"); } if (isset($ns)) { - foreach (array_keys($pages) as $p_id) { - if (strpos($p_id, $ns) !== 0) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if (strpos($p_id, $ns) === 0) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } } @@ -494,11 +487,7 @@ function ft_resultComplement($args) { * @author Andreas Gohr <andi@splitbrain.org> * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ -function ft_queryParser($query){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - +function ft_queryParser($Indexer, $query){ /** * parse a search query and transform it into intermediate representation * @@ -544,7 +533,7 @@ function ft_queryParser($query){ if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { // phrase-include and phrase-exclude $not = $matches[1] ? 'NOT' : ''; - $parsed = $not.ft_termParser($matches[2], $stopwords, false, true); + $parsed = $not.ft_termParser($Indexer, $matches[2], false, true); } else { // fix incomplete phrase $term = str_replace('"', ' ', $term); @@ -591,10 +580,10 @@ function ft_queryParser($query){ $parsed .= '(N+:'.$matches[1].')'; } elseif (preg_match('/^-(.+)$/', $token, $matches)) { // word-exclude - $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')'; + $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')'; } else { // word-include - $parsed .= ft_termParser($token, $stopwords); + $parsed .= ft_termParser($Indexer, $token); } } } @@ -728,18 +717,18 @@ function ft_queryParser($query){ * * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ -function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) { +function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) { $parsed = ''; if ($consider_asian) { // successive asian characters need to be searched as a phrase $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); foreach ($words as $word) { if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true; - $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode); + $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode); } } else { $term_noparen = str_replace(array('(', ')'), ' ', $term); - $words = idx_tokenizer($term_noparen, $stopwords, true); + $words = $Indexer->tokenizer($term_noparen, true); // W_: no need to highlight if (empty($words)) { diff --git a/inc/indexer.php b/inc/indexer.php index 099b7e9fc..a61f3772a 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -97,7 +97,8 @@ class Doku_Indexer { * @author Andreas Gohr <andi@splitbrain.org> */ public function addPageWords($page, $text) { - $this->_lock(); + if (!$this->_lock()) + return "locked"; // load known documents $page_idx = $this->_addIndexKey('page', '', $page); @@ -348,12 +349,12 @@ class Doku_Indexer { * in the returned list is an array with the page names as keys and the * number of times that token appeas on the page as value. * - * @param array $tokens list of words to search for + * @param arrayref $tokens list of words to search for * @return array list of page names with usage counts * @author Tom N Harris <tnharris@whoopdedo.org> * @author Andreas Gohr <andi@splitbrain.org> */ - public function lookup($tokens) { + public function lookup(&$tokens) { $result = array(); $wids = $this->_getIndexWords($tokens, $result); if (empty($wids)) return array(); @@ -397,10 +398,11 @@ class Doku_Indexer { * @param string $key name of the metadata key to look for * @param string $value search term to look for * @param callback $func comparison function - * @return array list with page names + * @return array list with page names, keys are query values if more than one given * @author Tom N Harris <tnharris@whoopdedo.org> */ public function lookupKey($key, $value, $func=null) { + return array(); } /** @@ -411,12 +413,12 @@ class Doku_Indexer { * The $result parameter can be used to merge the index locations with * the appropriate query term. * - * @param array $words The query terms. + * @param arrayref $words The query terms. * @param arrayref $result Set to word => array("length*id" ...) * @return array Set to length => array(id ...) * @author Tom N Harris <tnharris@whoopdedo.org> */ - private function _getIndexWords($words, &$result) { + private function _getIndexWords(&$words, &$result) { $tokens = array(); $tokenlength = array(); $tokenwild = array(); @@ -807,7 +809,7 @@ class Doku_Indexer { * @return object a Doku_Indexer * @author Tom N Harris <tnharris@whoopdedo.org> */ -function & idx_get_indexer() { +function idx_get_indexer() { static $Indexer = null; if (is_null($Indexer)) { $Indexer = new Doku_Indexer(); @@ -841,10 +843,23 @@ function & idx_get_stopwords() { * Locking is handled internally. * * @param string $page name of the page to index + * @param boolean $verbose print status messages * @return boolean the function completed successfully * @author Tom N Harris <tnharris@whoopdedo.org> */ -function idx_addPage($page) { +function idx_addPage($page, $verbose=false) { + // check if indexing needed + $idxtag = metaFN($page,'.indexed'); + if(@file_exists($idxtag)){ + if(trim(io_readFile($idxtag)) == idx_get_version()){ + $last = @filemtime($idxtag); + if($last > @filemtime(wikiFN($ID))){ + if ($verbose) print("Indexer: index for $page up to date".DOKU_LF); + return false; + } + } + } + $body = ''; $data = array($page, $body); $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); @@ -853,8 +868,19 @@ function idx_addPage($page) { unset($evt); list($page,$body) = $data; - $Indexer =& idx_get_indexer(); - return $Indexer->addPageWords($page, $body); + $Indexer = idx_get_indexer(); + $result = $Indexer->addPageWords($page, $body); + if ($result == "locked") { + if ($verbose) print("Indexer: locked".DOKU_LF); + return false; + } + if ($result) + io_saveFile(metaFN($page,'.indexed'), idx_get_version()); + if ($verbose) { + print("Indexer: finished".DOKU_LF); + return true; + } + return $result; } /** @@ -866,11 +892,11 @@ function idx_addPage($page) { * Important: No ACL checking is done here! All results are * returned, regardless of permissions * - * @param array $words list of words to search for + * @param arrayref $words list of words to search for * @return array list of pages found, associated with the search terms */ -function idx_lookup($words) { - $Indexer =& idx_get_indexer(); +function idx_lookup(&$words) { + $Indexer = idx_get_indexer(); return $Indexer->lookup($words); } @@ -879,7 +905,7 @@ function idx_lookup($words) { * */ function idx_tokenizer($string, $wc=false) { - $Indexer =& idx_get_indexer(); + $Indexer = idx_get_indexer(); return $Indexer->tokenizer($string, $wc); } diff --git a/inc/init.php b/inc/init.php index ed4409729..1dc31a31f 100644 --- a/inc/init.php +++ b/inc/init.php @@ -276,6 +276,7 @@ function init_files(){ } # create title index (needs to have same length as page.idx) + /* $file = $conf['indexdir'].'/title.idx'; if(!@file_exists($file)){ $pages = file($conf['indexdir'].'/page.idx'); @@ -290,6 +291,7 @@ function init_files(){ nice_die("$file is not writable. Check your permissions settings!"); } } + */ } /** diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 55d860296..a5a7d6b2a 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -134,41 +134,8 @@ function runIndexer(){ if(!$ID) return false; - // check if indexing needed - $idxtag = metaFN($ID,'.indexed'); - if(@file_exists($idxtag)){ - if(trim(io_readFile($idxtag)) == idx_get_version()){ - $last = @filemtime($idxtag); - if($last > @filemtime(wikiFN($ID))){ - print "runIndexer(): index for $ID up to date".NL; - return false; - } - } - } - - // try to aquire a lock - $lock = $conf['lockdir'].'/_indexer.lock'; - while(!@mkdir($lock,$conf['dmode'])){ - usleep(50); - if(time()-@filemtime($lock) > 60*5){ - // looks like a stale lock - remove it - @rmdir($lock); - print "runIndexer(): stale lock removed".NL; - }else{ - print "runIndexer(): indexer locked".NL; - return false; - } - } - if($conf['dperm']) chmod($lock, $conf['dperm']); - // do the work - idx_addPage($ID); - - // we're finished - save and free lock - io_saveFile(metaFN($ID,'.indexed'), idx_get_version()); - @rmdir($lock); - print "runIndexer(): finished".NL; - return true; + return idx_addPage($ID, true); } /** diff --git a/lib/exe/xmlrpc.php b/lib/exe/xmlrpc.php index 410d4f6ba..84068f96e 100644 --- a/lib/exe/xmlrpc.php +++ b/lib/exe/xmlrpc.php @@ -355,9 +355,8 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer { */ function listPages(){ $list = array(); - $pages = array_filter(array_filter(idx_getIndex('page', ''), - 'isVisiblePage'), - 'page_exists'); + $pages = idx_get_indexer()->getPages(); + $pages = array_filter(array_filter($pages,'isVisiblePage'),'page_exists'); foreach(array_keys($pages) as $idx) { $perm = auth_quickaclcheck($pages[$idx]); @@ -552,27 +551,7 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer { unlock($id); // run the indexer if page wasn't indexed yet - if(!@file_exists(metaFN($id, '.indexed'))) { - // try to aquire a lock - $lock = $conf['lockdir'].'/_indexer.lock'; - while(!@mkdir($lock,$conf['dmode'])){ - usleep(50); - if(time()-@filemtime($lock) > 60*5){ - // looks like a stale lock - remove it - @rmdir($lock); - }else{ - return false; - } - } - if($conf['dperm']) chmod($lock, $conf['dperm']); - - // do the work - idx_addPage($id); - - // we're finished - save and free lock - io_saveFile(metaFN($id,'.indexed'), idx_get_version()); - @rmdir($lock); - } + idx_addPage($id); return 0; } |