diff options
author | Tom N Harris <tnharris@whoopdedo.org> | 2010-12-29 03:50:05 -0500 |
---|---|---|
committer | Tom N Harris <tnharris@whoopdedo.org> | 2010-12-29 03:50:05 -0500 |
commit | 9b41be2446ea725a496f34b28ac4db84bece57c9 (patch) | |
tree | bb002063560d6132c6cda8d85cd1d1d089084f27 /inc | |
parent | 00803e562833be06ab5a869541581314b9b84d58 (diff) | |
download | rpg-9b41be2446ea725a496f34b28ac4db84bece57c9.tar.gz rpg-9b41be2446ea725a496f34b28ac4db84bece57c9.tar.bz2 |
Indexer v3 Rewrite part two, update uses of indexer
Diffstat (limited to 'inc')
-rw-r--r-- | inc/Sitemapper.php | 2 | ||||
-rw-r--r-- | inc/fulltext.php | 73 | ||||
-rw-r--r-- | inc/indexer.php | 54 | ||||
-rw-r--r-- | inc/init.php | 2 |
4 files changed, 74 insertions, 57 deletions
diff --git a/inc/Sitemapper.php b/inc/Sitemapper.php index 47a3fedb5..bbe1caf26 100644 --- a/inc/Sitemapper.php +++ b/inc/Sitemapper.php @@ -45,7 +45,7 @@ class Sitemapper { dbglog("Sitemapper::generate(): using $sitemap"); // FIXME: Only in debug mode - $pages = idx_getIndex('page', ''); + $pages = idx_get_indexer()->getPages(); dbglog('Sitemapper::generate(): creating sitemap using '.count($pages).' pages'); $items = array(); diff --git a/inc/fulltext.php b/inc/fulltext.php index 7ace3a724..0411b9f99 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){ * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ function _ft_pageSearch(&$data) { + $Indexer = idx_get_indexer(); + // parse the given query - $q = ft_queryParser($data['query']); + $q = ft_queryParser($Indexer, $data['query']); $data['highlight'] = $q['highlight']; if (empty($q['parsed_ary'])) return array(); // lookup all words found in the query - $lookup = idx_lookup($q['words']); + $lookup = $Indexer->lookup($q['words']); // get all pages in this dokuwiki site (!: includes nonexistent pages) $pages_all = array(); - foreach (idx_getIndex('page', '') as $id) { - $pages_all[trim($id)] = 0; // base: 0 hit + foreach ($Indexer->getPages() as $id) { + $pages_all[$id] = 0; // base: 0 hit } // process the query @@ -126,15 +128,12 @@ function _ft_pageSearch(&$data) { * evaluates the instructions of the found pages */ function ft_backlinks($id){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - $result = array(); // quick lookup of the pagename + // FIXME use metadata key lookup $page = noNS($id); - $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .) + $matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .) 
$docs = array_keys(ft_resultCombine(array_values($matches))); $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages if(!count($docs)) return $result; @@ -168,17 +167,14 @@ function ft_backlinks($id){ * Aborts after $max found results */ function ft_mediause($id,$max){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - if(!$max) $max = 1; // need to find at least one $result = array(); // quick lookup of the mediafile + // FIXME use metadata key lookup $media = noNS($id); - $matches = idx_lookup(idx_tokenizer($media,$stopwords)); + $matches = idx_lookup(idx_tokenizer($media)); $docs = array_keys(ft_resultCombine(array_values($matches))); if(!count($docs)) return $result; @@ -229,7 +225,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){ } function _ft_pageLookup(&$data){ - global $conf; // split out original parameters $id = $data['id']; if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) { @@ -239,29 +234,27 @@ function _ft_pageLookup(&$data){ $in_ns = $data['in_ns']; $in_title = $data['in_title']; + $cleaned = cleanID($id); - $pages = array_map('rtrim', idx_getIndex('page', '')); - $titles = array_map('rtrim', idx_getIndex('title', '')); - // check for corrupt title index #FS2076 - if(count($pages) != count($titles)){ - $titles = array_fill(0,count($pages),''); - @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php - } - $pages = array_combine($pages, $titles); + $Indexer = idx_get_indexer(); + $page_idx = $Indexer->getPages(); - $cleaned = cleanID($id); + $pages = array(); if ($id !== '' && $cleaned !== '') { - foreach ($pages as $p_id => $p_title) { - if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) && - (!$in_title || (stripos($p_title, $id) === false)) ) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if ((strpos($in_ns ? 
$p_id : noNSorNS($p_id), $cleaned) !== false)) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } + //if ($in_title) + // $titles = $Indexer->lookupKey('title', "*$id*"); } if (isset($ns)) { - foreach (array_keys($pages) as $p_id) { - if (strpos($p_id, $ns) !== 0) { - unset($pages[$p_id]); + foreach ($page_idx as $p_id) { + if (strpos($p_id, $ns) === 0) { + if (!isset($pages[$p_id])) + $pages[$p_id] = p_get_first_heading($p_id, false); } } } @@ -494,11 +487,7 @@ function ft_resultComplement($args) { * @author Andreas Gohr <andi@splitbrain.org> * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ -function ft_queryParser($query){ - global $conf; - $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; - $stopwords = @file_exists($swfile) ? file($swfile) : array(); - +function ft_queryParser($Indexer, $query){ /** * parse a search query and transform it into intermediate representation * @@ -544,7 +533,7 @@ function ft_queryParser($query){ if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { // phrase-include and phrase-exclude $not = $matches[1] ? 
'NOT' : ''; - $parsed = $not.ft_termParser($matches[2], $stopwords, false, true); + $parsed = $not.ft_termParser($Indexer, $matches[2], false, true); } else { // fix incomplete phrase $term = str_replace('"', ' ', $term); @@ -591,10 +580,10 @@ function ft_queryParser($query){ $parsed .= '(N+:'.$matches[1].')'; } elseif (preg_match('/^-(.+)$/', $token, $matches)) { // word-exclude - $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')'; + $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')'; } else { // word-include - $parsed .= ft_termParser($token, $stopwords); + $parsed .= ft_termParser($Indexer, $token); } } } @@ -728,18 +717,18 @@ function ft_queryParser($query){ * * @author Kazutaka Miyasaka <kazmiya@gmail.com> */ -function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) { +function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) { $parsed = ''; if ($consider_asian) { // successive asian characters need to be searched as a phrase $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); foreach ($words as $word) { if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true; - $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode); + $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode); } } else { $term_noparen = str_replace(array('(', ')'), ' ', $term); - $words = idx_tokenizer($term_noparen, $stopwords, true); + $words = $Indexer->tokenizer($term_noparen, true); // W_: no need to highlight if (empty($words)) { diff --git a/inc/indexer.php b/inc/indexer.php index 099b7e9fc..a61f3772a 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -97,7 +97,8 @@ class Doku_Indexer { * @author Andreas Gohr <andi@splitbrain.org> */ public function addPageWords($page, $text) { - $this->_lock(); + if (!$this->_lock()) + return "locked"; // load known documents $page_idx = $this->_addIndexKey('page', '', $page); @@ -348,12 +349,12 @@ 
class Doku_Indexer { * in the returned list is an array with the page names as keys and the * number of times that token appears on the page as value. * - * @param array $tokens list of words to search for + * @param arrayref $tokens list of words to search for * @return array list of page names with usage counts * @author Tom N Harris <tnharris@whoopdedo.org> * @author Andreas Gohr <andi@splitbrain.org> */ - public function lookup($tokens) { + public function lookup(&$tokens) { $result = array(); $wids = $this->_getIndexWords($tokens, $result); if (empty($wids)) return array(); @@ -397,10 +398,11 @@ class Doku_Indexer { * @param string $key name of the metadata key to look for * @param string $value search term to look for * @param callback $func comparison function - * @return array list with page names + * @return array list with page names, keys are query values if more than one given * @author Tom N Harris <tnharris@whoopdedo.org> */ public function lookupKey($key, $value, $func=null) { + return array(); } /** @@ -411,12 +413,12 @@ class Doku_Indexer { * The $result parameter can be used to merge the index locations with * the appropriate query term. * - * @param array $words The query terms. + * @param arrayref $words The query terms. * @param arrayref $result Set to word => array("length*id" ...) * @return array Set to length => array(id ...) * @author Tom N Harris <tnharris@whoopdedo.org> */ - private function _getIndexWords($words, &$result) { + private function _getIndexWords(&$words, &$result) { $tokens = array(); $tokenlength = array(); $tokenwild = array(); @@ -807,7 +809,7 @@ class Doku_Indexer { * @return object a Doku_Indexer * @author Tom N Harris <tnharris@whoopdedo.org> */ -function & idx_get_indexer() { +function idx_get_indexer() { static $Indexer = null; if (is_null($Indexer)) { $Indexer = new Doku_Indexer(); @@ -841,10 +843,23 @@ function & idx_get_stopwords() { * Locking is handled internally. 
* * @param string $page name of the page to index + * @param boolean $verbose print status messages * @return boolean the function completed successfully * @author Tom N Harris <tnharris@whoopdedo.org> */ -function idx_addPage($page) { +function idx_addPage($page, $verbose=false) { + // check if indexing needed + $idxtag = metaFN($page,'.indexed'); + if(@file_exists($idxtag)){ + if(trim(io_readFile($idxtag)) == idx_get_version()){ + $last = @filemtime($idxtag); + if($last > @filemtime(wikiFN($ID))){ + if ($verbose) print("Indexer: index for $page up to date".DOKU_LF); + return false; + } + } + } + $body = ''; $data = array($page, $body); $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); @@ -853,8 +868,19 @@ function idx_addPage($page) { unset($evt); list($page,$body) = $data; - $Indexer =& idx_get_indexer(); - return $Indexer->addPageWords($page, $body); + $Indexer = idx_get_indexer(); + $result = $Indexer->addPageWords($page, $body); + if ($result == "locked") { + if ($verbose) print("Indexer: locked".DOKU_LF); + return false; + } + if ($result) + io_saveFile(metaFN($page,'.indexed'), idx_get_version()); + if ($verbose) { + print("Indexer: finished".DOKU_LF); + return true; + } + return $result; } /** @@ -866,11 +892,11 @@ function idx_addPage($page) { * Important: No ACL checking is done here! 
All results are * returned, regardless of permissions * - * @param array $words list of words to search for + * @param arrayref $words list of words to search for * @return array list of pages found, associated with the search terms */ -function idx_lookup($words) { - $Indexer =& idx_get_indexer(); +function idx_lookup(&$words) { + $Indexer = idx_get_indexer(); return $Indexer->lookup($words); } @@ -879,7 +905,7 @@ function idx_lookup($words) { * */ function idx_tokenizer($string, $wc=false) { - $Indexer =& idx_get_indexer(); + $Indexer = idx_get_indexer(); return $Indexer->tokenizer($string, $wc); } diff --git a/inc/init.php b/inc/init.php index ed4409729..1dc31a31f 100644 --- a/inc/init.php +++ b/inc/init.php @@ -276,6 +276,7 @@ function init_files(){ } # create title index (needs to have same length as page.idx) + /* $file = $conf['indexdir'].'/title.idx'; if(!@file_exists($file)){ $pages = file($conf['indexdir'].'/page.idx'); @@ -290,6 +291,7 @@ function init_files(){ nice_die("$file is not writable. Check your permissions settings!"); } } + */ } /** |