From 03aafe1cc677b4b0b0b0ef9ed04d8ab72f3ce583 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Mon, 3 Dec 2012 00:13:01 +0100 Subject: Indexer: add getPID/getPageFromPID functions and PID to INDEXER_PAGE_ADD This allows plugins to get the PID for a page and also to get the page for a certain PID. That way plugins can build their own index that uses numeric ids. --- inc/indexer.php | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 6 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index f22aee3a0..cbb817d78 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -120,7 +120,7 @@ class Doku_Indexer { return "locked"; // load known documents - $pid = $this->addIndexKey('page', '', $page); + $pid = $this->getPIDNoLock($page); if ($pid === false) { $this->unlock(); return false; @@ -256,7 +256,7 @@ class Doku_Indexer { return "locked"; // load known documents - $pid = $this->addIndexKey('page', '', $page); + $pid = $this->getPIDNoLock($page); if ($pid === false) { $this->unlock(); return false; @@ -348,7 +348,7 @@ class Doku_Indexer { return "locked"; // load known documents - $pid = $this->addIndexKey('page', '', $page); + $pid = $this->getPIDNoLock($page); if ($pid === false) { $this->unlock(); return false; @@ -453,6 +453,48 @@ class Doku_Indexer { return array_values($wordlist); } + /** + * Get the numeric PID of a page + * + * @param string $page The page to get the PID for + * @return bool|int The page id on success, false on error + */ + public function getPID($page) { + if (!$this->lock()) + return false; + + // load known documents + $pid = $this->getPIDNoLock($page); + if ($pid === false) { + $this->unlock(); + return false; + } + + $this->unlock(); + return $pid; + } + + /** + * Get the numeric PID of a page without locking the index. + * Only use this function when the index is already locked. 
+ * + * @param string $page The page to get the PID for + * @return bool|int The page id on success, false on error + */ + protected function getPIDNoLock($page) { + return $this->addIndexKey('page', '', $page); + } + + /** + * Get the page id of a numeric PID + * + * @param int $pid The PID to get the page id for + * @return string The page id + */ + public function getPageFromPID($pid) { + return $this->getIndexKey('page', '', $pid); + } + /** * Find pages in the fulltext index containing the words, * @@ -946,7 +988,7 @@ class Doku_Indexer { * @param string $idx name of the index * @param string $suffix subpart identifier * @param string $value line to find in the index - * @return int line number of the value in the index + * @return int|bool line number of the value in the index or false if writing the index failed * @author Tom N Harris */ protected function addIndexKey($idx, $suffix, $value) { @@ -1223,6 +1265,12 @@ function idx_addPage($page, $verbose=false, $force=false) { return $result; } + $Indexer = idx_get_indexer(); + $pid = $Indexer->getPID($page); + if ($pid === false) { + if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF); + return false; + } $body = ''; $metadata = array(); $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED); @@ -1230,14 +1278,13 @@ function idx_addPage($page, $verbose=false, $force=false) { $metadata['relation_references'] = array_keys($references); else $metadata['relation_references'] = array(); - $data = compact('page', 'body', 'metadata'); + $data = compact('page', 'body', 'metadata', 'pid'); $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); if ($evt->advise_before()) $data['body'] = $data['body'] . " " . 
rawWiki($page); $evt->advise_after(); unset($evt); extract($data); - $Indexer = idx_get_indexer(); $result = $Indexer->addPageWords($page, $body); if ($result === "locked") { if ($verbose) print("Indexer: locked".DOKU_LF); -- cgit v1.2.3 From 3d2ce006bd7cffab5cda27f01787d2fd66ab630e Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Tue, 4 Dec 2012 16:13:46 +0100 Subject: Indexer: Add cache for getPID() This avoids re-reading of the page index file for every getPID()-call by using a simple FIFO cache, limited to 10 items. In idx_addPage() and the functions that it calls getPID() is called 3 times for the same PID. --- inc/indexer.php | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index cbb817d78..05b082872 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -102,6 +102,10 @@ function wordlen($w){ * @author Tom N Harris */ class Doku_Indexer { + /** + * @var array $pidCache Cache for getPID() + */ + protected $pidCache = array(); /** * Adds the contents of a page to the fulltext index @@ -460,6 +464,9 @@ class Doku_Indexer { * @return bool|int The page id on success, false on error */ public function getPID($page) { + // return PID without locking when it is in the cache + if (isset($this->pidCache[$page])) return $this->pidCache[$page]; + if (!$this->lock()) return false; @@ -482,7 +489,14 @@ class Doku_Indexer { * @return bool|int The page id on success, false on error */ protected function getPIDNoLock($page) { - return $this->addIndexKey('page', '', $page); + // avoid expensive addIndexKey operation for the most recently requested pages by using a cache + if (isset($this->pidCache[$page])) return $this->pidCache[$page]; + $pid = $this->addIndexKey('page', '', $page); + // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently + // added item will be requested again + if (count($this->pidCache) > 10) 
array_shift($this->pidCache); + $this->pidCache[$page] = $pid; + return $pid; } /** -- cgit v1.2.3 From 1421e5483ad1ca8780331077141e0c07b6530bc5 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Tue, 4 Dec 2012 16:15:49 +0100 Subject: Indexer: Make $Indexer in idx_get_indexer() global instead of static This avoids problems with test cases that use the indexer and the PID cache - the index is cleaned between test cases but the PID cache wasn't cleaned. Now PHPUnit can unset the global $Indexer between test cases. --- inc/indexer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 05b082872..4dfaa33fb 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -1196,8 +1196,8 @@ class Doku_Indexer { * @author Tom N Harris */ function idx_get_indexer() { - static $Indexer = null; - if (is_null($Indexer)) { + global $Indexer; + if (!isset($Indexer)) { $Indexer = new Doku_Indexer(); } return $Indexer; -- cgit v1.2.3 From 3cf3c7d60c81d562c8331a377a76e86d41f8f528 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Thu, 20 Dec 2012 17:47:05 +0100 Subject: Add clear function to the indexer that deletes the whole index --- inc/indexer.php | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 4dfaa33fb..b9eaf31d9 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -401,6 +401,38 @@ class Doku_Indexer { return true; } + /** + * Clear the whole index + * + * @return bool If the index has been cleared successfully + */ + public function clear() { + global $conf; + + if (!$this->lock()) return false; + + @unlink($conf['indexdir'].'/page.idx'); + @unlink($conf['indexdir'].'/title.idx'); + @unlink($conf['indexdir'].'/pageword.idx'); + @unlink($conf['indexdir'].'/metadata.idx'); + $dir = @opendir($conf['indexdir']); + if($dir!==false){ + while(($f = readdir($dir)) !== false){ + 
if(substr($f,-4)=='.idx' && + (substr($f,0,1)=='i' || substr($f,0,1)=='w' + || substr($f,-6)=='_w.idx' || substr($f,-6)=='_i.idx' || substr($f,-6)=='_p.idx')) + @unlink($conf['indexdir']."/$f"); + } + } + @unlink($conf['indexdir'].'/lengths.idx'); + + // clear the pid cache + $this->pidCache = array(); + + $this->unlock(); + return true; + } + /** * Split the text into words for fulltext search * -- cgit v1.2.3 From 4f7083212993bf72f0b8969e6d57f587faaa5dfc Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Thu, 20 Dec 2012 17:47:39 +0100 Subject: Remove global $Indexer, clear index in the testing system instead --- inc/indexer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index b9eaf31d9..7a62345bf 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -1228,7 +1228,7 @@ class Doku_Indexer { * @author Tom N Harris */ function idx_get_indexer() { - global $Indexer; + static $Indexer; if (!isset($Indexer)) { $Indexer = new Doku_Indexer(); } -- cgit v1.2.3 From 5eb9e8678ddc58a01929a9f340a01048836b47d3 Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sat, 19 Jan 2013 16:59:39 +0100 Subject: Indexer: Added page and meta value rename functions With these functions the search index can be updated after page moves or mass metadata updates without the need to reindex the whole page/wiki. These functions will be used by the new pagemove plugin. 
--- inc/indexer.php | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 7a62345bf..70eac035b 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -338,6 +338,106 @@ class Doku_Indexer { return true; } + /** + * Rename a page in the search index without changing the indexed content + * + * @param string $oldpage The old page name + * @param string $newpage The new page name + * @return string|bool If the page was successfully renamed, can be a message in the case of an error + */ + public function renamePage($oldpage, $newpage) { + if (!$this->lock()) return 'locked'; + + $pages = $this->getPages(); + + $id = array_search($oldpage, $pages); + if ($id === false) { + $this->unlock(); + return 'page is not in index'; + } + + $new_id = array_search($newpage, $pages); + if ($new_id !== false) { + $this->unlock(); + // make sure the page is not in the index anymore + $this->deletePage($newpage); + if (!$this->lock()) return 'locked'; + + $pages[$new_id] = 'deleted:'.time().rand(0, 9999); + } + + $pages[$id] = $newpage; + + // update index + if (!$this->saveIndex('page', '', $pages)) { + $this->unlock(); + return false; + } + + // reset the pid cache + $this->pidCache = array(); + + $this->unlock(); + return true; + } + + /** + * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages + * will be updated. + * + * @param string $key The metadata key of which a value shall be changed + * @param string $oldvalue The old value that shall be renamed + * @param string $newvalue The new value to which the old value shall be renamed, can exist (then values will be merged) + * @return bool|string If renaming the value has been successful, false or error message on error. 
+ */ + public function renameMetaValue($key, $oldvalue, $newvalue) { + if (!$this->lock()) return 'locked'; + + // change the relation references index + $metavalues = $this->getIndex($key, '_w'); + $oldid = array_search($oldvalue, $metavalues); + if ($oldid !== false) { + $newid = array_search($newvalue, $metavalues); + if ($newid !== false) { + // free memory + unset ($metavalues); + + // okay, now we have two entries for the same value. we need to merge them. + $indexline = $this->getIndexKey($key, '_i', $oldid); + if ($indexline != '') { + $newindexline = $this->getIndexKey($key, '_i', $newid); + $pagekeys = $this->getIndex($key, '_p'); + $parts = explode(':', $indexline); + foreach ($parts as $part) { + list($id, $count) = explode('*', $part); + $newindexline = $this->updateTuple($newindexline, $id, $count); + + $keyline = explode(':', $pagekeys[$id]); + // remove old meta value + $keyline = array_diff($keyline, array($oldid)); + // add new meta value when not already present + if (!in_array($newid, $keyline)) { + array_push($keyline, $newid); + } + $pagekeys[$id] = implode(':', $keyline); + } + $this->saveIndex($key, '_p', $pagekeys); + unset($pagekeys); + $this->saveIndexKey($key, '_i', $oldid, ''); + $this->saveIndexKey($key, '_i', $newid, $newindexline); + } + } else { + $metavalues[$oldid] = $newvalue; + if (!$this->saveIndex($key, '_w', $metavalues)) { + $this->unlock(); + return false; + } + } + } + + $this->unlock(); + return true; + } /** * Remove a page from the index * -- cgit v1.2.3 From af73bba62fb11d7872a8b108b156d451302695bd Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sat, 26 Jan 2013 11:17:59 +0100 Subject: Clarified the behavior of the Doku_Indexer::renamePage method --- inc/indexer.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 70eac035b..37ca92055 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -339,7 +339,9 @@ class Doku_Indexer { 
} /** - * Rename a page in the search index without changing the indexed content + * Rename a page in the search index without changing the indexed content. This function doesn't check if the + * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the + * indexer and it deletes all previously indexed content of the new page. * * @param string $oldpage The old page name * @param string $newpage The new page name -- cgit v1.2.3 From 25adeb91ff207452ebd6275707b8a0cc3121db6c Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sat, 26 Jan 2013 11:18:52 +0100 Subject: Indexer: added internal deletePageNoLock method The new deletePageNoLock method is used by renamePage and avoids that the index needs to be unlocked and locked again for deleting the page. --- inc/indexer.php | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index 37ca92055..c08e438bf 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -360,10 +360,8 @@ class Doku_Indexer { $new_id = array_search($newpage, $pages); if ($new_id !== false) { - $this->unlock(); // make sure the page is not in the index anymore - $this->deletePage($newpage); - if (!$this->lock()) return 'locked'; + $this->deletePageNoLock($newpage); $pages[$new_id] = 'deleted:'.time().rand(0, 9999); } @@ -440,6 +438,7 @@ class Doku_Indexer { $this->unlock(); return true; } + /** * Remove a page from the index * @@ -453,10 +452,26 @@ class Doku_Indexer { if (!$this->lock()) return "locked"; + $result = $this->deletePageNoLock($page); + + $this->unlock(); + + return $result; + } + + /** + * Remove a page from the index without locking the index, only use this function if the index is already locked + * + * Erases entries in all known indexes. 
+ * + * @param string $page a page name + * @return boolean the function completed successfully + * @author Tom N Harris + */ + protected function deletePageNoLock($page) { // load known documents $pid = $this->getPIDNoLock($page); if ($pid === false) { - $this->unlock(); return false; } @@ -482,7 +497,6 @@ class Doku_Indexer { } // Save the reverse index if (!$this->saveIndexKey('pageword', '', $pid, "")) { - $this->unlock(); return false; } @@ -499,7 +513,6 @@ class Doku_Indexer { $this->saveIndexKey($metaname.'_p', '', $pid, ''); } - $this->unlock(); return true; } -- cgit v1.2.3 From bc27f3e28790e9a25e9428ed275624578b3e9a2d Mon Sep 17 00:00:00 2001 From: Michael Hamann Date: Sat, 26 Jan 2013 11:22:52 +0100 Subject: Indexer: abort page rename if deletion of new id fails --- inc/indexer.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'inc/indexer.php') diff --git a/inc/indexer.php b/inc/indexer.php index c08e438bf..e518907d7 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -361,7 +361,9 @@ class Doku_Indexer { $new_id = array_search($newpage, $pages); if ($new_id !== false) { // make sure the page is not in the index anymore - $this->deletePageNoLock($newpage); + if ($this->deletePageNoLock($newpage) !== true) { + return false; + } $pages[$new_id] = 'deleted:'.time().rand(0, 9999); } -- cgit v1.2.3