summary | refs | log | tree | commit | diff
path: root/inc/indexer.php
diff options
context:
space:
mode:
Diffstat (limited to 'inc/indexer.php')
-rw-r--r-- inc/indexer.php | 124
1 files changed, 94 insertions, 30 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index 658fb966b..a86bfc656 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -10,7 +10,7 @@
if(!defined('DOKU_INC')) die('meh.');
// Version tag used to force rebuild on upgrade
-define('INDEXER_VERSION', 7);
+define('INDEXER_VERSION', 8);
// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
@@ -61,6 +61,8 @@ define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
*
* @author Tom N Harris <tnharris@whoopdedo.org>
* @author Michael Hamann <michael@content-space.de>
+ *
+ * @return int|string
*/
function idx_get_version(){
static $indexer_version = null;
@@ -84,6 +86,9 @@ function idx_get_version(){
* Differs from strlen in handling of asian characters.
*
* @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string $w
+ * @return int
*/
function wordlen($w){
$l = strlen($w);
@@ -115,7 +120,8 @@ class Doku_Indexer {
*
* @param string $page a page name
* @param string $text the body of the page
- * @return boolean the function completed successfully
+ * @return string|boolean the function completed successfully
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
* @author Andreas Gohr <andi@splitbrain.org>
*/
@@ -190,6 +196,7 @@ class Doku_Indexer {
*
* @param string $text content of the page
* @return array list of word IDs and number of times used
+ *
* @author Andreas Gohr <andi@splitbrain.org>
* @author Christopher Smith <chris@jalakai.co.uk>
* @author Tom N Harris <tnharris@whoopdedo.org>
@@ -215,6 +222,7 @@ class Doku_Indexer {
foreach (array_keys($words) as $wlen) {
$word_idx = $this->getIndex('w', $wlen);
foreach ($words[$wlen] as $word => $freq) {
+ $word = (string)$word;
$wid = array_search($word, $word_idx, true);
if ($wid === false) {
$wid = count($word_idx);
@@ -244,7 +252,8 @@ class Doku_Indexer {
* @param string $page a page name
* @param mixed $key a key string or array of key=>value pairs
* @param mixed $value the value or list of values
- * @return boolean the function completed successfully
+ * @return boolean|string the function completed successfully
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
* @author Michael Hamann <michael@content-space.de>
*/
@@ -269,8 +278,9 @@ class Doku_Indexer {
// Special handling for titles so the index file is simpler
if (array_key_exists('title', $key)) {
$value = $key['title'];
- if (is_array($value))
+ if (is_array($value)) {
$value = $value[0];
+ }
$this->saveIndexKey('title', '', $pid, $value);
unset($key['title']);
}
@@ -298,20 +308,24 @@ class Doku_Indexer {
if ($val !== "") {
$id = array_search($val, $metawords, true);
if ($id === false) {
+ // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
$id = count($metawords);
$metawords[$id] = $val;
+ $metaidx[$id] = '';
$addwords = true;
}
// test if value is already in the index
- if (isset($val_idx[$id]) && $val_idx[$id] <= 0)
+ if (isset($val_idx[$id]) && $val_idx[$id] <= 0){
$val_idx[$id] = 0;
- else // else add it
+ } else { // else add it
$val_idx[$id] = 1;
+ }
}
}
- if ($addwords)
+ if ($addwords) {
$this->saveIndex($metaname.'_w', '', $metawords);
+ }
$vals_changed = false;
foreach ($val_idx as $id => $action) {
if ($action == -1) {
@@ -447,7 +461,8 @@ class Doku_Indexer {
* Erases entries in all known indexes.
*
* @param string $page a page name
- * @return boolean the function completed successfully
+ * @return string|boolean the function completed successfully, or the string "locked" if the index is locked
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
public function deletePage($page) {
@@ -468,6 +483,7 @@ class Doku_Indexer {
*
* @param string $page a page name
* @return boolean the function completed successfully
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
protected function deletePageNoLock($page) {
@@ -562,6 +578,7 @@ class Doku_Indexer {
* @param string $text plain text
* @param boolean $wc are wildcards allowed?
* @return array list of words in the text
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
* @author Andreas Gohr <andi@splitbrain.org>
*/
@@ -670,6 +687,7 @@ class Doku_Indexer {
*
* @param array $tokens list of words to search for
* @return array list of page names with usage counts
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
* @author Andreas Gohr <andi@splitbrain.org>
*/
@@ -724,6 +742,7 @@ class Doku_Indexer {
* @param string $value search term to look for, must be a string or array of strings
* @param callback $func comparison function
* @return array lists with page names, keys are query values if $value is array
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
* @author Michael Hamann <michael@content-space.de>
*/
@@ -823,6 +842,7 @@ class Doku_Indexer {
* @param array $words The query terms.
* @param array $result Set to word => array("length*id" ...)
* @return array Set to length => array(id ...)
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
protected function getIndexWords(&$words, &$result) {
@@ -903,6 +923,7 @@ class Doku_Indexer {
*
* @param string $key list only pages containing the metadata key (optional)
* @return array list of page names
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
public function getPages($key=null) {
@@ -936,6 +957,7 @@ class Doku_Indexer {
* @param int $minlen minimum length of words to count
* @param string $key metadata key to list. Uses the fulltext index if not given
* @return array list of words as the keys and frequency as values
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
public function histogram($min=1, $max=0, $minlen=3, $key=null) {
@@ -996,6 +1018,8 @@ class Doku_Indexer {
* Lock the indexer.
*
* @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @return bool|string
*/
protected function lock() {
global $conf;
@@ -1027,6 +1051,8 @@ class Doku_Indexer {
* Release the indexer lock.
*
* @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @return bool
*/
protected function unlock() {
global $conf;
@@ -1044,12 +1070,13 @@ class Doku_Indexer {
* @param string $idx name of the index
* @param string $suffix subpart identifier
* @return array list of lines without CR or LF
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
protected function getIndex($idx, $suffix) {
global $conf;
$fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
- if (!@file_exists($fn)) return array();
+ if (!file_exists($fn)) return array();
return file($fn, FILE_IGNORE_NEW_LINES);
}
@@ -1060,6 +1087,7 @@ class Doku_Indexer {
* @param string $suffix subpart identifier
* @param array $lines list of lines without LF
* @return bool If saving succeeded
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
protected function saveIndex($idx, $suffix, &$lines) {
@@ -1084,12 +1112,13 @@ class Doku_Indexer {
* @param string $suffix subpart identifier
* @param int $id the line number
* @return string a line with trailing whitespace removed
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
protected function getIndexKey($idx, $suffix, $id) {
global $conf;
$fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
- if (!@file_exists($fn)) return '';
+ if (!file_exists($fn)) return '';
$fh = @fopen($fn, 'r');
if (!$fh) return '';
$ln = -1;
@@ -1108,6 +1137,7 @@ class Doku_Indexer {
* @param int $id the line number
* @param string $line line to write
* @return bool If saving succeeded
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
protected function saveIndexKey($idx, $suffix, $id, $line) {
@@ -1149,6 +1179,7 @@ class Doku_Indexer {
* @param string $suffix subpart identifier
* @param string $value line to find in the index
* @return int|bool line number of the value in the index or false if writing the index failed
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
protected function addIndexKey($idx, $suffix, $value) {
@@ -1172,6 +1203,8 @@ class Doku_Indexer {
* a sorted array of lengths of the words used in the wiki.
*
* @author YoBoY <yoboy.leguesh@gmail.com>
+ *
+ * @return array
*/
protected function listIndexLengths() {
return idx_listIndexLengths();
@@ -1184,6 +1217,9 @@ class Doku_Indexer {
* that there are indices for.
*
* @author YoBoY <yoboy.leguesh@gmail.com>
+ *
+ * @param array|int $filter
+ * @return array
*/
protected function indexLengths($filter) {
global $conf;
@@ -1192,7 +1228,7 @@ class Doku_Indexer {
// testing if index files exist only
$path = $conf['indexdir']."/i";
foreach ($filter as $key => $value) {
- if (@file_exists($path.$key.'.idx'))
+ if (file_exists($path.$key.'.idx'))
$idx[] = $key;
}
} else {
@@ -1210,19 +1246,25 @@ class Doku_Indexer {
* Insert or replace a tuple in a line.
*
* @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string $line
+ * @param string|int $id
+ * @param int $count
+ * @return string
*/
protected function updateTuple($line, $id, $count) {
- $newLine = $line;
- if ($newLine !== '')
- $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine);
- $newLine = trim($newLine, ':');
+ if ($line != ''){
+ $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line);
+ }
+ $line = trim($line, ':');
if ($count) {
- if (strlen($newLine) > 0)
- return "$id*$count:".$newLine;
- else
- return "$id*$count".$newLine;
+ if ($line) {
+ return "$id*$count:".$line;
+ } else {
+ return "$id*$count";
+ }
}
- return $newLine;
+ return $line;
}
/**
@@ -1230,6 +1272,10 @@ class Doku_Indexer {
*
* @author Tom N Harris <tnharris@whoopdedo.org>
* @author Andreas Gohr <andi@splitbrain.org>
+ *
+ * @param array $keys
+ * @param string $line
+ * @return array
*/
protected function parseTuples(&$keys, $line) {
$result = array();
@@ -1250,13 +1296,16 @@ class Doku_Indexer {
* Sum the counts in a list of tuples.
*
* @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string $line
+ * @return int
*/
protected function countTuples($line) {
$freq = 0;
$parts = explode(':', $line);
foreach ($parts as $tuple) {
if ($tuple === '') continue;
- list($pid, $cnt) = explode('*', $tuple);
+ list(/* $pid */, $cnt) = explode('*', $tuple);
$freq += (int)$cnt;
}
return $freq;
@@ -1266,7 +1315,8 @@ class Doku_Indexer {
/**
* Create an instance of the indexer.
*
- * @return Doku_Indexer a Doku_Indexer
+ * @return Doku_Indexer a Doku_Indexer
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
function idx_get_indexer() {
@@ -1281,6 +1331,7 @@ function idx_get_indexer() {
* Returns words that will be ignored.
*
* @return array list of stop words
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
function & idx_get_stopwords() {
@@ -1288,7 +1339,7 @@ function & idx_get_stopwords() {
if (is_null($stopwords)) {
global $conf;
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
- if(@file_exists($swfile)){
+ if(file_exists($swfile)){
$stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
}else{
$stopwords = array();
@@ -1305,14 +1356,15 @@ function & idx_get_stopwords() {
* @param string $page name of the page to index
* @param boolean $verbose print status messages
* @param boolean $force force reindexing even when the index is up to date
- * @return boolean the function completed successfully
+ * @return string|boolean the function completed successfully
+ *
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
function idx_addPage($page, $verbose=false, $force=false) {
$idxtag = metaFN($page,'.indexed');
// check if page was deleted but is still in the index
if (!page_exists($page)) {
- if (!@file_exists($idxtag)) {
+ if (!file_exists($idxtag)) {
if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
return false;
}
@@ -1327,7 +1379,7 @@ function idx_addPage($page, $verbose=false, $force=false) {
}
// check if indexing needed
- if(!$force && @file_exists($idxtag)){
+ if(!$force && file_exists($idxtag)){
if(trim(io_readFile($idxtag)) == idx_get_version()){
$last = @filemtime($idxtag);
if($last > @filemtime(wikiFN($page))){
@@ -1340,7 +1392,7 @@ function idx_addPage($page, $verbose=false, $force=false) {
$indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
if ($indexenabled === false) {
$result = false;
- if (@file_exists($idxtag)) {
+ if (file_exists($idxtag)) {
$Indexer = idx_get_indexer();
$result = $Indexer->deletePage($page);
if ($result === "locked") {
@@ -1434,11 +1486,15 @@ function idx_tokenizer($string, $wc=false) {
* Read the list of words in an index (if it exists).
*
* @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string $idx
+ * @param string $suffix
+ * @return array
*/
function idx_getIndex($idx, $suffix) {
global $conf;
$fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
- if (!@file_exists($fn)) return array();
+ if (!file_exists($fn)) return array();
return file($fn);
}
@@ -1449,6 +1505,8 @@ function idx_getIndex($idx, $suffix) {
* a sorted array of lengths of the words used in the wiki.
*
* @author YoBoY <yoboy.leguesh@gmail.com>
+ *
+ * @return array
*/
function idx_listIndexLengths() {
global $conf;
@@ -1457,7 +1515,7 @@ function idx_listIndexLengths() {
$docache = false;
} else {
clearstatcache();
- if (@file_exists($conf['indexdir'].'/lengths.idx')
+ if (file_exists($conf['indexdir'].'/lengths.idx')
&& (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) !== false) {
$idx = array();
@@ -1503,6 +1561,9 @@ function idx_listIndexLengths() {
* that there are indices for.
*
* @author YoBoY <yoboy.leguesh@gmail.com>
+ *
+ * @param array|int $filter
+ * @return array
*/
function idx_indexLengths($filter) {
global $conf;
@@ -1511,7 +1572,7 @@ function idx_indexLengths($filter) {
// testing if index files exist only
$path = $conf['indexdir']."/i";
foreach ($filter as $key => $value) {
- if (@file_exists($path.$key.'.idx'))
+ if (file_exists($path.$key.'.idx'))
$idx[] = $key;
}
} else {
@@ -1532,6 +1593,9 @@ function idx_indexLengths($filter) {
* not a letter, number, or underscore.
*
* @author Tom N Harris <tnharris@whoopdedo.org>
+ *
+ * @param string $name
+ * @return string
*/
function idx_cleanName($name) {
$name = utf8_romanize(trim((string)$name));