Indexer v3 Rewrite part one (unstable)

The indexer functions have been converted to a class interface. Use the Doku_Indexer class to access the indexer with these public methods: addPageWords addMetaKeys deletePage tokenizer lookup lookupKey getPages histogram These functions are provided for general use: idx_get_version idx_get_indexer idx_get_stopwords idx_addPage idx_lookup idx_tokenizer These functions are still available, but are deprecated: idx_getIndex idx_indexLengths All other old idx_ functions are unsupported and have been removed.
author: Tom N Harris <tnharris@whoopdedo.org> 2010-12-27 20:30:46 -0500
committer: Tom N Harris <tnharris@whoopdedo.org> 2010-12-27 20:30:46 -0500
commit: 00803e562833be06ab5a869541581314b9b84d58 (patch)
tree: 83586e2ae8081ad8000070e03982368b927e4c5f /inc/indexer.php
parent: 3c4b38902b6f6d32222611b22087d5d41d20de6e (diff)
download: rpg-00803e562833be06ab5a869541581314b9b84d58.tar.gz
rpg-00803e562833be06ab5a869541581314b9b84d58.tar.bz2
1 files changed, 786 insertions, 502 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index d4432026e..099b7e9fc 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -4,12 +4,13 @@
  *
  * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
  * @author     Andreas Gohr <andi@splitbrain.org>
+ * @author     Tom N Harris <tnharris@whoopdedo.org>
  */
 
 if(!defined('DOKU_INC')) die('meh.');
 
 // Version tag used to force rebuild on upgrade
-define('INDEXER_VERSION', 2);
+define('INDEXER_VERSION', 3);
 
 // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
 if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
@@ -77,335 +78,827 @@ function wordlen($w){
 }
 
 /**
- * Write a list of strings to an index file.
+ * Class that encapsulates operations on the indexer database.
  *
  * @author Tom N Harris <tnharris@whoopdedo.org>
  */
-function idx_saveIndex($pre, $wlen, &$idx){
-    global $conf;
-    $fn = $conf['indexdir'].'/'.$pre.$wlen;
-    $fh = @fopen($fn.'.tmp','w');
-    if(!$fh) return false;
-    fwrite($fh,join('', $idx));
-    fclose($fh);
-    if(isset($conf['fperm'])) chmod($fn.'.tmp', $conf['fperm']);
-    io_rename($fn.'.tmp', $fn.'.idx');
-    return true;
-}
-
-/**
- * Append a given line to an index file.
- *
- * @author Andreas Gohr <andi@splitbrain.org>
- */
-function idx_appendIndex($pre, $wlen, $line){
-    global $conf;
-    $fn = $conf['indexdir'].'/'.$pre.$wlen;
-    $fh = @fopen($fn.'.idx','a');
-    if(!$fh) return false;
-    fwrite($fh,$line);
-    fclose($fh);
-    return true;
-}
+class Doku_Indexer {
+
+    /**
+     * Adds the contents of a page to the fulltext index
+     *
+     * The added text replaces previous words for the same page.
+     * An empty value erases the page.
+     *
+     * @param string    $page   a page name
+     * @param string    $text   the body of the page
+     * @return boolean          the function completed successfully
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    public function addPageWords($page, $text) {
+        $this->_lock();
+
+        // load known documents
+        $page_idx = $this->_addIndexKey('page', '', $page);
+        if ($page_idx === false) {
+            $this->_unlock();
+            return false;
+        }
 
-/**
- * Read the list of words in an index (if it exists).
- *
- * @author Tom N Harris <tnharris@whoopdedo.org>
- */
-function idx_getIndex($pre, $wlen){
-    global $conf;
-    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
-    if(!@file_exists($fn)) return array();
-    return file($fn);
-}
+        $pagewords = array();
+        // get word usage in page
+        $words = $this->_getPageWords($text);
+        if ($words === false) {
+            $this->_unlock();
+            return false;
+        }
 
-/**
- * Read a line ending with \n.
- * Returns false on EOF.
- *
- * @author Tom N Harris <tnharris@whoopdedo.org>
- */
-function _freadline($fh) {
-    if (feof($fh)) return false;
-    $ln = '';
-    while (($buf = fgets($fh,4096)) !== false) {
-        $ln .= $buf;
-        if (substr($buf,-1) == "\n") break;
-    }
-    if ($ln === '') return false;
-    if (substr($ln,-1) != "\n") $ln .= "\n";
-    return $ln;
-}
+        if (!empty($words)) {
+            foreach (array_keys($words) as $wlen) {
+                $index = $this->_getIndex('i', $wlen);
+                foreach ($words[$wlen] as $wid => $freq) {
+                    $idx = ($wid<count($index)) ? $index[$wid] : '';
+                    $index[$wid] = $this->_updateTuple($idx, $pid, $freq);
+                    $pagewords[] = "$wlen*$wid";
+                }
+                if (!$this->_saveIndex('i', $wlen, $index)) {
+                    $this->_unlock();
+                    return false;
+                }
+            }
+        }
 
-/**
- * Write a line to an index file.
- *
- * @author Tom N Harris <tnharris@whoopdedo.org>
- */
-function idx_saveIndexLine($pre, $wlen, $idx, $line){
-    global $conf;
-    if(substr($line,-1) != "\n") $line .= "\n";
-    $fn = $conf['indexdir'].'/'.$pre.$wlen;
-    $fh = @fopen($fn.'.tmp','w');
-    if(!$fh) return false;
-    $ih = @fopen($fn.'.idx','r');
-    if ($ih) {
-        $ln = -1;
-        while (($curline = fgets($ih)) !== false) {
-            if (++$ln == $idx) {
-                fwrite($fh, $line);
-            } else {
-                fwrite($fh, $curline);
+        // Remove obsolete index entries
+        $pageword_idx = $this->_getIndexKey('pageword', '', $pid);
+        if ($pageword_idx !== '') {
+            $oldwords = explode(':',$pageword_idx);
+            $delwords = array_diff($oldwords, $pagewords);
+            $upwords = array();
+            foreach ($delwords as $word) {
+                if ($word != '') {
+                    list($wlen,$wid) = explode('*', $word);
+                    $wid = (int)$wid;
+                    $upwords[$wlen][] = $wid;
+                }
+            }
+            foreach ($upwords as $wlen => $widx) {
+                $index = $this->_getIndex('i', $wlen);
+                foreach ($widx as $wid) {
+                    $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0);
+                }
+                $this->_saveIndex('i', $wlen, $index);
             }
         }
-        if ($idx > $ln) {
-            fwrite($fh,$line);
+        // Save the reverse index
+        $pageword_idx = join(':', $pagewords);
+        if (!$this->_saveIndexKey('pageword', '', $pid, $pageword_idx)) {
+            $this->_unlock();
+            return false;
         }
-        fclose($ih);
-    } else {
-        fwrite($fh,$line);
-    }
-    fclose($fh);
-    if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
-    io_rename($fn.'.tmp', $fn.'.idx');
-    return true;
-}
 
-/**
- * Read a single line from an index (if it exists).
- *
- * @author Tom N Harris <tnharris@whoopdedo.org>
- */
-function idx_getIndexLine($pre, $wlen, $idx){
-    global $conf;
-    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
-    if(!@file_exists($fn)) return '';
-    $fh = @fopen($fn,'r');
-    if(!$fh) return '';
-    $ln = -1;
-    while (($line = fgets($fh)) !== false) {
-        if (++$ln == $idx) break;
+        $this->_unlock();
+        return true;
     }
-    fclose($fh);
-    return "$line";
-}
 
-/**
- * Split a page into words
- *
- * Returns an array of word counts, false if an error occurred.
- * Array is keyed on the word length, then the word index.
- *
- * @author Andreas Gohr <andi@splitbrain.org>
- * @author Christopher Smith <chris@jalakai.co.uk>
- */
-function idx_getPageWords($page){
-    global $conf;
-    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
-    if(@file_exists($swfile)){
-        $stopwords = file($swfile);
-    }else{
-        $stopwords = array();
-    }
+    /**
+     * Split the words in a page and add them to the index.
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     * @author Christopher Smith <chris@jalakai.co.uk>
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _getPageWords($text) {
+        global $conf;
+
+        $tokens = $this->tokenizer($text);
+        $tokens = array_count_values($tokens);  // count the frequency of each token
+
+        $words = array();
+        foreach ($tokens as $w=>$c) {
+            $l = wordlen($w);
+            if (isset($words[$l])){
+                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
+            }else{
+                $words[$l] = array($w => $c);
+            }
+        }
 
-    $body = '';
-    $data = array($page, $body);
-    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
-    if ($evt->advise_before()) $data[1] .= rawWiki($page);
-    $evt->advise_after();
-    unset($evt);
+        // arrive here with $words = array(wordlen => array(word => frequency))
+        $word_idx_modified = false;
+        $index = array();   //resulting index
+        foreach (array_keys($words) as $wlen) {
+            $word_idx = $this->_getIndex('w', $wlen);
+            foreach ($words[$wlen] as $word => $freq) {
+                $wid = array_search($word, $word_idx);
+                if ($wid === false) {
+                    $wid = count($word_idx);
+                    $word_idx[] = $word;
+                    $word_idx_modified = true;
+                }
+                if (!isset($index[$wlen]))
+                    $index[$wlen] = array();
+                $index[$wlen][$wid] = $freq;
+            }
+            // save back the word index
+            if ($word_idx_modified && !$this->_saveIndex('w', $wlen, $word_idx))
+                return false;
+        }
 
-    list($page,$body) = $data;
+        return $index;
+    }
 
-    $tokens = idx_tokenizer($body, $stopwords);
-    $tokens = array_count_values($tokens);   // count the frequency of each token
+    /**
+     * Add keys to the metadata index.
+     *
+     * Adding new keys does not remove other keys for the page.
+     * An empty value will erase the key.
+     * The $key parameter can be an array to add multiple keys. $value will
+     * not be used if $key is an array.
+     *
+     * @param string    $page   a page name
+     * @param mixed     $key    a key string or array of key=>value pairs
+     * @param mixed     $value  the value or list of values
+     * @return boolean          the function completed successfully
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    public function addMetaKeys($page, $key, $value=null) {
+        if (!is_array($key)) {
+            $key = array($key => $value);
+        } elseif (!is_null($value)) {
+            // $key is array, but $value is not null
+            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
+        }
 
-    // ensure the deaccented or romanised page names of internal links are added to the token array
-    // (this is necessary for the backlink function -- there maybe a better way!)
-    if ($conf['deaccent']) {
-        $links = p_get_metadata($page,'relation references');
+        $this->_lock();
 
-        if (!empty($links)) {
-            $tmp = join(' ',array_keys($links));                // make a single string
-            $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
-            $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens
+        // load known documents
+        $pid = $this->_addIndexKey('page', '', $page);
+        if ($pid === false) {
+            $this->_unlock();
+            return false;
+        }
 
-            foreach ($link_tokens as $link_token) {
-                if (isset($tokens[$link_token])) continue;
-                $tokens[$link_token] = 1;
+        foreach ($key as $name => $values) {
+            $metaname = idx_cleanName($name);
+            $metaidx = $this->_getIndex($metaname, '_i');
+            $metawords = $this->_getIndex($metaname, '_w');
+            $addwords = false;
+            $update = array();
+            if (!is_array($val)) $values = array($values);
+            foreach ($values as $val) {
+                $val = (string)$val;
+                if ($val !== "") {
+                    $id = array_search($val, $metawords);
+                    if ($id === false) {
+                        $id = count($metawords);
+                        $metawords[$id] = $val;
+                        $addwords = true;
+                    }
+                    $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 1);
+                    $update[$id] = 1;
+                } else {
+                    $id = array_search($val, $metawords);
+                    if ($id !== false) {
+                        $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 0);
+                        $update[$id] = 0;
+                    }
+                }
+            }
+            if (!empty($update)) {
+                if ($addwords)
+                    $this->_saveIndex($metaname.'_w', '', $metawords);
+                $this->_saveIndex($metaname.'_i', '', $metaidx);
+                $val_idx = $this->_getIndexKey($metaname, '_p', $pid);
+                $val_idx = array_flip(explode(':', $val_idx));
+                foreach ($update as $id => $add) {
+                    if ($add) $val_idx[$id] = 1;
+                    else unset($val_idx[$id]);
+                }
+                $val_idx = array_keys($val_idx);
+                $this->_saveIndexKey($metaname.'_p', '', $pid, $val_idx);
             }
+            unset($metaidx);
+            unset($metawords);
         }
+        return true;
     }
 
-    $words = array();
-    foreach ($tokens as $w => $c) {
-        $l = wordlen($w);
-        if(isset($words[$l])){
-            $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
-        }else{
-            $words[$l] = array($w => $c);
+    /**
+     * Remove a page from the index
+     *
+     * Erases entries in all known indexes.
+     *
+     * @param string    $page   a page name
+     * @return boolean          the function completed successfully
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    public function deletePage($page) {
+    }
+
+    /**
+     * Split the text into words for fulltext search
+     *
+     * TODO: does this also need &$stopwords ?
+     *
+     * @param string    $text   plain text
+     * @param boolean   $wc     are wildcards allowed?
+     * @return array            list of words in the text
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    public function tokenizer($text, $wc=false) {
+        global $conf;
+        $words = array();
+        $wc = ($wc) ? '' : '\*';
+        $stopwords =& idx_get_stopwords();
+
+        if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') {
+            if (0 == io_exec($conf['tokenizer_cmd'], $text, $output))
+                $text = $output;
+        } else {
+            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
+                // handle asian chars as single words (may fail on older PHP version)
+                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
+                if (!is_null($asia)) $text = $asia; // recover from regexp falure
+            }
+        }
+        $text = strtr($text, "\r\n\t", '   ');
+        if (preg_match('/[^0-9A-Za-z ]/u', $text))
+            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);
+
+        $wordlist = explode(' ', $text);
+        foreach ($wordlist as $word) {
+            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
+                utf8_strtolower($word) : strtolower($word);
+            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
+            if (array_search($word, $stopwords) !== false) continue;
+            $words[] = $word;
         }
+        return $words;
     }
 
-    // arrive here with $words = array(wordlen => array(word => frequency))
+    /**
+     * Find pages in the fulltext index containing the words,
+     *
+     * The search words must be pre-tokenized, meaning only letters and
+     * numbers with an optional wildcard
+     *
+     * The returned array will have the original tokens as key. The values
+     * in the returned list is an array with the page names as keys and the
+     * number of times that token appeas on the page as value.
+     *
+     * @param array     $tokens list of words to search for
+     * @return array            list of page names with usage counts
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    public function lookup($tokens) {
+        $result = array();
+        $wids = $this->_getIndexWords($tokens, $result);
+        if (empty($wids)) return array();
+        // load known words and documents
+        $page_idx = $this->_getIndex('page', '');
+        $docs = array();
+        foreach (array_keys($wids) as $wlen) {
+            $wids[$wlen] = array_unique($wids[$wlen]);
+            $index = $this->_getIndex('i', $wlen);
+            foreach($wids[$wlen] as $ixid) {
+                if ($ixid < count($index))
+                    $docs["$wlen*$ixid"] = $this->_parseTuples($page_idx, $index[$ixid]);
+            }
+        }
+        // merge found pages into final result array
+        $final = array();
+        foreach ($result as $word => $res) {
+            $final[$word] = array();
+            foreach ($res as $wid) {
+                $hits = &$docs[$wid];
+                foreach ($hits as $hitkey => $hitcnt) {
+                    // make sure the document still exists
+                    if (!page_exists($hitkey, '', false)) continue;
+                    if (!isset($final[$word][$hitkey]))
+                        $final[$word][$hitkey] = $hitcnt;
+                    else
+                        $final[$word][$hitkey] += $hitcnt;
+                }
+            }
+        }
+        return $final;
+    }
 
-    $word_idx_modified = false;
+    /**
+     * Find pages containing a metadata key.
+     *
+     * The metadata values are compared as case-sensitive strings. Pass a
+     * callback function that returns true or false to use a different
+     * comparison function
+     *
+     * @param string    $key    name of the metadata key to look for
+     * @param string    $value  search term to look for
+     * @param callback  $func   comparison function
+     * @return array            list with page names
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    public function lookupKey($key, $value, $func=null) {
+    }
 
-    $index = array(); //resulting index
-    foreach (array_keys($words) as $wlen){
-        $word_idx = idx_getIndex('w',$wlen);
-        foreach ($words[$wlen] as $word => $freq) {
-            $wid = array_search("$word\n",$word_idx);
-            if(!is_int($wid)){
-                $wid = count($word_idx);
-                $word_idx[] = "$word\n";
-                $word_idx_modified = true;
+    /**
+     * Find the index ID of each search term.
+     *
+     * The query terms should only contain valid characters, with a '*' at
+     * either the beginning or end of the word (or both).
+     * The $result parameter can be used to merge the index locations with
+     * the appropriate query term.
+     *
+     * @param array     $words  The query terms.
+     * @param arrayref  $result Set to word => array("length*id" ...)
+     * @return array            Set to length => array(id ...)
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _getIndexWords($words, &$result) {
+        $tokens = array();
+        $tokenlength = array();
+        $tokenwild = array();
+        foreach ($words as $word) {
+            $result[$word] = array();
+            $caret = false;
+            $dollar = false;
+            $xword = $word;
+            $wlen = wordlen($word);
+
+            // check for wildcards
+            if (substr($xword, 0, 1) == '*') {
+                $xword = substr($xword, 1);
+                $caret = true;
+                $wlen -= 1;
+            }
+            if (substr($xword, -1, 1) == '*') {
+                $xword = substr($xword, 0, -1);
+                $dollar = true;
+                $wlen -= 1;
+            }
+            if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword))
+                continue;
+            if (!isset($tokens[$xword]))
+                $tokenlength[$wlen][] = $xword;
+            if ($caret || $dollar) {
+                $re = preg_quote($xword, '/');
+                if ($caret) $re = '^'.$re;
+                if ($dollar) $re = $re.'$';
+                $tokens[$xword][] = array($word, '/'.$re.'/');
+                if (!isset($tokenwild[$xword]))
+                    $tokenwild[$xword] = $wlen;
+            } else {
+                $tokens[$xword][] = array($word, null);
             }
-            if(!isset($index[$wlen]))
-                $index[$wlen] = array();
-            $index[$wlen][$wid] = $freq;
         }
+        asort($tokenwild);
+        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
+        // $tokenlength = array( base word length => base word ... )
+        // $tokenwild = array( base word => base word length ... )
+        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
+        $indexes_known = $this->_indexLengths($length_filter);
+        if (!empty($tokenwild)) sort($indexes_known);
+        // get word IDs
+        $wids = array();
+        foreach ($indexes_known as $ixlen) {
+            $word_idx = $this->_getIndex('w', $ixlen);
+            // handle exact search
+            if (isset($tokenlength[$ixlen])) {
+                foreach ($tokenlength[$ixlen] as $xword) {
+                    $wid = array_search($xword, $word_idx);
+                    if ($wid !== false) {
+                        $wids[$ixlen][] = $wid;
+                        foreach ($tokens[$xword] as $w)
+                            $result[$w[0]][] = "$ixlen*$wid";
+                    }
+                }
+            }
+            // handle wildcard search
+            foreach ($tokenwild as $xword => $wlen) {
+                if ($wlen >= $ixlen) break;
+                foreach ($tokens[$xword] as $w) {
+                    if (is_null($w[1])) continue;
+                    foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
+                        $wids[$ixlen][] = $wid;
+                        $result[$w[0]][] = "$ixlen*$wid";
+                    }
+                }
+            }
+        }
+        return $wids;
+    }
 
-        // save back word index
-        if($word_idx_modified && !idx_saveIndex('w',$wlen,$word_idx)){
-            trigger_error("Failed to write word index", E_USER_ERROR);
-            return false;
+    /**
+     * Return a list of all pages
+     *
+     * @param string    $key    list only pages containing the metadata key (optional)
+     * @return array            list of page names
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    public function getPages($key=null) {
+        $page_idx = $this->_getIndex('page', '');
+        if (is_null($key)) return $page_idx;
+    }
+
+    /**
+     * Return a list of words sorted by number of times used
+     *
+     * @param int       $min    bottom frequency threshold
+     * @param int       $max    upper frequency limit. No limit if $max<$min
+     * @param string    $key    metadata key to list. Uses the fulltext index if not given
+     * @return array            list of words as the keys and frequency as values
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    public function histogram($min=1, $max=0, $key=null) {
+    }
+
+    /**
+     * Lock the indexer.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _lock() {
+        global $conf;
+        $status = true;
+        $lock = $conf['lockdir'].'/_indexer.lock';
+        while (!@mkdir($lock, $conf['dmode'])) {
+            usleep(50);
+            if (time() - @filemtime($lock) > 60*5) {
+                // looks like a stale lock, remove it
+                @rmdir($lock);
+                $status = "stale lock removed";
+            } else {
+                return false;
+            }
         }
+        if ($conf['dperm'])
+            chmod($lock, $conf['dperm']);
+        return $status;
     }
 
-    return $index;
-}
+    /**
+     * Release the indexer lock.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _unlock() {
+        global $conf;
+        @rmdir($conf['lockdir'].'/_indexer.lock');
+        return true;
+    }
 
-/**
- * Adds/updates the search for the given page
- *
- * This is the core function of the indexer which does most
- * of the work. This function needs to be called with proper
- * locking!
- *
- * @author Andreas Gohr <andi@splitbrain.org>
- */
-function idx_addPage($page){
-    global $conf;
+    /**
+     * Retrieve the entire index.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _getIndex($idx, $suffix) {
+        global $conf;
+        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
+        if (!@file_exists($fn, FILE_IGNORE_NEW_LINES)) return array();
+        return file($fn);
+    }
 
-    // load known documents
-    $page_idx = idx_getIndex('page','');
+    /**
+     * Replace the contents of the index with an array.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _saveIndex($idx, $suffix, &$lines) {
+        global $conf;
+        $fn = $conf['indexdir'].'/'.$idx.$suffix;
+        $fh = @fopen($fn.'.tmp', 'w');
+        if (!$fh) return false;
+        fwrite($fh, join("\n", $lines));
+        fclose($fh);
+        if (isset($conf['fperm']))
+            chmod($fn.'.tmp', $conf['fperm']);
+        io_rename($fn.'.tmp', $fn.'.idx');
+        if ($suffix !== '')
+            $this->_cacheIndexDir($idx, $suffix, empty($lines));
+        return true;
+    }
 
-    // get page id (this is the linenumber in page.idx)
-    $pid = array_search("$page\n",$page_idx);
-    if(!is_int($pid)){
-        $pid = count($page_idx);
-        // page was new - write back
-        if (!idx_appendIndex('page','',"$page\n")){
-            trigger_error("Failed to write page index", E_USER_ERROR);
-            return false;
+    /**
+     * Retrieve a line from the index.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _getIndexKey($idx, $suffix, $id) {
+        global $conf;
+        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
+        if (!@file_exists($fn)) return '';
+        $fh = @fopen($fn, 'r');
+        if (!$fh) return '';
+        $ln = -1;
+        while (($line = fgets($fh)) !== false) {
+            if (++$ln == $id) break;
         }
+        fclose($fh);
+        return rtrim((string)$line);
     }
-    unset($page_idx); // free memory
-
-    idx_saveIndexLine('title', '', $pid, p_get_first_heading($page, false));
-
-    $pagewords = array();
-    // get word usage in page
-    $words = idx_getPageWords($page);
-    if($words === false) return false;
-
-    if(!empty($words)) {
-        foreach(array_keys($words) as $wlen){
-            $index = idx_getIndex('i',$wlen);
-            foreach($words[$wlen] as $wid => $freq){
-                if($wid<count($index)){
-                    $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq);
-                }else{
-                    // New words **should** have been added in increasing order
-                    // starting with the first unassigned index.
-                    // If someone can show how this isn't true, then I'll need to sort
-                    // or do something special.
-                    $index[$wid] = idx_updateIndexLine('',$pid,$freq);
-                }
-                $pagewords[] = "$wlen*$wid";
+
+    /**
+     * Write a line into the index.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _saveIndexKey($idx, $suffix, $id, $line) {
+        global $conf;
+        if (substr($line, -1) != "\n")
+            $line .= "\n";
+        $fn = $conf['indexdir'].'/'.$idx.$suffix;
+        $fh = @fopen($fn.'.tmp', 'w');
+        if (!fh) return false;
+        $ih = @fopen($fn.'.idx', 'r');
+        if ($ih) {
+            $ln = -1;
+            while (($curline = fgets($ih)) !== false) {
+                fwrite($fh, (++$ln == $id) ? $line : $curline);
             }
-            // save back word index
-            if(!idx_saveIndex('i',$wlen,$index)){
-                trigger_error("Failed to write index", E_USER_ERROR);
+            if ($id > $ln)
+                fwrite($fh, $line);
+            fclose($ih);
+        } else {
+            fwrite($fh, $line);
+        }
+        fclose($fh);
+        if (isset($conf['fperm']))
+            chmod($fn.'.tmp', $conf['fperm']);
+        io_rename($fn.'.tmp', $fn.'.idx');
+        if ($suffix !== '')
+            $this->_cacheIndexDir($idx, $suffix);
+        return true;
+    }
+
+    /**
+     * Retrieve or insert a value in the index.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _addIndexKey($idx, $suffix, $value) {
+        $index = $this->_getIndex($idx, $suffix);
+        $id = array_search($value, $index);
+        if ($id === false) {
+            $id = count($index);
+            $index[$id] = $value;
+            if (!$this->_saveIndex($idx, $suffix, $index)) {
+                trigger_error("Failed to write $idx index", E_USER_ERROR);
                 return false;
             }
         }
+        return $id;
+    }
+
+    private function _cacheIndexDir($idx, $suffix, $delete=false) {
+        global $conf;
+        if ($idx == 'i')
+            $cachename = $conf['indexdir'].'/lengths';
+        else
+            $cachename = $conf['indexdir'].'/'.$idx.'lengths';
+        $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+        if ($lengths === false) $lengths = array();
+        $old = array_search((string)$suffix, $lengths);
+        if (empty($lines)) {
+            if ($old === false) return;
+            unset($lengths[$old]);
+        } else {
+            if ($old !== false) return;
+            $lengths[] = $suffix;
+            sort($lengths);
+        }
+        $fh = @fopen($cachename.'.tmp', 'w');
+        if (!$fh) {
+            trigger_error("Failed to write index cache", E_USER_ERROR);
+            return;
+        }
+        @fwrite($fh, implode("\n", $lengths));
+        @fclose($fh);
+        if (isset($conf['fperm']))
+            chmod($cachename.'.tmp', $conf['fperm']);
+        io_rename($cachename.'.tmp', $cachename.'.idx');
+    }
+
+    /**
+     * Get the list of lengths indexed in the wiki.
+     *
+     * Read the index directory or a cache file and returns
+     * a sorted array of lengths of the words used in the wiki.
+     *
+     * @author YoBoY <yoboy.leguesh@gmail.com>
+     */
+    private function _listIndexLengths() {
+        global $conf;
+        $cachename = $conf['indexdir'].'/lengths';
+        clearstatcache();
+        if (@file_exists($cachename.'.idx')) {
+            $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+            if ($lengths !== false) {
+                $idx = array();
+                foreach ($lengths as $length)
+                    $idx[] = (int)$length;
+                return $idx;
+            }
+        }
+
+        $dir = @opendir($conf['indexdir']);
+        if ($dir === false)
+            return array();
+        $lengths[] = array();
+        while (($f = readdir($dir)) !== false) {
+            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
+                $i = substr($f, 1, -4);
+                if (is_numeric($i))
+                    $lengths[] = (int)$i;
+            }
+        }
+        closedir($dir);
+        sort($lengths);
+        // save this in a file
+        $fh = @fopen($cachename.'.tmp', 'w');
+        if (!$fh) {
+            trigger_error("Failed to write index cache", E_USER_ERROR);
+            return;
+        }
+        @fwrite($fh, implode("\n", $lengths));
+        @fclose($fh);
+        if (isset($conf['fperm']))
+            chmod($cachename.'.tmp', $conf['fperm']);
+        io_rename($cachename.'.tmp', $cachename.'.idx');
+
+        return $lengths;
     }
 
-    // Remove obsolete index entries
-    $pageword_idx = trim(idx_getIndexLine('pageword','',$pid));
-    if ($pageword_idx !== '') {
-        $oldwords = explode(':',$pageword_idx);
-        $delwords = array_diff($oldwords, $pagewords);
-        $upwords = array();
-        foreach ($delwords as $word) {
-            if($word=='') continue;
-            list($wlen,$wid) = explode('*',$word);
-            $wid = (int)$wid;
-            $upwords[$wlen][] = $wid;
-        }
-        foreach ($upwords as $wlen => $widx) {
-            $index = idx_getIndex('i',$wlen);
-            foreach ($widx as $wid) {
-                $index[$wid] = idx_updateIndexLine($index[$wid],$pid,0);
+    /**
+     * Get the word lengths that have been indexed.
+     *
+     * Reads the index directory and returns an array of lengths
+     * that there are indices for.
+     *
+     * @author YoBoY <yoboy.leguesh@gmail.com>
+     */
+    private function _indexLengths($filter) {
+        global $conf;
+        $idx = array();
+        if (is_array($filter)) {
+            // testing if index files exist only
+            $path = $conf['indexdir']."/i";
+            foreach ($filter as $key => $value) {
+                if (@file_exists($path.$key.'.idx'))
+                    $idx[] = $key;
+            }
+        } else {
+            $lengths = idx_listIndexLengths();
+            foreach ($lengths as $key => $length) {
+                // keep all the values equal or superior
+                if ((int)$length >= (int)$filter)
+                    $idx[] = $length;
             }
-            idx_saveIndex('i',$wlen,$index);
         }
+        return $idx;
     }
-    // Save the reverse index
-    $pageword_idx = join(':',$pagewords)."\n";
-    if(!idx_saveIndexLine('pageword','',$pid,$pageword_idx)){
-        trigger_error("Failed to write word index", E_USER_ERROR);
-        return false;
+
+    /**
+     * Insert or replace a tuple in a line.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     */
+    private function _updateTuple($line, $id, $count) {
+        $newLine = $line;
+        if ($newLine !== '')
+            $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine);
+        $newLine = trim($newLine, ':');
+        if ($count) {
+            if ($strlen($newLine) > 0)
+                return "$id*$count:".$newLine;
+            else
+                return "$id*$count".$newLine;
+        }
+        return $newLine;
     }
 
-    return true;
+    /**
+     * Split a line into an array of tuples.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    private function _parseTuples(&$keys, $line) {
+        $result = array();
+        if ($line == '') return $result;
+        $parts = explode(':', $line);
+        foreach ($parts as $tuple) {
+            if ($tuple == '') continue;
+            list($key, $cnt) = explode('*', $tuple);
+            if (!$cnd) continue;
+            $key = $keys[$key];
+            if (!$key) continue;
+            $result[$key] = $cnt;
+        }
+        return $result;
+    }
 }
 
 /**
- * Write a new index line to the filehandle
- *
- * This function writes an line for the index file to the
- * given filehandle. It removes the given document from
- * the given line and readds it when $count is >0.
+ * Create an instance of the indexer.
  *
- * @deprecated - see idx_updateIndexLine
- * @author Andreas Gohr <andi@splitbrain.org>
+ * @return object               a Doku_Indexer
+ * @author Tom N Harris <tnharris@whoopdedo.org>
  */
-function idx_writeIndexLine($fh,$line,$pid,$count){
-    fwrite($fh,idx_updateIndexLine($line,$pid,$count));
+function & idx_get_indexer() {
+    static $Indexer = null;
+    if (is_null($Indexer)) {
+        $Indexer = new Doku_Indexer();
+    }
+    return $Indexer;
 }
 
 /**
- * Modify an index line with new information
- *
- * This returns a line of the index. It removes the
- * given document from the line and readds it if
- * $count is >0.
+ * Returns words that will be ignored.
  *
+ * @return array                list of stop words
  * @author Tom N Harris <tnharris@whoopdedo.org>
- * @author Andreas Gohr <andi@splitbrain.org>
  */
-function idx_updateIndexLine($line,$pid,$count){
-    if ($line == ''){
-        $newLine = "\n";
-    }else{
-        $newLine = preg_replace('/(^|:)'.preg_quote($pid, '/').'\*\d*/', '', $line);
-    }
-    if ($count){
-        if (strlen($newLine) > 1){
-            return "$pid*$count:".$newLine;
+function & idx_get_stopwords() {
+    static $stopwords = null;
+    if (is_null($stopwords)) {
+        global $conf;
+        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
+        if(@file_exists($swfile)){
+            $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
         }else{
-            return "$pid*$count".$newLine;
+            $stopwords = array();
         }
     }
-    return $newLine;
+    return $stopwords;
+}
+
+/**
+ * Adds/updates the search index for the given page
+ *
+ * Locking is handled internally.
+ *
+ * @param string        $page   name of the page to index
+ * @return boolean              the function completed successfully
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_addPage($page) {
+    $body = '';
+    $data = array($page, $body);
+    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
+    if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($page);
+    $evt->advise_after();
+    unset($evt);
+    list($page,$body) = $data;
+
+    $Indexer =& idx_get_indexer();
+    return $Indexer->addPageWords($page, $body);
+}
+
+/**
+ * Find tokens in the fulltext index
+ *
+ * Takes an array of words and will return a list of matching
+ * pages for each one.
+ *
+ * Important: No ACL checking is done here! All results are
+ *            returned, regardless of permissions
+ *
+ * @param array         $words  list of words to search for
+ * @return array                list of pages found, associated with the search terms
+ */
+function idx_lookup($words) {
+    $Indexer =& idx_get_indexer();
+    return $Indexer->lookup($words);
+}
+
+/**
+ * Split a string into tokens
+ *
+ */
+function idx_tokenizer($string, $wc=false) {
+    $Indexer =& idx_get_indexer();
+    return $Indexer->tokenizer($string, $wc);
 }
 
+/* For compatibility */
+
 /**
- * Get the list of lenghts indexed in the wiki
+ * Read the list of words in an index (if it exists).
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_getIndex($idx, $suffix) {
+    global $conf;
+    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
+    if (!@file_exists($fn)) return array();
+    return file($fn);
+}
+
+/**
+ * Get the list of lengths indexed in the wiki.
  *
  * Read the index directory or a cache file and returns
  * a sorted array of lengths of the words used in the wiki.
@@ -419,10 +912,11 @@ function idx_listIndexLengths() {
         $docache = false;
     } else {
         clearstatcache();
-        if (@file_exists($conf['indexdir'].'/lengths.idx') and (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
-            if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) ) !== false) {
+        if (@file_exists($conf['indexdir'].'/lengths.idx')
+        && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
+            if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) !== false) {
                 $idx = array();
-                foreach ( $lengths as $length) {
+                foreach ($lengths as $length) {
                     $idx[] = (int)$length;
                 }
                 return $idx;
@@ -431,24 +925,24 @@ function idx_listIndexLengths() {
         $docache = true;
     }
 
-    if ($conf['readdircache'] == 0 or $docache ) {
+    if ($conf['readdircache'] == 0 || $docache) {
         $dir = @opendir($conf['indexdir']);
-        if($dir===false)
+        if ($dir === false)
             return array();
         $idx[] = array();
         while (($f = readdir($dir)) !== false) {
-            if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
-                $i = substr($f,1,-4);
+            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
+                $i = substr($f, 1, -4);
                 if (is_numeric($i))
                     $idx[] = (int)$i;
             }
         }
         closedir($dir);
         sort($idx);
-        // we save this in a file.
-        if ($docache === true) {
-            $handle = @fopen($conf['indexdir'].'/lengths.idx','w');
-            @fwrite($handle, implode("\n",$idx));
+        // save this in a file
+        if ($docache) {
+            $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
+            @fwrite($handle, implode("\n", $idx));
             @fclose($handle);
         }
         return $idx;
@@ -465,250 +959,40 @@ function idx_listIndexLengths() {
  *
  * @author YoBoY <yoboy.leguesh@gmail.com>
  */
-function idx_indexLengths(&$filter){
+function idx_indexLengths($filter) {
     global $conf;
     $idx = array();
-    if (is_array($filter)){
-        // testing if index files exists only
+    if (is_array($filter)) {
+        // testing if index files exist only
+        $path = $conf['indexdir']."/i";
         foreach ($filter as $key => $value) {
-            if (@file_exists($conf['indexdir']."/i$key.idx")) {
+            if (@file_exists($path.$key.'.idx'))
                 $idx[] = $key;
-            }
         }
     } else {
         $lengths = idx_listIndexLengths();
-        foreach ( $lengths as $key => $length) {
-            // we keep all the values equal or superior
-            if ((int)$length >= (int)$filter) {
+        foreach ($lengths as $key => $length) {
+            // keep all the values equal or superior
+            if ((int)$length >= (int)$filter)
                 $idx[] = $length;
-            }
         }
     }
     return $idx;
 }
 
 /**
- * Find the the index number of each search term.
+ * Clean a name of a key for use as a file name.
  *
- * This will group together words that appear in the same index.
- * So it should perform better, because it only opens each index once.
- * Actually, it's not that great. (in my experience) Probably because of the disk cache.
- * And the sorted function does more work, making it slightly slower in some cases.
- *
- * @param array    $words   The query terms. Words should only contain valid characters,
- *                          with a '*' at either the beginning or end of the word (or both)
- * @param arrayref $result  Set to word => array("length*id" ...), use this to merge the
- *                          index locations with the appropriate query term.
- * @return array            Set to length => array(id ...)
+ * Romanizes non-latin characters, then strips away anything that's
+ * not a letter, number, or underscore.
  *
  * @author Tom N Harris <tnharris@whoopdedo.org>
  */
-function idx_getIndexWordsSorted($words,&$result){
-    // parse and sort tokens
-    $tokens = array();
-    $tokenlength = array();
-    $tokenwild = array();
-    foreach($words as $word){
-        $result[$word] = array();
-        $wild = 0;
-        $xword = $word;
-        $wlen = wordlen($word);
-
-        // check for wildcards
-        if(substr($xword,0,1) == '*'){
-            $xword = substr($xword,1);
-            $wild |= 1;
-            $wlen -= 1;
-        }
-        if(substr($xword,-1,1) == '*'){
-            $xword = substr($xword,0,-1);
-            $wild |= 2;
-            $wlen -= 1;
-        }
-        if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
-        if(!isset($tokens[$xword])){
-            $tokenlength[$wlen][] = $xword;
-        }
-        if($wild){
-            $ptn = preg_quote($xword,'/');
-            if(($wild&1) == 0) $ptn = '^'.$ptn;
-            if(($wild&2) == 0) $ptn = $ptn.'$';
-            $tokens[$xword][] = array($word, '/'.$ptn.'/');
-            if(!isset($tokenwild[$xword])) $tokenwild[$xword] = $wlen;
-        }else
-            $tokens[$xword][] = array($word, null);
-    }
-    asort($tokenwild);
-    // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
-    // $tokenlength = array( base word length => base word ... )
-    // $tokenwild = array( base word => base word length ... )
-
-    $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
-    $indexes_known = idx_indexLengths($length_filter);
-    if(!empty($tokenwild)) sort($indexes_known);
-    // get word IDs
-    $wids = array();
-    foreach($indexes_known as $ixlen){
-        $word_idx = idx_getIndex('w',$ixlen);
-        // handle exact search
-        if(isset($tokenlength[$ixlen])){
-            foreach($tokenlength[$ixlen] as $xword){
-                $wid = array_search("$xword\n",$word_idx);
-                if(is_int($wid)){
-                    $wids[$ixlen][] = $wid;
-                    foreach($tokens[$xword] as $w)
-                        $result[$w[0]][] = "$ixlen*$wid";
-                }
-            }
-        }
-        // handle wildcard search
-        foreach($tokenwild as $xword => $wlen){
-            if($wlen >= $ixlen) break;
-            foreach($tokens[$xword] as $w){
-                if(is_null($w[1])) continue;
-                foreach(array_keys(preg_grep($w[1],$word_idx)) as $wid){
-                    $wids[$ixlen][] = $wid;
-                    $result[$w[0]][] = "$ixlen*$wid";
-                }
-            }
-        }
-    }
-    return $wids;
-}
-
-/**
- * Lookup words in index
- *
- * Takes an array of word and will return a list of matching
- * documents for each one.
- *
- * Important: No ACL checking is done here! All results are
- *            returned, regardless of permissions
- *
- * @author Andreas Gohr <andi@splitbrain.org>
- */
-function idx_lookup($words){
-    global $conf;
-
-    $result = array();
-
-    $wids = idx_getIndexWordsSorted($words, $result);
-    if(empty($wids)) return array();
-
-    // load known words and documents
-    $page_idx = idx_getIndex('page','');
-
-    $docs = array();                          // hold docs found
-    foreach(array_keys($wids) as $wlen){
-        $wids[$wlen] = array_unique($wids[$wlen]);
-        $index = idx_getIndex('i',$wlen);
-        foreach($wids[$wlen] as $ixid){
-            if($ixid < count($index))
-                $docs["$wlen*$ixid"] = idx_parseIndexLine($page_idx,$index[$ixid]);
-        }
-    }
-
-    // merge found pages into final result array
-    $final = array();
-    foreach($result as $word => $res){
-        $final[$word] = array();
-        foreach($res as $wid){
-            $hits = &$docs[$wid];
-            foreach ($hits as $hitkey => $hitcnt) {
-                if (!isset($final[$word][$hitkey])) {
-                    $final[$word][$hitkey] = $hitcnt;
-                } else {
-                    $final[$word][$hitkey] += $hitcnt;
-                }
-            }
-        }
-    }
-    return $final;
-}
-
-/**
- * Returns a list of documents and counts from a index line
- *
- * It omits docs with a count of 0 and pages that no longer
- * exist.
- *
- * @param  array  $page_idx The list of known pages
- * @param  string $line     A line from the main index
- * @author Andreas Gohr <andi@splitbrain.org>
- */
-function idx_parseIndexLine(&$page_idx,$line){
-    $result = array();
-
-    $line = trim($line);
-    if($line == '') return $result;
-
-    $parts = explode(':',$line);
-    foreach($parts as $part){
-        if($part == '') continue;
-        list($doc,$cnt) = explode('*',$part);
-        if(!$cnt) continue;
-        $doc = trim($page_idx[$doc]);
-        if(!$doc) continue;
-        // make sure the document still exists
-        if(!page_exists($doc,'',false)) continue;
-
-        $result[$doc] = $cnt;
-    }
-    return $result;
-}
-
-/**
- * Tokenizes a string into an array of search words
- *
- * Uses the same algorithm as idx_getPageWords()
- * Takes an arbitrarily complex string and returns a list of words
- * suitable for indexing. The string may include spaces and line
- * breaks
- *
- * @param string   $string     the query as given by the user
- * @param arrayref $stopwords  array of stopwords
- * @param boolean  $wc         are wildcards allowed?
- * @return array               list of indexable words
- * @author Tom N Harris <tnharris@whoopdedo.org>
- * @author Andreas Gohr <andi@splitbrain.org>
- */
-function idx_tokenizer($string,&$stopwords,$wc=false){
-    global $conf;
-    $words = array();
-    $wc = ($wc) ? '' : $wc = '\*';
-
-    if (!$stopwords)
-        $sw = array();
-    else
-        $sw =& $stopwords;
-
-    if ($conf['external_tokenizer']) {
-        if (0 == io_exec($conf['tokenizer_cmd'], $string, $output))
-            $string = $output;
-    } else {
-        if(preg_match('/[^0-9A-Za-z ]/u', $string)) {
-            // handle asian chars as single words (may fail on older PHP version)
-            $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
-            if(!is_null($asia)) $string = $asia; //recover from regexp failure
-        }
-    }
-    $string = strtr($string, "\r\n\t", '   ');
-    if(preg_match('/[^0-9A-Za-z ]/u', $string))
-        $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc);
-
-    $wordlist = explode(' ', $string);
-    foreach ($wordlist as $word) {
-        if(preg_match('/[^0-9A-Za-z]/u', $word)){
-            $word = utf8_strtolower($word);
-        }else{
-            $word = strtolower($word);
-        }
-        if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
-        if(is_int(array_search("$word\n",$stopwords))) continue;
-        $words[] = $word;
-    }
-
-    return $words;
+function idx_cleanName($name) {
+    $name = utf8_romanize(trim((string)$name));
+    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
+    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
+    return strtolower($name);
 }
 
-//Setup VIM: ex: et ts=4 enc=utf-8 :
+//Setup VIM: ex: et ts=4 :
author	Tom N Harris <tnharris@whoopdedo.org>	2010-12-27 20:30:46 -0500
committer	Tom N Harris <tnharris@whoopdedo.org>	2010-12-27 20:30:46 -0500
commit	00803e562833be06ab5a869541581314b9b84d58 (patch)
tree	83586e2ae8081ad8000070e03982368b927e4c5f /inc/indexer.php
parent	3c4b38902b6f6d32222611b22087d5d41d20de6e (diff)
download	rpg-00803e562833be06ab5a869541581314b9b84d58.tar.gz rpg-00803e562833be06ab5a869541581314b9b84d58.tar.bz2