6 files changed, 261 insertions, 138 deletions
diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php
index f3fd8286a..0df72bce4 100644
--- a/conf/dokuwiki.php
+++ b/conf/dokuwiki.php
@@ -130,3 +130,5 @@ $conf['ftp']['user'] = 'user';
 $conf['ftp']['pass'] = 'password';
 $conf['ftp']['root'] = '/home/user/htdocs';
 
+/* FIXME: delete when no longer needed */
+$conf['test_indexer'] = 0;
diff --git a/inc/indexer.php b/inc/indexer.php
index 3789098e0..09b43a1b5 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -17,15 +17,48 @@
 // Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
 // I'm no language expert. If you think some ranges are wrongly chosen or
 // a range is missing, please contact me
-define('IDX_ASIAN','['.
-                   '\x{0E00}-\x{0E7F}'.  // Thai
-                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
+define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
+define('IDX_ASIAN2','['.
+                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
+                   '\x{309D}-\x{30A0}'.
+                   '\x{30FB}-\x{31EF}\x{3200}-\x{D7AF}'.
                    '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                    '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                    ']');
+define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
+                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
+                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
+                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
+                   '\x{308F}-\x{3094}'.
+                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
+                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
+                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
+                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
+                   ']['.
+                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
+                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
+                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
+                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
+                   '\x{31F0}-\x{31FF}'.
+                   ']?');
 
 
 /**
+ * Measure the length of a word, for use as the length key of the index.
+ * Differs from strlen in handling of asian characters.
+ * @param  string $w  the word to measure
+ * @return int        strlen($w), plus (lead byte - 0xE1) if $w has an IDX_ASIAN2 character
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function wordlen($w){
+    $l = strlen($w);
+    // Left alone, all chinese "words" would go into w3.idx, so their "length" is faked.
+    if(preg_match('/'.IDX_ASIAN2.'/u',$w,$m))
+        $l += ord($m[0]) - 0xE1;  // lead byte (0xE2-0xEF) of the matched character, not of $w
+    return $l;
+}
+
+/**
  * Write a list of strings to an index file.
  *
  * @author Tom N Harris <tnharris@whoopdedo.org>
@@ -113,7 +146,7 @@ function idx_getPageWords($page){
         $arr = idx_tokenizer($word,$stopwords);
         $arr = array_count_values($arr);
         foreach ($arr as $w => $c) {
-            $l = strlen($w);
+            $l = wordlen($w);
             if(isset($words[$l])){
                 $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
             }else{
@@ -130,8 +163,8 @@ function idx_getPageWords($page){
         foreach ($words[$wlen] as $word => $freq) {
             $wid = array_search("$word\n",$word_idx);
             if(!is_int($wid)){
+                $wid = count($word_idx);
                 $word_idx[] = "$word\n";
-                $wid = count($word_idx)-1;
             }
             if(!isset($index[$wlen]))
                 $index[$wlen] = array();
@@ -169,8 +202,10 @@ function idx_addPage($page){
         $page_idx[] = "$page\n";
         $pid = count($page_idx)-1;
         // page was new - write back
-        if (!idx_saveIndex('page','',$page_idx))
+        if (!idx_saveIndex('page','',$page_idx)){
+            trigger_error("Failed to write page index", E_USER_ERROR);
             return false;
+        }
     }
 
     // get word usage in page
@@ -179,47 +214,23 @@ function idx_addPage($page){
     if(!count($words)) return true;
 
     foreach(array_keys($words) as $wlen){
-        // Open index and temp file
-        $fn = $conf['indexdir']."/i$wlen";
-        idx_touchIndex('i',$wlen);
-        $idx = fopen($fn.'.idx','r');
-        $tmp = fopen($fn.'.tmp','w');
-        if(!$idx || !$tmp){
-            trigger_error("Failed to open index files", E_USER_ERROR);
-            return false;
-        }
-
-        // copy from index to temp file, modifying where needed
-        $lno = 0;
-        $line = '';
-        while (!feof($idx)) {
-            // read full line
-            $line .= fgets($idx, 4096);
-            if(substr($line,-1) != "\n") continue;
-
-            // write a new Line to temp file
-            idx_writeIndexLine($tmp,$line,$pid,$words[$wlen][$lno]);
-
-            $line = ''; // reset line buffer
-            $lno++;     // increase linecounter
+        $index = idx_getIndex('i',$wlen);
+        foreach($words[$wlen] as $wid => $freq){
+            if($wid<count($index)){
+                $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq);
+            }else{
+                // New words **should** have been added in increasing order
+                // starting with the first unassigned index.
+                // If someone can show how this isn't true, then I'll need to sort
+                // or do something special.
+                $index[$wid] = idx_updateIndexLine('',$pid,$freq);
+            }
         }
-        fclose($idx);
-
-        // add missing lines (usually index and word should contain
-        // the same number of lines, however if the page contained
-        // new words the word file has some more lines which need to
-        // be added here
-        $word_idx = idx_getIndex('w',$wlen);
-        $wcnt = count($word_idx);
-        for($lno; $lno<$wcnt; $lno++){
-            idx_writeIndexLine($tmp,'',$pid,$words[$wlen][$lno]);
+        // save back word index
+        if(!idx_saveIndex('i',$wlen,$index)){
+            trigger_error("Failed to write index", E_USER_ERROR);
+            return false;
         }
-
-        // close the temp file and move it over to be the new one
-        fclose($tmp);
-        if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
-        // try rename first (fast) fallback to copy (slow)
-        io_rename($fn.'.tmp', $fn.'.idx');
     }
 
     return true;
@@ -232,11 +243,26 @@ function idx_addPage($page){
  * given filehandle. It removes the given document from
  * the given line and readds it when $count is >0.
  *
+ * @deprecated - see idx_updateIndexLine
  * @author Andreas Gohr <andi@splitbrain.org>
  */
 function idx_writeIndexLine($fh,$line,$pid,$count){
-    $line = trim($line);
+    fwrite($fh,idx_updateIndexLine($line,$pid,$count));  // the returned line already ends in "\n"
+}
 
+/**
+ * Modify an index line with new information
+ *
+ * This returns a line of the index. It removes the 
+ * given document from the line and readds it if 
+ * $count is >0.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_updateIndexLine($line,$pid,$count){
+    $line = trim($line);
+    $updated = array();
     if($line != ''){
         $parts = explode(':',$line);
         // remove doc from given line
@@ -244,18 +270,17 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
             if($part == '') continue;
             list($doc,$cnt) = explode('*',$part);
             if($doc != $pid){
-                fwrite($fh,"$doc*$cnt:");
+                $updated[] = $part;
             }
         }
     }
 
     // add doc
     if ($count){
-        fwrite($fh,"$pid*$count");
+        $updated[] = "$pid*$count";
     }
 
-    // add newline
-    fwrite($fh,"\n");
+    return join(':',$updated)."\n";
 }
 
 /**
@@ -266,20 +291,30 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
  *
  * @author Tom N Harris <tnharris@whoopdedo.org>
  */
-function idx_indexLengths($minlen){
+function idx_indexLengths(&$filter){
     global $conf;
     $dir = @opendir($conf['indexdir']);
     if($dir===false)
         return array();
     $idx = array();
-    // Exact match first.
-    if(@file_exists($conf['indexdir']."/i$minlen.idx"))
-        $idx[] = $minlen;
-    while (($f = readdir($dir)) !== false) {
-        if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
-            $i = substr($f,1,-4);
-            if (is_numeric($i) && $i > $minlen)
-                $idx[] = $i;
+    if(is_array($filter)){
+        while (($f = readdir($dir)) !== false) {
+            if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
+                $i = substr($f,1,-4);
+                if (is_numeric($i) && isset($filter[(int)$i]))
+                    $idx[] = (int)$i;
+            }
+        }
+    }else{
+        // Exact match first.
+        if(@file_exists($conf['indexdir']."/i$filter.idx"))
+            $idx[] = $filter;
+        while (($f = readdir($dir)) !== false) {
+            if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
+                $i = substr($f,1,-4);
+                if (is_numeric($i) && $i > $filter)
+                    $idx[] = (int)$i;
+            }
         }
     }
     closedir($dir);
@@ -287,118 +322,182 @@ function idx_indexLengths($minlen){
 }
 
 /**
- * Lookup words in index
+ * Find the index number of each search term.
  *
- * Takes an array of word and will return a list of matching
- * documents for each one.
+ * There are two variations: Simple and Sorted.
+ * The simple version just takes the words one at a time.
+ * The sorted version groups together words that appear in the same index,
+ * so it should perform better because it only opens each index once.
+ * In my experience the gain is small, though, probably because of the disk cache,
+ * and the sorted function does more work, making it slightly slower in some cases.
  *
- * Important: No ACL checking is done here! All results are
- *            returned, regardless of permissions
+ * For now, you can choose to use the sorted version by setting $conf['test_indexer'] = 1
+ * Eventually, the more worthy will be chosen and the loser cast into the deepest depths.
  *
- * @author Andreas Gohr <andi@splitbrain.org>
+ * @param array    $words   The query terms. Words should only contain valid characters,
+ *                          with a '*' at either the beginning or end of the word (or both)
+ * @param arrayref $result  Set to word => array("length*id" ...), use this to merge the 
+ *                          index locations with the appropriate query term.
+ * @return array            Set to length => array(id ...)
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
  */
-function idx_lookup($words){
-    global $conf;
-
-    $result = array();
-
-    // load known words and documents
-    $page_idx = idx_getIndex('page','');
-
+function idx_getIndexWordsSimple($words, &$result){
     // get word IDs
     $wids = array();
     foreach($words as $word){
         $result[$word] = array();
         $wild = 0;
         $xword = $word;
-        $wlen = strlen($word);
+        $wlen = wordlen($word);  // still counts any '*'; reduced below per wildcard
 
         // check for wildcards
         if(substr($xword,0,1) == '*'){
             $xword = substr($xword,1);
-            $wild  = 1;
-            $ptn = '/'.preg_quote($xword,'/').'$/';
+            $wild |= 1;  // bit 0: word started with '*'
             $wlen -= 1;
-#            $l = -1*strlen($xword)-1;
         }
         if(substr($xword,-1,1) == '*'){
             $xword = substr($xword,0,-1);
-            $wild += 2;
+            $wild |= 2;  // bit 1: word ended with '*'
             $wlen -= 1;
         }
         if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
 
         // look for the ID(s) for the given word
         if($wild){  // handle wildcard search
+            $ptn = preg_quote($xword,'/');  // build the preg_grep pattern from the bare word
+            if(($wild&1) == 0) $ptn = '^'.$ptn;  // no leading '*': anchor at the start
+            if(($wild&2) == 0) $ptn = $ptn.'$';  // no trailing '*': anchor at the end
+            $ptn = '/'.$ptn.'/';
             foreach (idx_indexLengths($wlen) as $ixlen){
                 $word_idx = idx_getIndex('w',$ixlen);
-                $cnt = count($word_idx);
-                for($wid=0; $wid<$cnt; $wid++){
-                    $iword = $word_idx[$wid];
-                    if( (($wild==3) && is_int(strpos($iword,$xword))) ||
-#                        (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
-                        (($wild==1) && preg_match($ptn,$iword)) ||
-#                        (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
-                        (($wild==2) && (0 === strpos($iword,$xword)))
-
-                      ){
-                        if(!isset($wids[$ixlen])) $wids[$ixlen] = array();
-                        $wids[$ixlen][] = $wid;
-                        $result[$word][] = "$ixlen*$wid";
-                    }
+                foreach(array_keys(preg_grep($ptn,$word_idx)) as $wid){  // preg_grep keeps keys, so they are the word IDs
+                    $wids[$ixlen][] = $wid;
+                    $result[$word][] = "$ixlen*$wid";
                 }
             }
         }else{     // handle exact search
             $word_idx = idx_getIndex('w',$wlen);
             $wid = array_search("$word\n",$word_idx);
             if(is_int($wid)){
-                $wids[$wlen] = array($wid);
+                $wids[$wlen][] = $wid;  // append, keeping IDs already collected for this length
                 $result[$word][] = "$wlen*$wid";
             }else{
                 $result[$word] = array();
             }
         }
     }
+    return $wids;  // length => array(word ID ...)
+}
+function idx_getIndexWordsSorted($words,&$result){
+    // Sorted variant: words are grouped by index length so each word index is loaded only once.
+    // parse and sort tokens
+    $tokens = array();
+    $tokenlength = array();
+    $tokenwild = array();
+    foreach($words as $word){
+        $result[$word] = array();
+        $wild = 0;
+        $xword = $word;
+        $wlen = wordlen($word);
 
-    $docs = array();                          // hold docs found
-    foreach(array_keys($wids) as $wlen){
-        sort($wids[$wlen]);
-        $wids[$wlen] = array_unique($wids[$wlen]);
-
-        // Open index
-        idx_touchIndex('i',$wlen);
-        $idx = fopen($conf['indexdir']."/i$wlen.idx",'r');
-        if(!$idx){
-            msg("Failed to open index file",-1);
-            return false;
+        // check for wildcards
+        if(substr($xword,0,1) == '*'){
+            $xword = substr($xword,1);
+            $wild |= 1;  // bit 0: leading wildcard
+            $wlen -= 1;
         }
-
-        // Walk the index til the lines are found
-        $lno  = 0;
-        $line = '';
-        $ixids =& $wids[$wlen];
-        $srch = array_shift($ixids);               // which word do we look for?
-        while (!feof($idx)) {
-            // read full line
-            $line .= fgets($idx, 4096);
-            if(substr($line,-1) != "\n") continue;
-            if($lno > $srch)             break;   // shouldn't happen
-
-            // do we want this line?
-            if($lno == $srch){
-                // add docs to list
-                $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line);
-
-                $srch = array_shift($ixids);        // next word to look up
-                if($srch == null) break;           // no more words
+        if(substr($xword,-1,1) == '*'){
+            $xword = substr($xword,0,-1);
+            $wild |= 2;  // bit 1: trailing wildcard
+            $wlen -= 1;
+        }
+        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
+        if(!isset($tokens[$xword])){
+            $tokenlength[$wlen][] = $xword;  // first time this base word is seen
+        }
+        if($wild){
+            $ptn = preg_quote($xword,'/');
+            if(($wild&1) == 0) $ptn = '^'.$ptn;  // no leading '*': anchor at the start
+            if(($wild&2) == 0) $ptn = $ptn.'$';  // no trailing '*': anchor at the end
+            $tokens[$xword][] = array($word, '/'.$ptn.'/');
+            if(!isset($tokenwild[$xword])) $tokenwild[$xword] = $wlen;
+        }else
+            $tokens[$xword][] = array($word, null);
+    }
+    asort($tokenwild);  // shortest first, so the wildcard loop below can stop early
+    // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
+    // $tokenlength = array( base word length => array( base word ... ) ... )
+    // $tokenwild = array( base word => base word length ... )
+
+    $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
+    $indexes_known = idx_indexLengths($length_filter);
+    if(!empty($tokenwild)) sort($indexes_known);
+    // get word IDs
+    $wids = array();
+    foreach($indexes_known as $ixlen){
+        $word_idx = idx_getIndex('w',$ixlen);
+        // handle exact search
+        if(isset($tokenlength[$ixlen])){
+            foreach($tokenlength[$ixlen] as $xword){
+                $wid = array_search("$xword\n",$word_idx);
+                if(is_int($wid)){
+                    $wids[$ixlen][] = $wid;
+                    foreach($tokens[$xword] as $w)  // every query word sharing this base word
+                        $result[$w[0]][] = "$ixlen*$wid";
+                }
+            }
+        }
+        // handle wildcard search
+        foreach($tokenwild as $xword => $wlen){
+            if($wlen >= $ixlen) break;  // only indexes longer than the base word are grepped
+            foreach($tokens[$xword] as $w){
+                if(is_null($w[1])) continue;  // exact-only query word, handled above
+                foreach(array_keys(preg_grep($w[1],$word_idx)) as $wid){
+                    $wids[$ixlen][] = $wid;
+                    $result[$w[0]][] = "$ixlen*$wid";
+                }
             }
-
-            $line = ''; // reset line buffer
-            $lno++;     // increase linecounter
         }
-        fclose($idx);
     }
+    return $wids;  // length => array(word ID ...)
+}
 
+/**
+ * Lookup words in index
+ *
+ * Takes an array of word and will return a list of matching
+ * documents for each one.
+ *
+ * Important: No ACL checking is done here! All results are
+ *            returned, regardless of permissions
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function idx_lookup($words){
+    global $conf;
+
+    $result = array();
+
+    if(isset($conf['test_indexer']) && ($conf['test_indexer']&1))
+        $wids = idx_getIndexWordsSorted($words, $result);
+    else
+        $wids = idx_getIndexWordsSimple($words, $result);
+    if(empty($wids)) return array();
+
+    // load known words and documents
+    $page_idx = idx_getIndex('page','');
+    
+    $docs = array();                          // hold docs found
+    foreach(array_keys($wids) as $wlen){
+        $wids[$wlen] = array_unique($wids[$wlen]);
+        $index = idx_getIndex('i',$wlen);
+        foreach($wids[$wlen] as $ixid){
+            if($ixid < count($index))
+                $docs["$wlen*$ixid"] = idx_parseIndexLine($page_idx,$index[$ixid]);
+        }
+    }
 
     // merge found pages into final result array
     $final = array();
@@ -453,8 +552,6 @@ function idx_parseIndexLine(&$page_idx,$line){
  * @param string   $string     the query as given by the user
  * @param arrayref $stopwords  array of stopwords
  * @param boolean  $wc         are wildcards allowed?
- *
- * @todo make combined function to use alone or in getPageWords
  */
 function idx_tokenizer($string,&$stopwords,$wc=false){
     $words = array();
@@ -462,7 +559,7 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
 
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
         // handle asian chars as single words (may fail on older PHP version)
-        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+        $asia = @preg_replace('/('.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')/u',' \1 ',$string);
         if(!is_null($asia)) $string = $asia; //recover from regexp failure
 
         $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
diff --git a/inc/utf8.php b/inc/utf8.php
index e32b64b17..8da981fd9 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -1027,11 +1027,20 @@ $UTF8_SPECIAL_CHARS = array(
   0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
   0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
   0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
-  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
+  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
+  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
+  0x3018, 0x3019, 0x301a, 0x301b, 0x3036, 
+  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
   0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
   0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
   0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
   0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
+          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
+  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
+  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
+  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
+  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
+  0xffeb, 0xffec, 0xffed, 0xffee,
 );
 
 // utf8 version of above data
@@ -1051,8 +1060,12 @@ $UTF8_SPECIAL_CHARS2 =
     '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
     '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
     '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
-    '➷➸➹➺➻➼➽➾�'.
-    '�ﹼﹽ';
+    '➷➸➹➺➻➼➽➾'.
+    '　、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
+    '�'.
+    '�ﹼﹽ'.
+    '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？＠［＼］＾｀｛｜｝～'.
+    '｟｠｡｢｣､･￠￡￢￣￤￥￦￨￩￪￫￬￭￮';
 
 /**
  * Romanization lookup table
diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php
index f5fc07bc3..057ab06f1 100644
--- a/lib/exe/indexer.php
+++ b/lib/exe/indexer.php
@@ -13,6 +13,9 @@ require_once(DOKU_INC.'inc/events.php');
 session_write_close();  //close session
 if(!defined('NL')) define('NL',"\n");
 
+// Version tag saved in each page's .indexed file; a lower saved tag forces a reindex
+define('INDEXER_VERSION', 1);
+
 // keep running after browser closes connection
 @ignore_user_abort(true);
 
@@ -136,10 +139,15 @@ function runIndexer(){
     if(!$ID) return false;
 
     // check if indexing needed
-    $last = @filemtime(metaFN($ID,'.indexed'));
-    if($last > @filemtime(wikiFN($ID))){
-        print "runIndexer(): index for $ID up to date".NL;
-        return false;
+    $idxtag = metaFN($ID,'.indexed');
+    if(@file_exists($idxtag)){
+        if(io_readFile($idxtag) >= INDEXER_VERSION){
+            $last = @filemtime($idxtag);
+            if($last > @filemtime(wikiFN($ID))){
+                print "runIndexer(): index for $ID up to date".NL;
+                return false;
+            }
+        }
     }
 
     // try to aquire a lock
@@ -163,7 +171,7 @@ function runIndexer(){
     idx_addPage($ID);
 
     // we're finished - save and free lock
-    io_saveFile(metaFN($ID,'.indexed'),' ');
+    io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION);
     @rmdir($lock);
     print "runIndexer(): finished".NL;
     return true;
diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php
index 526ac4a0e..24b8db147 100644
--- a/lib/plugins/config/lang/en/lang.php
+++ b/lib/plugins/config/lang/en/lang.php
@@ -180,3 +180,5 @@ $lang['compression_o_0']   = 'none';
 $lang['compression_o_gz']  = 'gzip';
 $lang['compression_o_bz2'] = 'bz2';
 
+/* FIXME: delete when no longer needed */
+$lang['test_indexer'] = 'New Indexer testing bitfield: 0x1 -> sorted searching';
diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php
index 36b2be38b..44b121dfa 100644
--- a/lib/plugins/config/settings/config.metadata.php
+++ b/lib/plugins/config/settings/config.metadata.php
@@ -163,6 +163,7 @@ $meta['rss_type']    = array('multichoice','_choices' => array('rss','rss1','rss
 $meta['rss_linkto']  = array('multichoice','_choices' => array('diff','page','rev','current'));
 $meta['rss_update']  = array('numeric');
 $meta['recent_days'] = array('numeric');
+$meta['test_indexer'] = array('numeric'); // FIXME: delete when no longer needed
 
 $meta['_network']    = array('fieldset');
 $meta['proxy____host'] = array('string','_pattern' => '#^[a-z0-9\-\.+]+?#i');