Merge branch 'tokenizer-rewrite' into michitux

author: Tom N Harris <tnharris@whoopdedo.org> 2010-11-20 14:29:13 -0500
committer: Tom N Harris <tnharris@whoopdedo.org> 2010-11-20 14:29:13 -0500
commit: 3c4b38902b6f6d32222611b22087d5d41d20de6e (patch)
tree: 3d28bbaa015e10283050b6512a4f49e53323b8fc /inc/indexer.php
parent: 4753bcc0e2fd9417e885e128e8c9ab4bfc566c32 (diff)
parent: 420edfd639fb3d0a0f6a2504ecb2f8f6b68be1f7 (diff)
download: rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.gz
rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.bz2
1 files changed, 64 insertions, 48 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index 954512673..d4432026e 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -8,6 +8,9 @@
 
 if(!defined('DOKU_INC')) die('meh.');
 
+// Version tag used to force rebuild on upgrade
+define('INDEXER_VERSION', 2);
+
 // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
 if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
 
@@ -43,6 +46,20 @@ define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two charact
 define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
 
 /**
+ * Version of the indexer taking into consideration the external tokenizer.
+ * The indexer is only compatible with data written by the same version.
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_get_version(){
+    global $conf;
+    if($conf['external_tokenizer'])
+        return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']);
+    else
+        return INDEXER_VERSION;
+}
+
+/**
  * Measure the length of a string.
  * Differs from strlen in handling of asian characters.
  *
@@ -52,8 +69,10 @@ function wordlen($w){
     $l = strlen($w);
     // If left alone, all chinese "words" will get put into w3.idx
     // So the "length" of a "word" is faked
-    if(preg_match('/'.IDX_ASIAN2.'/u',$w))
-        $l += ord($w) - 0xE1;  // Lead bytes from 0xE2-0xEF
+    if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
+        foreach($leadbytes[0] as $b)
+            $l += ord($b) - 0xE1;
+    }
     return $l;
 }
 
@@ -102,22 +121,6 @@ function idx_getIndex($pre, $wlen){
 }
 
 /**
- * Create an empty index file if it doesn't exist yet.
- *
- * FIXME: This function isn't currently used. It will probably be removed soon.
- *
- * @author Tom N Harris <tnharris@whoopdedo.org>
- */
-function idx_touchIndex($pre, $wlen){
-    global $conf;
-    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
-    if(!@file_exists($fn)){
-        touch($fn);
-        if($conf['fperm']) chmod($fn, $conf['fperm']);
-    }
-}
-
-/**
  * Read a line ending with \n.
  * Returns false on EOF.
  *
@@ -215,8 +218,7 @@ function idx_getPageWords($page){
 
     list($page,$body) = $data;
 
-    $body   = strtr($body, "\r\n\t", '   ');
-    $tokens = explode(' ', $body);
+    $tokens = idx_tokenizer($body, $stopwords);
     $tokens = array_count_values($tokens);   // count the frequency of each token
 
     // ensure the deaccented or romanised page names of internal links are added to the token array
@@ -237,16 +239,12 @@ function idx_getPageWords($page){
     }
 
     $words = array();
-    foreach ($tokens as $word => $count) {
-        $arr = idx_tokenizer($word,$stopwords);
-        $arr = array_count_values($arr);
-        foreach ($arr as $w => $c) {
-            $l = wordlen($w);
-            if(isset($words[$l])){
-                $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
-            }else{
-                $words[$l] = array($w => $c * $count);
-            }
+    foreach ($tokens as $w => $c) {
+        $l = wordlen($w);
+        if(isset($words[$l])){
+            $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
+        }else{
+            $words[$l] = array($w => $c);
         }
     }
 
@@ -480,7 +478,7 @@ function idx_indexLengths(&$filter){
     } else {
         $lengths = idx_listIndexLengths();
         foreach ( $lengths as $key => $length) {
-            // we keep all the values equal or superior 
+            // we keep all the values equal or superior
             if ((int)$length >= (int)$filter) {
                 $idx[] = $length;
             }
@@ -663,33 +661,51 @@ function idx_parseIndexLine(&$page_idx,$line){
  * Tokenizes a string into an array of search words
  *
  * Uses the same algorithm as idx_getPageWords()
+ * Takes an arbitrarily complex string and returns a list of words
+ * suitable for indexing. The string may include spaces and line
+ * breaks
  *
  * @param string   $string     the query as given by the user
  * @param arrayref $stopwords  array of stopwords
  * @param boolean  $wc         are wildcards allowed?
+ * @return array               list of indexable words
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ * @author Andreas Gohr <andi@splitbrain.org>
  */
 function idx_tokenizer($string,&$stopwords,$wc=false){
+    global $conf;
     $words = array();
     $wc = ($wc) ? '' : $wc = '\*';
 
-    if(preg_match('/[^0-9A-Za-z]/u', $string)){
-        // handle asian chars as single words (may fail on older PHP version)
-        $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
-        if(!is_null($asia)) $string = $asia; //recover from regexp failure
-
-        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
-        foreach ($arr as $w) {
-            if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
-            $w = utf8_strtolower($w);
-            if($stopwords && is_int(array_search("$w\n",$stopwords))) continue;
-            $words[] = $w;
+    if (!$stopwords)
+        $sw = array();
+    else
+        $sw =& $stopwords;
+
+    if ($conf['external_tokenizer']) {
+        if (0 == io_exec($conf['tokenizer_cmd'], $string, $output))
+            $string = $output;
+    } else {
+        if(preg_match('/[^0-9A-Za-z ]/u', $string)) {
+            // handle asian chars as single words (may fail on older PHP version)
+            $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
+            if(!is_null($asia)) $string = $asia; //recover from regexp failure
         }
-    }else{
-        $w = $string;
-        if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) return $words;
-        $w = strtolower($w);
-        if(is_int(array_search("$w\n",$stopwords))) return $words;
-        $words[] = $w;
+    }
+    $string = strtr($string, "\r\n\t", '   ');
+    if(preg_match('/[^0-9A-Za-z ]/u', $string))
+        $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc);
+
+    $wordlist = explode(' ', $string);
+    foreach ($wordlist as $word) {
+        if(preg_match('/[^0-9A-Za-z]/u', $word)){
+            $word = utf8_strtolower($word);
+        }else{
+            $word = strtolower($word);
+        }
+        if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
+        if(is_int(array_search("$word\n",$stopwords))) continue;
+        $words[] = $word;
     }
 
     return $words;
author	Tom N Harris <tnharris@whoopdedo.org>	2010-11-20 14:29:13 -0500
committer	Tom N Harris <tnharris@whoopdedo.org>	2010-11-20 14:29:13 -0500
commit	3c4b38902b6f6d32222611b22087d5d41d20de6e (patch)
tree	3d28bbaa015e10283050b6512a4f49e53323b8fc /inc/indexer.php
parent	4753bcc0e2fd9417e885e128e8c9ab4bfc566c32 (diff)
parent	420edfd639fb3d0a0f6a2504ecb2f8f6b68be1f7 (diff)
download	rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.gz rpg-3c4b38902b6f6d32222611b22087d5d41d20de6e.tar.bz2