Wildcard search added (#552, #632)

Searching for word parts is now possible by appending or prepending a * character to the search word: 'foo*' searches for words beginning with 'foo', e.g. 'foobar'; '*foo' looks for words ending in 'foo', e.g. 'barfoo'; '*foo*' matches anything with 'foo' in it, e.g. 'barfoobaz'. darcs-hash:20051127180723-7ad00-1eb29e812ddaf38d9812697bb1cffffe9a5fb330.gz
author: Andreas Gohr <andi@splitbrain.org> 2005-11-27 19:07:23 +0100
committer: Andreas Gohr <andi@splitbrain.org> 2005-11-27 19:07:23 +0100
commit: ad81d4312bb97f5d88438632707e8c4d561a3914 (patch)
tree: c528e3213006c411375a48eb5cea833ccb4f9902 /inc
parent: 519b3173fdfbf6418e4d1d8df87ec70dac7ffc60 (diff)
download: rpg-ad81d4312bb97f5d88438632707e8c4d561a3914.tar.gz
rpg-ad81d4312bb97f5d88438632707e8c4d561a3914.tar.bz2
5 files changed, 75 insertions, 26 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index f48250548..34520f0c5 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -19,7 +19,7 @@
 function ft_pageSearch($query,&$poswords){
     $q = ft_queryParser($query);
     // use this for higlighting later:
-    $poswords = join(' ',$q['and']);
+    $poswords = str_replace('*','',join(' ',$q['and']));
 
     // lookup all words found in the query
     $words  = array_merge($q['and'],$q['not']);
@@ -268,7 +268,7 @@ function ft_queryParser($query){
     $words = explode(' ',$query);
     foreach($words as $w){
         if($w{0} == '-'){
-            $token = idx_tokenizer($w,$stopwords);
+            $token = idx_tokenizer($w,$stopwords,true);
             if(count($token)) $q['not'] = array_merge($q['not'],$token);
         }else{
             // asian "words" need to be searched as phrases
@@ -276,7 +276,7 @@ function ft_queryParser($query){
                 $q['phrases'] = array_merge($q['phrases'],$matches[1]);
 
             }
-            $token = idx_tokenizer($w,$stopwords);
+            $token = idx_tokenizer($w,$stopwords,true);
             if(count($token)) $q['and'] = array_merge($q['and'],$token);
         }
     }
diff --git a/inc/geshi.php b/inc/geshi.php
index 69846eea4..4319afa02 100644
--- a/inc/geshi.php
+++ b/inc/geshi.php
@@ -28,7 +28,7 @@
  * @author    Nigel McNie <nigel@geshi.org>
  * @copyright Copyright &copy; 2004, 2005, Nigel McNie
  * @license   http://gnu.org/copyleft/gpl.html GNU GPL
- * @version   $Id: geshi.php,v 1.19 2005/10/22 07:52:59 oracleshinoda Exp $
+ * @version   $Id: geshi.php,v 1.23 2005/11/19 02:23:37 oracleshinoda Exp $
  *
  */
 
@@ -40,7 +40,7 @@
 //
 
 /** The version of this GeSHi file */
-define('GESHI_VERSION', '1.0.7.4');
+define('GESHI_VERSION', '1.0.7.5');
 
 /** For the future (though this may never be realised) */
 define('GESHI_OUTPUT_HTML', 0);
@@ -286,6 +286,7 @@ class GeSHi
     /**
      * Whether important blocks should be recognised or not
      * @var boolean
+     * @deprecated
      * @todo REMOVE THIS FUNCTIONALITY!
      */
 	var $enable_important_blocks = false;
@@ -293,6 +294,7 @@ class GeSHi
     /**
      * Styles for important parts of the code
      * @var string
+     * @deprecated
      * @todo As above - rethink the whole idea of important blocks as it is buggy and
      * will be hard to implement in 1.2
      */
@@ -1305,6 +1307,7 @@ class GeSHi
 	 * Sets whether context-important blocks are highlighted
      * 
      * @todo REMOVE THIS SHIZ FROM GESHI!
+     * @deprecated
 	 */
 	function enable_important_blocks ( $flag )
 	{
@@ -1590,6 +1593,12 @@ class GeSHi
 										$attributes = ' class="es0"';
 									}
 									$char = "<span$attributes>" . $char;
+                                    if (substr($code, $i + 1, 1) == "\n") {
+                                        // escaping a newline, what's the point in putting the span around
+                                        // the newline? It only causes hassles when inserting line numbers
+                                        $char .= '</span>';
+                                        $ESCAPE_CHAR_OPEN = false;
+                                    }
 								}
 							} else {
 								$ESCAPE_CHAR_OPEN = false;
@@ -1897,7 +1906,7 @@ class GeSHi
 	 */
 	function add_url_to_keyword ($keyword, $group, $start_or_end)
 	{
-		if (isset($this->language_data['URLS'][$group]) &&
+        if (isset($this->language_data['URLS'][$group]) &&
             $this->language_data['URLS'][$group] != '' &&
             substr($keyword, 0, 5) != '&lt;/') {
 			// There is a base group for this keyword
@@ -1915,7 +1924,8 @@ class GeSHi
                         ) . '">';
 				}
 				return '';
-			} else {
+            // HTML fix. Again, dirty hackage...
+			} elseif (!($this->language == 'html4strict' && '&gt;' == $keyword)) {
 				return '</a>';
 			}
 		}
@@ -2405,7 +2415,7 @@ class GeSHi
 			if ($this->use_classes) {
 				$attr = ' class="foot"';
 			} else {
-				$attr = " style=\"{$this->footer_content_style}\">";
+				$attr = " style=\"{$this->footer_content_style}\"";
 			}
 			return "<div$attr>$footer</div>";
 		}
diff --git a/inc/indexer.php b/inc/indexer.php
index 22bd8566b..915f26938 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -55,7 +55,7 @@ function idx_getPageWords($page){
             // handle asian chars as single words (may fail on older PHP version)
             $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
             if(!is_null($asia)) $word = $asia; //recover from regexp failure
-            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
+            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
             $arr = array_count_values($arr);
             
             foreach ($arr as $w => $c) {
@@ -231,12 +231,41 @@ function idx_lookup($words){
     // get word IDs
     $wids = array();
     foreach($words as $word){
-        $wid = array_search("$word\n",$word_idx);
-        if(is_int($wid)){
-            $wids[] = $wid;
-            $result[$word] = $wid;
-        }else{
-            $result[$word] = array();
+        $result[$word] = array();
+        $wild = 0;
+        $xword = $word; 
+
+        // check for wildcards
+        if(substr($xword,0,1) == '*'){
+            $xword = substr($xword,1);
+            $wild  = 1;
+        }
+        if(substr($xword,-1,1) == '*'){
+            $xword = substr($xword,0,-1);
+            $wild += 2;
+        }
+ 
+        // look for the ID(s) for the given word
+        if($wild){  // handle wildcard search
+            $cnt = count($word_idx);
+            for($wid=0; $wid<$cnt; $wid++){
+                $iword = $word_idx[$wid];
+                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
+                    (($wild==1) && ("$xword\n" == substr($iword,(-1*strlen($xword))-1))) ||
+                    (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+                  ){
+                    $wids[] = $wid;
+                    $result[$word][] = $wid;
+                }
+            }
+        }else{     // handle exact search
+            $wid = array_search("$word\n",$word_idx);
+            if(is_int($wid)){
+                $wids[] = $wid;
+                $result[$word][] = $wid;
+            }else{
+                $result[$word] = array();
+            }
         }
     }
     sort($wids);
@@ -245,7 +274,7 @@ function idx_lookup($words){
     // Open index
     $idx = fopen($conf['cachedir'].'/index.idx','r');
     if(!$idx){
-       msg("Failed to open index files",-1);
+       msg("Failed to open index file",-1);
        return false;
     } 
 
@@ -275,14 +304,19 @@ function idx_lookup($words){
     }
     fclose($idx);
 
-    // merge found pages into result array
+
+    // merge found pages into final result array
+    $final = array();
     foreach(array_keys($result) as $word){
-        if(is_int($result[$word])){
-            $result[$word] = $docs[$result[$word]];
+        $final[$word] = array();
+        foreach($result[$word] as $wid){
+            $hits = &$docs[$wid];
+            foreach ($hits as $hitkey => $hitcnt) {
+                $final[$word][$hitkey] = $hitcnt + $final[$word][$hitkey];
+            }
         }
     }
-
-    return $result;
+    return $final;
 }
 
 /**
@@ -321,17 +355,22 @@ function idx_parseIndexLine(&$page_idx,$line){
  *
  * Uses the same algorithm as idx_getPageWords()
  *
+ * @param string   $string     the query as given by the user
+ * @param arrayref $stopwords  array of stopwords
+ * @param boolean  $wc         are wildcards allowed?
+ * 
  * @todo make combined function to use alone or in getPageWords
  */
-function idx_tokenizer($string,&$stopwords){
+function idx_tokenizer($string,&$stopwords,$wc=false){
     $words = array();
+    if(!$wc) $wc = '\*';
 
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
         // handle asian chars as single words (may fail on older PHP version)
         $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
         if(!is_null($asia)) $string = $asia; //recover from regexp failure
 
-        $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
+        $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'.$wc));
         foreach ($arr as $w) {
             if (!is_numeric($w) && strlen($w) < 3) continue;
             $w = utf8_strtolower($w);
diff --git a/inc/pageutils.php b/inc/pageutils.php
index 0f9b47e47..a6432619d 100644
--- a/inc/pageutils.php
+++ b/inc/pageutils.php
@@ -89,7 +89,7 @@ function cleanID($id){
   if($conf['deaccent']) $id = utf8_deaccent($id,-1);
 
   //remove specials
-  $id = utf8_stripspecials($id,$sepchar);
+  $id = utf8_stripspecials($id,$sepchar,'\*');
 
   //clean up
   $id = preg_replace($sepcharpat,$sepchar,$id);
diff --git a/inc/utf8.php b/inc/utf8.php
index 64100f658..46d30f85d 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -581,14 +581,14 @@ $UTF8_UPPER_ACCENTS = array(
  * chars.
  *
  * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
- * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d
+ * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @see    utf8_stripspecials()
  */
 $UTF8_SPECIAL_CHARS = array(
   0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
-  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
+  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
           0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
   0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
   0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
author	Andreas Gohr <andi@splitbrain.org>	2005-11-27 19:07:23 +0100
committer	Andreas Gohr <andi@splitbrain.org>	2005-11-27 19:07:23 +0100
commit	ad81d4312bb97f5d88438632707e8c4d561a3914 (patch)
tree	c528e3213006c411375a48eb5cea833ccb4f9902 /inc
parent	519b3173fdfbf6418e4d1d8df87ec70dac7ffc60 (diff)
download	rpg-ad81d4312bb97f5d88438632707e8c4d561a3914.tar.gz rpg-ad81d4312bb97f5d88438632707e8c4d561a3914.tar.bz2