diff options
-rw-r--r-- | inc/fulltext.php | 6 | ||||
-rw-r--r-- | inc/geshi.php | 20 | ||||
-rw-r--r-- | inc/indexer.php | 69 | ||||
-rw-r--r-- | inc/pageutils.php | 2 | ||||
-rw-r--r-- | inc/utf8.php | 4 |
5 files changed, 75 insertions, 26 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index f48250548..34520f0c5 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -19,7 +19,7 @@ function ft_pageSearch($query,&$poswords){ $q = ft_queryParser($query); // use this for higlighting later: - $poswords = join(' ',$q['and']); + $poswords = str_replace('*','',join(' ',$q['and'])); // lookup all words found in the query $words = array_merge($q['and'],$q['not']); @@ -268,7 +268,7 @@ function ft_queryParser($query){ $words = explode(' ',$query); foreach($words as $w){ if($w{0} == '-'){ - $token = idx_tokenizer($w,$stopwords); + $token = idx_tokenizer($w,$stopwords,true); if(count($token)) $q['not'] = array_merge($q['not'],$token); }else{ // asian "words" need to be searched as phrases @@ -276,7 +276,7 @@ function ft_queryParser($query){ $q['phrases'] = array_merge($q['phrases'],$matches[1]); } - $token = idx_tokenizer($w,$stopwords); + $token = idx_tokenizer($w,$stopwords,true); if(count($token)) $q['and'] = array_merge($q['and'],$token); } } diff --git a/inc/geshi.php b/inc/geshi.php index 69846eea4..4319afa02 100644 --- a/inc/geshi.php +++ b/inc/geshi.php @@ -28,7 +28,7 @@ * @author Nigel McNie <nigel@geshi.org> * @copyright Copyright © 2004, 2005, Nigel McNie * @license http://gnu.org/copyleft/gpl.html GNU GPL - * @version $Id: geshi.php,v 1.19 2005/10/22 07:52:59 oracleshinoda Exp $ + * @version $Id: geshi.php,v 1.23 2005/11/19 02:23:37 oracleshinoda Exp $ * */ @@ -40,7 +40,7 @@ // /** The version of this GeSHi file */ -define('GESHI_VERSION', '1.0.7.4'); +define('GESHI_VERSION', '1.0.7.5'); /** For the future (though this may never be realised) */ define('GESHI_OUTPUT_HTML', 0); @@ -286,6 +286,7 @@ class GeSHi /** * Whether important blocks should be recognised or not * @var boolean + * @deprecated * @todo REMOVE THIS FUNCTIONALITY! */ var $enable_important_blocks = false; @@ -293,6 +294,7 @@ class GeSHi /** * Styles for important parts of the code * @var string + * @deprecated * @todo As above - rethink the whole idea of important blocks as it is buggy and * will be hard to implement in 1.2 */ @@ -1305,6 +1307,7 @@ class GeSHi * Sets whether context-important blocks are highlighted * * @todo REMOVE THIS SHIZ FROM GESHI! + * @deprecated */ function enable_important_blocks ( $flag ) { @@ -1590,6 +1593,12 @@ class GeSHi $attributes = ' class="es0"'; } $char = "<span$attributes>" . $char; + if (substr($code, $i + 1, 1) == "\n") { + // escaping a newline, what's the point in putting the span around + // the newline? It only causes hassles when inserting line numbers + $char .= '</span>'; + $ESCAPE_CHAR_OPEN = false; + } } } else { $ESCAPE_CHAR_OPEN = false; @@ -1897,7 +1906,7 @@ class GeSHi */ function add_url_to_keyword ($keyword, $group, $start_or_end) { - if (isset($this->language_data['URLS'][$group]) && + if (isset($this->language_data['URLS'][$group]) && $this->language_data['URLS'][$group] != '' && substr($keyword, 0, 5) != '</') { // There is a base group for this keyword @@ -1915,7 +1924,8 @@ class GeSHi ) . '">'; } return ''; - } else { + // HTML fix. Again, dirty hackage... + } elseif (!($this->language == 'html4strict' && '>' == $keyword)) { return '</a>'; } } @@ -2405,7 +2415,7 @@ class GeSHi if ($this->use_classes) { $attr = ' class="foot"'; } else { - $attr = " style=\"{$this->footer_content_style}\">"; + $attr = " style=\"{$this->footer_content_style}\""; } return "<div$attr>$footer</div>"; } diff --git a/inc/indexer.php b/inc/indexer.php index 22bd8566b..915f26938 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -55,7 +55,7 @@ function idx_getPageWords($page){ // handle asian chars as single words (may fail on older PHP version) $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word); if(!is_null($asia)) $word = $asia; //recover from regexp failure - $arr = explode(' ', utf8_stripspecials($word,' ','._\-:')); + $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*')); $arr = array_count_values($arr); foreach ($arr as $w => $c) { @@ -231,12 +231,41 @@ function idx_lookup($words){ // get word IDs $wids = array(); foreach($words as $word){ - $wid = array_search("$word\n",$word_idx); - if(is_int($wid)){ - $wids[] = $wid; - $result[$word] = $wid; - }else{ - $result[$word] = array(); + $result[$word] = array(); + $wild = 0; + $xword = $word; + + // check for wildcards + if(substr($xword,0,1) == '*'){ + $xword = substr($xword,1); + $wild = 1; + } + if(substr($xword,-1,1) == '*'){ + $xword = substr($xword,0,-1); + $wild += 2; + } + + // look for the ID(s) for the given word + if($wild){ // handle wildcard search + $cnt = count($word_idx); + for($wid=0; $wid<$cnt; $wid++){ + $iword = $word_idx[$wid]; + if( (($wild==3) && is_int(strpos($iword,$xword))) || + (($wild==1) && ("$xword\n" == substr($iword,(-1*strlen($xword))-1))) || + (($wild==2) && ($xword == substr($iword,0,strlen($xword)))) + ){ + $wids[] = $wid; + $result[$word][] = $wid; + } + } + }else{ // handle exact search + $wid = array_search("$word\n",$word_idx); + if(is_int($wid)){ + $wids[] = $wid; + $result[$word][] = $wid; + }else{ + $result[$word] = array(); + } } } sort($wids); @@ -245,7 +274,7 @@ function idx_lookup($words){ // Open index $idx = fopen($conf['cachedir'].'/index.idx','r'); if(!$idx){ - msg("Failed to open index files",-1); + msg("Failed to open index file",-1); return false; } @@ -275,14 +304,19 @@ function idx_lookup($words){ } fclose($idx); - // merge found pages into result array + + // merge found pages into final result array + $final = array(); foreach(array_keys($result) as $word){ - if(is_int($result[$word])){ - $result[$word] = $docs[$result[$word]]; + $final[$word] = array(); + foreach($result[$word] as $wid){ + $hits = &$docs[$wid]; + foreach ($hits as $hitkey => $hitcnt) { + $final[$word][$hitkey] = $hitcnt + $final[$word][$hitkey]; + } } } - - return $result; + return $final; } /** @@ -321,17 +355,22 @@ function idx_parseIndexLine(&$page_idx,$line){ * * Uses the same algorithm as idx_getPageWords() * + * @param string $string the query as given by the user + * @param arrayref $stopwords array of stopwords + * @param boolean $wc are wildcards allowed? + * * @todo make combined function to use alone or in getPageWords */ -function idx_tokenizer($string,&$stopwords){ +function idx_tokenizer($string,&$stopwords,$wc=false){ $words = array(); + if(!$wc) $wc = '\*'; if(preg_match('/[^0-9A-Za-z]/u', $string)){ // handle asian chars as single words (may fail on older PHP version) $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string); if(!is_null($asia)) $string = $asia; //recover from regexp failure - $arr = explode(' ', utf8_stripspecials($string,' ','._\-:')); + $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'.$wc)); foreach ($arr as $w) { if (!is_numeric($w) && strlen($w) < 3) continue; $w = utf8_strtolower($w); diff --git a/inc/pageutils.php b/inc/pageutils.php index 0f9b47e47..a6432619d 100644 --- a/inc/pageutils.php +++ b/inc/pageutils.php @@ -89,7 +89,7 @@ function cleanID($id){ if($conf['deaccent']) $id = utf8_deaccent($id,-1); //remove specials - $id = utf8_stripspecials($id,$sepchar); + $id = utf8_stripspecials($id,$sepchar,'\*'); //clean up $id = preg_replace($sepcharpat,$sepchar,$id); diff --git a/inc/utf8.php b/inc/utf8.php index 64100f658..46d30f85d 100644 --- a/inc/utf8.php +++ b/inc/utf8.php @@ -581,14 +581,14 @@ $UTF8_UPPER_ACCENTS = array( * chars. * * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! - * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d + * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a * * @author Andreas Gohr <andi@splitbrain.org> * @see utf8_stripspecials() */ $UTF8_SPECIAL_CHARS = array( 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, - 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, + 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, |