summaryrefslogtreecommitdiff
path: root/inc
diff options
context:
space:
mode:
author: Andreas Gohr <andi@splitbrain.org> 2005-11-27 19:07:23 +0100
committer: Andreas Gohr <andi@splitbrain.org> 2005-11-27 19:07:23 +0100
commit: ad81d4312bb97f5d88438632707e8c4d561a3914 (patch)
tree: c528e3213006c411375a48eb5cea833ccb4f9902 /inc
parent: 519b3173fdfbf6418e4d1d8df87ec70dac7ffc60 (diff)
download: rpg-ad81d4312bb97f5d88438632707e8c4d561a3914.tar.gz
download: rpg-ad81d4312bb97f5d88438632707e8c4d561a3914.tar.bz2
Wildcard search added #552 #632
Searching for word parts is now possible by appending or prepending a * character to the search word: 'foo*' searches for words beginning with 'foo', e.g. 'foobar'; '*foo' looks for words ending in 'foo', e.g. 'barfoo'; '*foo*' matches anything with 'foo' in it, e.g. 'barfoobaz'. darcs-hash:20051127180723-7ad00-1eb29e812ddaf38d9812697bb1cffffe9a5fb330.gz
Diffstat (limited to 'inc')
-rw-r--r-- inc/fulltext.php 6
-rw-r--r-- inc/geshi.php 20
-rw-r--r-- inc/indexer.php 69
-rw-r--r-- inc/pageutils.php 2
-rw-r--r-- inc/utf8.php 4
5 files changed, 75 insertions, 26 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index f48250548..34520f0c5 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -19,7 +19,7 @@
function ft_pageSearch($query,&$poswords){
$q = ft_queryParser($query);
// use this for higlighting later:
- $poswords = join(' ',$q['and']);
+ $poswords = str_replace('*','',join(' ',$q['and']));
// lookup all words found in the query
$words = array_merge($q['and'],$q['not']);
@@ -268,7 +268,7 @@ function ft_queryParser($query){
$words = explode(' ',$query);
foreach($words as $w){
if($w{0} == '-'){
- $token = idx_tokenizer($w,$stopwords);
+ $token = idx_tokenizer($w,$stopwords,true);
if(count($token)) $q['not'] = array_merge($q['not'],$token);
}else{
// asian "words" need to be searched as phrases
@@ -276,7 +276,7 @@ function ft_queryParser($query){
$q['phrases'] = array_merge($q['phrases'],$matches[1]);
}
- $token = idx_tokenizer($w,$stopwords);
+ $token = idx_tokenizer($w,$stopwords,true);
if(count($token)) $q['and'] = array_merge($q['and'],$token);
}
}
diff --git a/inc/geshi.php b/inc/geshi.php
index 69846eea4..4319afa02 100644
--- a/inc/geshi.php
+++ b/inc/geshi.php
@@ -28,7 +28,7 @@
* @author Nigel McNie <nigel@geshi.org>
* @copyright Copyright &copy; 2004, 2005, Nigel McNie
* @license http://gnu.org/copyleft/gpl.html GNU GPL
- * @version $Id: geshi.php,v 1.19 2005/10/22 07:52:59 oracleshinoda Exp $
+ * @version $Id: geshi.php,v 1.23 2005/11/19 02:23:37 oracleshinoda Exp $
*
*/
@@ -40,7 +40,7 @@
//
/** The version of this GeSHi file */
-define('GESHI_VERSION', '1.0.7.4');
+define('GESHI_VERSION', '1.0.7.5');
/** For the future (though this may never be realised) */
define('GESHI_OUTPUT_HTML', 0);
@@ -286,6 +286,7 @@ class GeSHi
/**
* Whether important blocks should be recognised or not
* @var boolean
+ * @deprecated
* @todo REMOVE THIS FUNCTIONALITY!
*/
var $enable_important_blocks = false;
@@ -293,6 +294,7 @@ class GeSHi
/**
* Styles for important parts of the code
* @var string
+ * @deprecated
* @todo As above - rethink the whole idea of important blocks as it is buggy and
* will be hard to implement in 1.2
*/
@@ -1305,6 +1307,7 @@ class GeSHi
* Sets whether context-important blocks are highlighted
*
* @todo REMOVE THIS SHIZ FROM GESHI!
+ * @deprecated
*/
function enable_important_blocks ( $flag )
{
@@ -1590,6 +1593,12 @@ class GeSHi
$attributes = ' class="es0"';
}
$char = "<span$attributes>" . $char;
+ if (substr($code, $i + 1, 1) == "\n") {
+ // escaping a newline, what's the point in putting the span around
+ // the newline? It only causes hassles when inserting line numbers
+ $char .= '</span>';
+ $ESCAPE_CHAR_OPEN = false;
+ }
}
} else {
$ESCAPE_CHAR_OPEN = false;
@@ -1897,7 +1906,7 @@ class GeSHi
*/
function add_url_to_keyword ($keyword, $group, $start_or_end)
{
- if (isset($this->language_data['URLS'][$group]) &&
+ if (isset($this->language_data['URLS'][$group]) &&
$this->language_data['URLS'][$group] != '' &&
substr($keyword, 0, 5) != '&lt;/') {
// There is a base group for this keyword
@@ -1915,7 +1924,8 @@ class GeSHi
) . '">';
}
return '';
- } else {
+ // HTML fix. Again, dirty hackage...
+ } elseif (!($this->language == 'html4strict' && '&gt;' == $keyword)) {
return '</a>';
}
}
@@ -2405,7 +2415,7 @@ class GeSHi
if ($this->use_classes) {
$attr = ' class="foot"';
} else {
- $attr = " style=\"{$this->footer_content_style}\">";
+ $attr = " style=\"{$this->footer_content_style}\"";
}
return "<div$attr>$footer</div>";
}
diff --git a/inc/indexer.php b/inc/indexer.php
index 22bd8566b..915f26938 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -55,7 +55,7 @@ function idx_getPageWords($page){
// handle asian chars as single words (may fail on older PHP version)
$asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
if(!is_null($asia)) $word = $asia; //recover from regexp failure
- $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
+ $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
$arr = array_count_values($arr);
foreach ($arr as $w => $c) {
@@ -231,12 +231,41 @@ function idx_lookup($words){
// get word IDs
$wids = array();
foreach($words as $word){
- $wid = array_search("$word\n",$word_idx);
- if(is_int($wid)){
- $wids[] = $wid;
- $result[$word] = $wid;
- }else{
- $result[$word] = array();
+ $result[$word] = array();
+ $wild = 0;
+ $xword = $word;
+
+ // check for wildcards
+ if(substr($xword,0,1) == '*'){
+ $xword = substr($xword,1);
+ $wild = 1;
+ }
+ if(substr($xword,-1,1) == '*'){
+ $xword = substr($xword,0,-1);
+ $wild += 2;
+ }
+
+ // look for the ID(s) for the given word
+ if($wild){ // handle wildcard search
+ $cnt = count($word_idx);
+ for($wid=0; $wid<$cnt; $wid++){
+ $iword = $word_idx[$wid];
+ if( (($wild==3) && is_int(strpos($iword,$xword))) ||
+ (($wild==1) && ("$xword\n" == substr($iword,(-1*strlen($xword))-1))) ||
+ (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+ ){
+ $wids[] = $wid;
+ $result[$word][] = $wid;
+ }
+ }
+ }else{ // handle exact search
+ $wid = array_search("$word\n",$word_idx);
+ if(is_int($wid)){
+ $wids[] = $wid;
+ $result[$word][] = $wid;
+ }else{
+ $result[$word] = array();
+ }
}
}
sort($wids);
@@ -245,7 +274,7 @@ function idx_lookup($words){
// Open index
$idx = fopen($conf['cachedir'].'/index.idx','r');
if(!$idx){
- msg("Failed to open index files",-1);
+ msg("Failed to open index file",-1);
return false;
}
@@ -275,14 +304,19 @@ function idx_lookup($words){
}
fclose($idx);
- // merge found pages into result array
+
+ // merge found pages into final result array
+ $final = array();
foreach(array_keys($result) as $word){
- if(is_int($result[$word])){
- $result[$word] = $docs[$result[$word]];
+ $final[$word] = array();
+ foreach($result[$word] as $wid){
+ $hits = &$docs[$wid];
+ foreach ($hits as $hitkey => $hitcnt) {
+ $final[$word][$hitkey] = $hitcnt + $final[$word][$hitkey];
+ }
}
}
-
- return $result;
+ return $final;
}
/**
@@ -321,17 +355,22 @@ function idx_parseIndexLine(&$page_idx,$line){
*
* Uses the same algorithm as idx_getPageWords()
*
+ * @param string $string the query as given by the user
+ * @param arrayref $stopwords array of stopwords
+ * @param boolean $wc are wildcards allowed?
+ *
* @todo make combined function to use alone or in getPageWords
*/
-function idx_tokenizer($string,&$stopwords){
+function idx_tokenizer($string,&$stopwords,$wc=false){
$words = array();
+ if(!$wc) $wc = '\*';
if(preg_match('/[^0-9A-Za-z]/u', $string)){
// handle asian chars as single words (may fail on older PHP version)
$asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
if(!is_null($asia)) $string = $asia; //recover from regexp failure
- $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
+ $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'.$wc));
foreach ($arr as $w) {
if (!is_numeric($w) && strlen($w) < 3) continue;
$w = utf8_strtolower($w);
diff --git a/inc/pageutils.php b/inc/pageutils.php
index 0f9b47e47..a6432619d 100644
--- a/inc/pageutils.php
+++ b/inc/pageutils.php
@@ -89,7 +89,7 @@ function cleanID($id){
if($conf['deaccent']) $id = utf8_deaccent($id,-1);
//remove specials
- $id = utf8_stripspecials($id,$sepchar);
+ $id = utf8_stripspecials($id,$sepchar,'\*');
//clean up
$id = preg_replace($sepcharpat,$sepchar,$id);
diff --git a/inc/utf8.php b/inc/utf8.php
index 64100f658..46d30f85d 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -581,14 +581,14 @@ $UTF8_UPPER_ACCENTS = array(
* chars.
*
* The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
- * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d
+ * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
*
* @author Andreas Gohr <andi@splitbrain.org>
* @see utf8_stripspecials()
*/
$UTF8_SPECIAL_CHARS = array(
0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
- 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
+ 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c,
0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,