summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--inc/fulltext.php79
-rw-r--r--inc/indexer.php9
-rw-r--r--inc/utf8.php1
3 files changed, 58 insertions, 31 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index b9450c172..fa3ec05d2 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -255,7 +255,6 @@ switch ($algorithm) {
break;
case 'opt2' :
- default :
// option 2 ... CS 2006-08-25
// above + reduce amount of the file searched
$match = array();
@@ -311,15 +310,22 @@ switch ($algorithm) {
break;
case 'utf8':
+ default :
+
$match = array();
$snippets = array();
- $utf8_offset = $offset = 0;
+ $utf8_offset = $offset = $end = 0;
$len = utf8_strlen($text);
+
for ($cnt=3; $cnt--;) {
if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
list($str,$idx) = $match[0];
+ // is it ok to use utf8_substr() -- see bug #891,
+ // check idx against (2^16)-1 - 400 (100x4 byte utf-8 characters)
+ if ($idx <= 65135) {
+
// convert $idx (a byte offset) into a utf8 character offset
$utf8_idx = utf8_strlen(substr($text,0,$idx));
$utf8_len = utf8_strlen($str);
@@ -328,39 +334,54 @@ switch ($algorithm) {
// first look to see if we can go 100 either side,
// then drop to 50 adding any excess if the other side can't go to 50,
// NOTE: these are byte adjustments and will have to be corrected for utf-8
- $pre = min($utf8_idx-$utf8_offset,100);
- $post = min($len-$utf8_idx-$utf8_len,100);
+ $pre = min($utf8_idx-$utf8_offset,100);
+ $post = min($len-$utf8_idx-$utf8_len,100);
+
+ if ($pre>50 && $post>50) {
+ $pre = $post = 50;
+ } else if ($pre>50) {
+ $pre = min($pre,100-$post);
+ } else if ($post>50) {
+ $post = min($post, 100-$pre);
+ } else {
+ // both are less than 50, means the context is the whole string
+ // make it so and break out of this loop - there is no need for the complex snippet calculations
+ $snippets = array($text);
+ break;
+ }
- if ($pre>50 && $post>50) {
- $pre = $post = 50;
- } else if ($pre>50) {
- $pre = min($pre,100-$post);
- } else if ($post>50) {
- $post = min($post, 100-$pre);
- } else {
- // both are less than 50, means the context is the whole string
- // make it so and break out of this loop - there is no need for the complex snippet calculations
- $snippets = array($text);
- break;
- }
+ // establish context start and end points, try to append to previous context if possible
+ $start = $utf8_idx - $pre;
+ $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
+ $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context
- // establish context start and end points, try to append to previous context if possible
- $start = $idx - $pre;
- $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
- $end = $idx + $utf8_len + $post; // now set it to the end of this context
+ if ($append) {
+ $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+ } else {
+ $snippets[] = utf8_substr($text,$start,$end-$start);
+ }
- if ($append) {
- $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+ // set $offset for next match attempt
+ // substract strlen to avoid splitting a potential search success, this is an approximation as the
+ // search pattern may match strings of varying length and it will fail if the context snippet
+ // boundary breaks a matching string longer than the current match
+ $utf8_offset = $utf8_idx + $post;
+ $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
+ $offset = utf8_correctIdx($text,$offset);
} else {
- $snippets[] = utf8_substr($text,$start,$end-$start);
+ // code for strings too large for utf8_substr
+ // use a larger context number as its bytes not characters
+ $pre = 70;
+ $post = min(strlen($text)-$idx-strlen($str), 70);
+ if ($post < 70) { $pre = 70 - $post; }
+
+ $start = utf8_correctIdx($text,$idx - $pre);
+ $end = utf8_correctIdx($text, $idx + strlen($str) + $post);
+
+ $snippets[] = substr($text,$start,$end-$start);
+ $offset = $end - strlen($str);
}
- // set $offset for next match attempt
- // substract strlen to avoid splitting a potential search success, this is an approximation as the
- // search pattern may match strings of varying length and it will fail if the context snippet
- // boundary breaks a matching string longer than the current match
- $utf8_offset = $end - $utf8_len;
- $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset)));
}
$m = "\1";
$snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
diff --git a/inc/indexer.php b/inc/indexer.php
index 9af4b5b84..a2b7a0637 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -233,6 +233,8 @@ function idx_lookup($words){
if(substr($xword,0,1) == '*'){
$xword = substr($xword,1);
$wild = 1;
+ $ptn = '/'.preg_quote($xword,'/').'$/';
+# $l = -1*strlen($xword)-1;
}
if(substr($xword,-1,1) == '*'){
$xword = substr($xword,0,-1);
@@ -245,8 +247,11 @@ function idx_lookup($words){
for($wid=0; $wid<$cnt; $wid++){
$iword = $word_idx[$wid];
if( (($wild==3) && is_int(strpos($iword,$xword))) ||
- (($wild==1) && ("$xword\n" == substr($iword,(-1*strlen($xword))-1))) ||
- (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+# (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
+ (($wild==1) && preg_match($ptn,$iword)) ||
+# (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+ (($wild==2) && (0 === strpos($iword,$xword)))
+
){
$wids[] = $wid;
$result[$word][] = $wid;
diff --git a/inc/utf8.php b/inc/utf8.php
index aa9594c42..dbf09b6fc 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -13,6 +13,7 @@
if(!defined('UTF8_MBSTRING')){
if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
define('UTF8_MBSTRING',1);
+ mb_internal_encoding('UTF-8');
}else{
define('UTF8_MBSTRING',0);
}