diff options
-rw-r--r-- | inc/fulltext.php | 68 |
1 files changed, 65 insertions, 3 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index 6ab22a5c2..b9450c172 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -242,8 +242,8 @@ switch ($algorithm) { if ($idx < $end) continue; $pre = min($idx,50); - $start = $idx - $pre; - $end = min($idx+100+strlen($str)-$pre,$len); + $start = utf8_correctIdx($text, $idx - $pre); + $end = utf8_correctIdx($text, min($idx+100+strlen($str)-$pre,$len)); $snippets[] = substr($text,$start,$end-$start); if (!($cnt--)) break; } @@ -255,7 +255,7 @@ switch ($algorithm) { break; case 'opt2' : - default : + default : // option 2 ... CS 2006-08-25 // above + reduce amount of the file searched $match = array(); @@ -280,6 +280,11 @@ switch ($algorithm) { $pre = min($pre,100-$post); } else if ($post>50) { $post = min($post, 100-$pre); + } else { + // means both pre & post are less than 50, the context is the whole string + // make it so and break out of this loop - there is no need for the complex snippet calculations + $snippets = array($text); + break; } // establish context start and end points, try to append to previous context if possible @@ -304,6 +309,63 @@ switch ($algorithm) { $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets))); break; + + case 'utf8': + $match = array(); + $snippets = array(); + $utf8_offset = $offset = 0; + $len = utf8_strlen($text); + for ($cnt=3; $cnt--;) { + if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break; + + list($str,$idx) = $match[0]; + + // convert $idx (a byte offset) into a utf8 character offset + $utf8_idx = utf8_strlen(substr($text,0,$idx)); + $utf8_len = utf8_strlen($str); + + // establish context, 100 bytes surrounding the match string + // first look to see if we can go 100 either side, + // then drop to 50 adding any excess if the other side can't go to 50, + // NOTE: these are byte adjustments and will have to be corrected for utf-8 + $pre = min($utf8_idx-$utf8_offset,100); + $post = min($len-$utf8_idx-$utf8_len,100); + + if ($pre>50 && $post>50) { + $pre = $post = 50; + } else if ($pre>50) { + $pre = min($pre,100-$post); + } else if ($post>50) { + $post = min($post, 100-$pre); + } else { + // both are less than 50, means the context is the whole string + // make it so and break out of this loop - there is no need for the complex snippet calculations + $snippets = array($text); + break; + } + + // establish context start and end points, try to append to previous context if possible + $start = $idx - $pre; + $append = ($start < $end) ? $end : false; // still the end of the previous context snippet + $end = $idx + $utf8_len + $post; // now set it to the end of this context + + if ($append) { + $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); + } else { + $snippets[] = utf8_substr($text,$start,$end-$start); + } + + // set $offset for next match attempt + // substract strlen to avoid splitting a potential search success, this is an approximation as the + // search pattern may match strings of varying length and it will fail if the context snippet + // boundary breaks a matching string longer than the current match + $utf8_offset = $end - $utf8_len; + $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset))); + } + $m = "\1"; + $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets); + $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets))); + break; } return $snippet; |