summaryrefslogtreecommitdiff
path: root/inc/fulltext.php
diff options
context:
space:
mode:
author: chris <chris@jalakai.co.uk> 2006-08-27 01:43:33 +0200
committer: chris <chris@jalakai.co.uk> 2006-08-27 01:43:33 +0200
commit: ced0762eb24c25ac27438d5bef2a6ff434e55dca (patch)
tree: b8a80c46a3b6384e589e41cf007341e8df8401af /inc/fulltext.php
parent: e1d60f58d6db29397d9f05e5c4704a97e46fa2d5 (diff)
download: rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.gz
rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.bz2
ft_snippet() update
- correct the "opt1" algorithm for multibyte utf8
- minor improvement to "opt2" for short pages
- add a "utf8" algorithm; this algorithm endeavours to work with whole utf8 characters as much as possible. The resulting snippet will tend to 100 characters, rather than the 100 bytes of "opt1" and "opt2".

darcs-hash:20060826234333-9b6ab-ae4c60c8855a92b133cb8d5a230098203f610e7b.gz
Diffstat (limited to 'inc/fulltext.php')
-rw-r--r-- inc/fulltext.php 68
1 files changed, 65 insertions, 3 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 6ab22a5c2..b9450c172 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -242,8 +242,8 @@ switch ($algorithm) {
if ($idx < $end) continue;
$pre = min($idx,50);
- $start = $idx - $pre;
- $end = min($idx+100+strlen($str)-$pre,$len);
+ $start = utf8_correctIdx($text, $idx - $pre);
+ $end = utf8_correctIdx($text, min($idx+100+strlen($str)-$pre,$len));
$snippets[] = substr($text,$start,$end-$start);
if (!($cnt--)) break;
}
@@ -255,7 +255,7 @@ switch ($algorithm) {
break;
case 'opt2' :
- default :
+ default :
// option 2 ... CS 2006-08-25
// above + reduce amount of the file searched
$match = array();
@@ -280,6 +280,11 @@ switch ($algorithm) {
$pre = min($pre,100-$post);
} else if ($post>50) {
$post = min($post, 100-$pre);
+ } else {
+ // means both pre & post are less than 50, the context is the whole string
+ // make it so and break out of this loop - there is no need for the complex snippet calculations
+ $snippets = array($text);
+ break;
}
// establish context start and end points, try to append to previous context if possible
@@ -304,6 +309,63 @@ switch ($algorithm) {
$snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
break;
+
+ case 'utf8':
+ $match = array();
+ $snippets = array();
+ $utf8_offset = $offset = 0;
+ $len = utf8_strlen($text);
+ for ($cnt=3; $cnt--;) {
+ if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
+
+ list($str,$idx) = $match[0];
+
+ // convert $idx (a byte offset) into a utf8 character offset
+ $utf8_idx = utf8_strlen(substr($text,0,$idx));
+ $utf8_len = utf8_strlen($str);
+
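+ // establish context, up to 100 utf8 characters surrounding the match string
+ // first look to see if we can go 100 either side,
+ // then drop to 50, adding any excess if the other side can't go to 50
+ // NOTE(review): $pre and $post are character counts here (derived from utf8_strlen), not bytes; mixing them with the byte offset $idx below looks wrong - presumably $utf8_idx is intended, confirm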
+ $pre = min($utf8_idx-$utf8_offset,100);
+ $post = min($len-$utf8_idx-$utf8_len,100);
+
+ if ($pre>50 && $post>50) {
+ $pre = $post = 50;
+ } else if ($pre>50) {
+ $pre = min($pre,100-$post);
+ } else if ($post>50) {
+ $post = min($post, 100-$pre);
+ } else {
+ // both are less than 50, means the context is the whole string
+ // make it so and break out of this loop - there is no need for the complex snippet calculations
+ $snippets = array($text);
+ break;
+ }
+
+ // establish context start and end points, try to append to previous context if possible
+ $start = $idx - $pre;
+ $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
+ $end = $idx + $utf8_len + $post; // now set it to the end of this context
+
+ if ($append) {
+ $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+ } else {
+ $snippets[] = utf8_substr($text,$start,$end-$start);
+ }
+
+ // set $offset for next match attempt
+ // subtract the match length to avoid splitting a potential search success; this is an approximation, as the
+ // search pattern may match strings of varying length, and it will fail if the context snippet
+ // boundary breaks a matching string longer than the current match
+ $utf8_offset = $end - $utf8_len;
+ $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset)));
+ }
+ $m = "\1";
+ $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
+ $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
+ break;
}
return $snippet;