diff options
author | chris <chris@jalakai.co.uk> | 2006-08-27 01:43:33 +0200 |
---|---|---|
committer | chris <chris@jalakai.co.uk> | 2006-08-27 01:43:33 +0200 |
commit | ced0762eb24c25ac27438d5bef2a6ff434e55dca (patch) | |
tree | b8a80c46a3b6384e589e41cf007341e8df8401af | |
parent | e1d60f58d6db29397d9f05e5c4704a97e46fa2d5 (diff) | |
download | rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.gz rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.bz2 |
ft_snippet() update
- correct "opt1" algorithm for multibyte utf8
- minor improvement to "opt2" for short pages
- add "utf8" algorithm; this algorithm endeavours
to work with whole utf8 characters as much as
possible. The resulting snippet will tend to
100 characters, rather than the 100 bytes of
"opt1" and "opt2".
darcs-hash:20060826234333-9b6ab-ae4c60c8855a92b133cb8d5a230098203f610e7b.gz
-rw-r--r-- | inc/fulltext.php | 68 |
1 files changed, 65 insertions, 3 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index 6ab22a5c2..b9450c172 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -242,8 +242,8 @@ switch ($algorithm) { if ($idx < $end) continue; $pre = min($idx,50); - $start = $idx - $pre; - $end = min($idx+100+strlen($str)-$pre,$len); + $start = utf8_correctIdx($text, $idx - $pre); + $end = utf8_correctIdx($text, min($idx+100+strlen($str)-$pre,$len)); $snippets[] = substr($text,$start,$end-$start); if (!($cnt--)) break; } @@ -255,7 +255,7 @@ switch ($algorithm) { break; case 'opt2' : - default : + default : // option 2 ... CS 2006-08-25 // above + reduce amount of the file searched $match = array(); @@ -280,6 +280,11 @@ switch ($algorithm) { $pre = min($pre,100-$post); } else if ($post>50) { $post = min($post, 100-$pre); + } else { + // means both pre & post are less than 50, the context is the whole string + // make it so and break out of this loop - there is no need for the complex snippet calculations + $snippets = array($text); + break; } // establish context start and end points, try to append to previous context if possible @@ -304,6 +309,63 @@ switch ($algorithm) { $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... 
',$snippets))); break; + + case 'utf8': + $match = array(); + $snippets = array(); + $utf8_offset = $offset = 0; + $len = utf8_strlen($text); + for ($cnt=3; $cnt--;) { + if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break; + + list($str,$idx) = $match[0]; + + // convert $idx (a byte offset) into a utf8 character offset + $utf8_idx = utf8_strlen(substr($text,0,$idx)); + $utf8_len = utf8_strlen($str); + + // establish context, 100 bytes surrounding the match string + // first look to see if we can go 100 either side, + // then drop to 50 adding any excess if the other side can't go to 50, + // NOTE: these are byte adjustments and will have to be corrected for utf-8 + $pre = min($utf8_idx-$utf8_offset,100); + $post = min($len-$utf8_idx-$utf8_len,100); + + if ($pre>50 && $post>50) { + $pre = $post = 50; + } else if ($pre>50) { + $pre = min($pre,100-$post); + } else if ($post>50) { + $post = min($post, 100-$pre); + } else { + // both are less than 50, means the context is the whole string + // make it so and break out of this loop - there is no need for the complex snippet calculations + $snippets = array($text); + break; + } + + // establish context start and end points, try to append to previous context if possible + $start = $idx - $pre; + $append = ($start < $end) ? 
$end : false; // still the end of the previous context snippet + $end = $idx + $utf8_len + $post; // now set it to the end of this context + + if ($append) { + $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); + } else { + $snippets[] = utf8_substr($text,$start,$end-$start); + } + + // set $offset for next match attempt + // substract strlen to avoid splitting a potential search success, this is an approximation as the + // search pattern may match strings of varying length and it will fail if the context snippet + // boundary breaks a matching string longer than the current match + $utf8_offset = $end - $utf8_len; + $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset))); + } + $m = "\1"; + $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets); + $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets))); + break; } return $snippet; |