diff options
author | chris <chris@jalakai.co.uk> | 2006-09-28 18:51:22 +0200 |
---|---|---|
committer | chris <chris@jalakai.co.uk> | 2006-09-28 18:51:22 +0200 |
commit | 2626ee0c66b9293c433a1b593ce1e2fe45bdb040 (patch) | |
tree | 7499f734c38a2eecf5977191c2f4c74f1013bb5f /inc/fulltext.php | |
parent | d07dd8ee598c9b9dc8dfc9a61e0fdfa023ad59de (diff) | |
download | rpg-2626ee0c66b9293c433a1b593ce1e2fe45bdb040.tar.gz rpg-2626ee0c66b9293c433a1b593ce1e2fe45bdb040.tar.bz2 |
more utf8_substr improvements (re FS#891 and yesterday's patch)
- rework utf8_substr() NOMBSTRING code to always use pcre
- remove the workaround for utf8_substr() and large strings from ft_snippet()
darcs-hash:20060928165122-9b6ab-0eefc216f07f9d7e7d8eb62ce26605c28ee340fa.gz
Diffstat (limited to 'inc/fulltext.php')
-rw-r--r-- | inc/fulltext.php | 75 |
1 files changed, 29 insertions, 46 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index 4aeb622e2..794209a4b 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -328,10 +328,6 @@ switch ($algorithm) { list($str,$idx) = $match[0]; - // is it ok to use utf8_substr() -- see bug #891, - // check idx against (2^16)-1 - 400 (100x4 byte utf-8 characters) - if ($idx <= 65135) { - // convert $idx (a byte offset) into a utf8 character offset $utf8_idx = utf8_strlen(substr($text,0,$idx)); $utf8_len = utf8_strlen($str); @@ -339,55 +335,42 @@ switch ($algorithm) { // establish context, 100 bytes surrounding the match string // first look to see if we can go 100 either side, // then drop to 50 adding any excess if the other side can't go to 50, - $pre = min($utf8_idx-$utf8_offset,100); - $post = min($len-$utf8_idx-$utf8_len,100); - - if ($pre>50 && $post>50) { - $pre = $post = 50; - } else if ($pre>50) { - $pre = min($pre,100-$post); - } else if ($post>50) { - $post = min($post, 100-$pre); - } else { - // both are less than 50, means the context is the whole string - // make it so and break out of this loop - there is no need for the complex snippet calculations - $snippets = array($text); - break; - } - - // establish context start and end points, try to append to previous context if possible - $start = $utf8_idx - $pre; - $append = ($start < $end) ? 
$end : false; // still the end of the previous context snippet - $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context + $pre = min($utf8_idx-$utf8_offset,100); + $post = min($len-$utf8_idx-$utf8_len,100); - if ($append) { - $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); - } else { - $snippets[] = utf8_substr($text,$start,$end-$start); - } - - // set $offset for next match attempt - // substract strlen to avoid splitting a potential search success, this is an approximation as the - // search pattern may match strings of varying length and it will fail if the context snippet - // boundary breaks a matching string longer than the current match - $utf8_offset = $utf8_idx + $post; - $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post)); - $offset = utf8_correctIdx($text,$offset); + if ($pre>50 && $post>50) { + $pre = $post = 50; + } else if ($pre>50) { + $pre = min($pre,100-$post); + } else if ($post>50) { + $post = min($post, 100-$pre); } else { - // code for strings too large for utf8_substr - // use a larger context number as its bytes not characters - // no need to check for short pre, $idx is nearly 64k - $post = min(strlen($text)-$idx-strlen($str), 70); - $pre = ($post < 70) ? 140 - $post : 70; + // both are less than 50, means the context is the whole string + // make it so and break out of this loop - there is no need for the complex snippet calculations + $snippets = array($text); + break; + } - $start = utf8_correctIdx($text,$idx - $pre); - $end = utf8_correctIdx($text, $idx + strlen($str) + $post); + // establish context start and end points, try to append to previous context if possible + $start = $utf8_idx - $pre; + $append = ($start < $end) ? 
$end : false; // still the end of the previous context snippet + $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context - $snippets[] = substr($text,$start,$end-$start); - $offset = $end - strlen($str); + if ($append) { + $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); + } else { + $snippets[] = utf8_substr($text,$start,$end-$start); } + // set $offset for next match attempt + // substract strlen to avoid splitting a potential search success, this is an approximation as the + // search pattern may match strings of varying length and it will fail if the context snippet + // boundary breaks a matching string longer than the current match + $utf8_offset = $utf8_idx + $post; + $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post)); + $offset = utf8_correctIdx($text,$offset); } + $m = "\1"; $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets); $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets))); |