more utf8_substr improvements (re FS#891 and yesterday's patch)

- rework utf8_substr() NOMBSTRING code to always use PCRE
- remove workaround for utf8_substr() and large strings from ft_snippet()

darcs-hash:20060928165122-9b6ab-0eefc216f07f9d7e7d8eb62ce26605c28ee340fa.gz
author: chris <chris@jalakai.co.uk> 2006-09-28 18:51:22 +0200
committer: chris <chris@jalakai.co.uk> 2006-09-28 18:51:22 +0200
commit: 2626ee0c66b9293c433a1b593ce1e2fe45bdb040 (patch)
tree: 7499f734c38a2eecf5977191c2f4c74f1013bb5f /inc/fulltext.php
parent: d07dd8ee598c9b9dc8dfc9a61e0fdfa023ad59de (diff)
download: rpg-2626ee0c66b9293c433a1b593ce1e2fe45bdb040.tar.gz
rpg-2626ee0c66b9293c433a1b593ce1e2fe45bdb040.tar.bz2
1 files changed, 29 insertions, 46 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 4aeb622e2..794209a4b 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -328,10 +328,6 @@ switch ($algorithm) {
 
       list($str,$idx) = $match[0];
       
-      // is it ok to use utf8_substr() -- see bug #891,
-      //   check idx against (2^16)-1 - 400 (100x4 byte utf-8 characters)
-      if ($idx <= 65135) {
-
       // convert $idx (a byte offset) into a utf8 character offset
       $utf8_idx = utf8_strlen(substr($text,0,$idx));
       $utf8_len = utf8_strlen($str);
@@ -339,55 +335,42 @@ switch ($algorithm) {
       // establish context, 100 bytes surrounding the match string
       // first look to see if we can go 100 either side,
       // then drop to 50 adding any excess if the other side can't go to 50,
-        $pre = min($utf8_idx-$utf8_offset,100);
-        $post = min($len-$utf8_idx-$utf8_len,100);
-
-        if ($pre>50 && $post>50) {
-          $pre = $post = 50;
-        } else if ($pre>50) {
-          $pre = min($pre,100-$post);
-        } else if ($post>50) {
-          $post = min($post, 100-$pre);
-        } else {
-          // both are less than 50, means the context is the whole string
-          // make it so and break out of this loop - there is no need for the complex snippet calculations
-          $snippets = array($text);
-          break;
-        }
-
-        // establish context start and end points, try to append to previous context if possible
-        $start = $utf8_idx - $pre;
-        $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
-        $end = $utf8_idx + $utf8_len + $post;           // now set it to the end of this context
+      $pre = min($utf8_idx-$utf8_offset,100);
+      $post = min($len-$utf8_idx-$utf8_len,100);
 
-        if ($append) {
-          $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
-        } else {
-          $snippets[] = utf8_substr($text,$start,$end-$start);
-        }
-
-        // set $offset for next match attempt
-        //   substract strlen to avoid splitting a potential search success, this is an approximation as the
-        //   search pattern may match strings of varying length and it will fail if the context snippet
-        //   boundary breaks a matching string longer than the current match
-        $utf8_offset = $utf8_idx + $post;
-        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
-        $offset = utf8_correctIdx($text,$offset);
+      if ($pre>50 && $post>50) {
+        $pre = $post = 50;
+      } else if ($pre>50) {
+        $pre = min($pre,100-$post);
+      } else if ($post>50) {
+        $post = min($post, 100-$pre);
       } else {
-        // code for strings too large for utf8_substr
-        // use a larger context number as its bytes not characters
-        // no need to check for short pre, $idx is nearly 64k
-        $post = min(strlen($text)-$idx-strlen($str), 70);
-        $pre = ($post < 70) ?  140 - $post : 70;
+        // both are less than 50, means the context is the whole string
+        // make it so and break out of this loop - there is no need for the complex snippet calculations
+        $snippets = array($text);
+        break;
+      }
 
-        $start = utf8_correctIdx($text,$idx - $pre);
-        $end = utf8_correctIdx($text, $idx + strlen($str) + $post);
+      // establish context start and end points, try to append to previous context if possible
+      $start = $utf8_idx - $pre;
+      $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
+      $end = $utf8_idx + $utf8_len + $post;           // now set it to the end of this context
 
-        $snippets[] = substr($text,$start,$end-$start);
-        $offset = $end - strlen($str);
+      if ($append) {
+        $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+      } else {
+        $snippets[] = utf8_substr($text,$start,$end-$start);
       }
 
+      // set $offset for next match attempt
+      //   substract strlen to avoid splitting a potential search success, this is an approximation as the
+      //   search pattern may match strings of varying length and it will fail if the context snippet
+      //   boundary breaks a matching string longer than the current match
+      $utf8_offset = $utf8_idx + $post;
+      $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
+      $offset = utf8_correctIdx($text,$offset);
     }
+
     $m = "\1";
     $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
     $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
author	chris <chris@jalakai.co.uk>	2006-09-28 18:51:22 +0200
committer	chris <chris@jalakai.co.uk>	2006-09-28 18:51:22 +0200
commit	2626ee0c66b9293c433a1b593ce1e2fe45bdb040 (patch)
tree	7499f734c38a2eecf5977191c2f4c74f1013bb5f /inc/fulltext.php
parent	d07dd8ee598c9b9dc8dfc9a61e0fdfa023ad59de (diff)
download	rpg-2626ee0c66b9293c433a1b593ce1e2fe45bdb040.tar.gz rpg-2626ee0c66b9293c433a1b593ce1e2fe45bdb040.tar.bz2