ft_snippet() update

- correct "opt1" algorithm for multibyte utf8 - minor improvement to "opt2" for short pages - add "utf8" algorithm; this algorithm endeavours to work with whole utf8 characters as much as possible. The resulting snippet will tend towards 100 characters, rather than the 100 bytes of "opt1" and "opt2". darcs-hash:20060826234333-9b6ab-ae4c60c8855a92b133cb8d5a230098203f610e7b.gz
author: chris <chris@jalakai.co.uk> 2006-08-27 01:43:33 +0200
committer: chris <chris@jalakai.co.uk> 2006-08-27 01:43:33 +0200
commit: ced0762eb24c25ac27438d5bef2a6ff434e55dca (patch)
tree: b8a80c46a3b6384e589e41cf007341e8df8401af /inc/fulltext.php
parent: e1d60f58d6db29397d9f05e5c4704a97e46fa2d5 (diff)
download: rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.gz
rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.bz2
1 files changed, 65 insertions, 3 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 6ab22a5c2..b9450c172 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -242,8 +242,8 @@ switch ($algorithm) {
       if ($idx < $end) continue;
 
       $pre = min($idx,50);
-      $start = $idx - $pre;
-      $end = min($idx+100+strlen($str)-$pre,$len);
+      $start = utf8_correctIdx($text, $idx - $pre);
+      $end = utf8_correctIdx($text, min($idx+100+strlen($str)-$pre,$len));
       $snippets[] = substr($text,$start,$end-$start);
       if (!($cnt--)) break;
     }
@@ -255,7 +255,7 @@ switch ($algorithm) {
   break;
 
   case 'opt2' :
-	default :
+  default :
 // option 2 ... CS 2006-08-25
 // above + reduce amount of the file searched
     $match = array();
@@ -280,6 +280,11 @@ switch ($algorithm) {
         $pre = min($pre,100-$post);
       } else if ($post>50) {
         $post = min($post, 100-$pre);
+      } else {
+        // means both pre & post are less than 50, the context is the whole string
+        // make it so and break out of this loop - there is no need for the complex snippet calculations
+        $snippets = array($text);
+        break;
       }
 
       // establish context start and end points, try to append to previous context if possible
@@ -304,6 +309,63 @@ switch ($algorithm) {
     $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
 
   break;
+  
+  case 'utf8':
+    $match = array();
+    $snippets = array();
+    $utf8_offset = $offset = 0;
+    $len = utf8_strlen($text);
+    for ($cnt=3; $cnt--;) {
+      if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
+
+      list($str,$idx) = $match[0];
+      
+      // convert $idx (a byte offset) into a utf8 character offset
+      $utf8_idx = utf8_strlen(substr($text,0,$idx));
+      $utf8_len = utf8_strlen($str);
+
+      // establish context, 100 characters surrounding the match string
+      // first look to see if we can go 100 either side,
+      // then drop to 50 adding any excess if the other side can't go to 50,
+      // NOTE: $pre and $post are utf8 character counts here, not byte counts
+      $pre = min($utf8_idx-$utf8_offset,100);
+      $post = min($len-$utf8_idx-$utf8_len,100);
+
+      if ($pre>50 && $post>50) {
+        $pre = $post = 50;
+      } else if ($pre>50) {
+        $pre = min($pre,100-$post);
+      } else if ($post>50) {
+        $post = min($post, 100-$pre);
+      } else {
+        // both are less than 50, means the context is the whole string
+        // make it so and break out of this loop - there is no need for the complex snippet calculations
+        $snippets = array($text);
+        break;
+      }
+
+      // establish context start and end points, try to append to previous context if possible
+      $start = $idx - $pre;
+      $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
+      $end = $idx + $utf8_len + $post;                // now set it to the end of this context
+
+      if ($append) {
+        $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+      } else {
+        $snippets[] = utf8_substr($text,$start,$end-$start);
+      }
+
+      // set $offset for next match attempt
+      //   subtract the match length to avoid splitting a potential search success, this is an approximation as the
+      //   search pattern may match strings of varying length and it will fail if the context snippet
+      //   boundary breaks a matching string longer than the current match
+      $utf8_offset = $end - $utf8_len;
+      $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset)));
+    }
+    $m = "\1";
+    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
+    $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
+  break;
 }
 
     return $snippet;
author	chris <chris@jalakai.co.uk>	2006-08-27 01:43:33 +0200
committer	chris <chris@jalakai.co.uk>	2006-08-27 01:43:33 +0200
commit	ced0762eb24c25ac27438d5bef2a6ff434e55dca (patch)
tree	b8a80c46a3b6384e589e41cf007341e8df8401af /inc/fulltext.php
parent	e1d60f58d6db29397d9f05e5c4704a97e46fa2d5 (diff)
download	rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.gz rpg-ced0762eb24c25ac27438d5bef2a6ff434e55dca.tar.bz2