Fix search result highlighting

Ignore-this: b48abc447a5d32123322eb08717d275d darcs-hash:20091115121708-9b77a-4a0b6c7afb8db60e22ca07a4d9725474a869c1c8.gz
author: Kazutaka Miyasaka <kazmiya@gmail.com> 2009-11-15 13:17:08 +0100
committer: Kazutaka Miyasaka <kazmiya@gmail.com> 2009-11-15 13:17:08 +0100
commit: 2f502d705ba80c8914da825ad997ddbdef0772cb (patch)
tree: b8fa2db9c0e300cc27a57e222a7c33be5067dfea /inc
parent: d9baf1a70bff3db65f4443658c00eae23ad11c8a (diff)
download: rpg-2f502d705ba80c8914da825ad997ddbdef0772cb.tar.gz
rpg-2f502d705ba80c8914da825ad997ddbdef0772cb.tar.bz2
1 files changed, 51 insertions, 24 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php
index afb15528e..c8236e1d4 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -53,11 +53,13 @@ function _ft_pageSearch(&$data) {
     foreach ($q['parsed_ary'] as $token) {
         switch (substr($token, 0, 3)) {
             case 'W+:':
-            case 'W-:': // word
+            case 'W-:':
+            case 'W_:': // word
                 $word    = substr($token, 3);
                 $stack[] = (array) $lookup[$word];
                 break;
-            case 'P_:': // phrase
+            case 'P+:':
+            case 'P-:': // phrase
                 $phrase = substr($token, 3);
                 // since phrases are always parsed as ((W1)(W2)...(P)),
                 // the end($stack) always points the pages that contain
@@ -72,7 +74,8 @@ function _ft_pageSearch(&$data) {
                 }
                 $stack[] = $pages_matched;
                 break;
-            case 'N_:': // namespace
+            case 'N+:':
+            case 'N-:': // namespace
                 $ns = substr($token, 3);
                 $pages_matched = array();
                 foreach (array_keys($pages_all) as $id) {
@@ -487,14 +490,13 @@ function ft_queryParser($query){
      *
      * intermediate representation consists of the following parts:
      *
-     *   ( ) - group
-     *   AND - logical and
-     *   OR  - logical or
-     *   NOT - logical not
-     *   W+: - word (needs to be highlighted)
-     *   W-: - word (no need to highlight)
-     *   P_: - phrase
-     *   N_: - namespace
+     *   ( )           - group
+     *   AND           - logical and
+     *   OR            - logical or
+     *   NOT           - logical not
+     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
+     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
+     *   N+:, N-:      - namespace
      */
     $parsed_query = '';
     $parens_level = 0;
@@ -546,10 +548,10 @@ function ft_queryParser($query){
                     $parsed .= 'OR';
                 } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                     // namespace-exclude
-                    $parsed .= 'NOT(N_:'.$matches[1].')';
+                    $parsed .= 'NOT(N+:'.$matches[1].')';
                 } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                     // namespace-include
-                    $parsed .= '(N_:'.$matches[1].')';
+                    $parsed .= '(N+:'.$matches[1].')';
                 } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                     // word-exclude
                     $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
@@ -574,6 +576,26 @@ function ft_queryParser($query){
     $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
     $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);
 
+    // adjustment: turn off the highlight flag of terms nested in an odd number of NOT groups
+    $parens_level     = 0;
+    $notgrp_levels    = array();
+    $parsed_query_new = '';
+    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
+    foreach ($tokens as $token) {
+        if ($token === 'NOT(') {
+            $notgrp_levels[] = ++$parens_level;
+        } elseif ($token === '(') {
+            ++$parens_level;
+        } elseif ($token === ')') {
+            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
+        } elseif (count($notgrp_levels) % 2 === 1) {
+            // turn highlight-flag off if terms are logically in "NOT" group
+            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
+        }
+        $parsed_query_new .= $token;
+    }
+    $parsed_query = $parsed_query_new;
+
     /**
      * convert infix notation string into postfix (Reverse Polish notation) array
      * by Shunting-yard algorithm
@@ -628,32 +650,37 @@ function ft_queryParser($query){
         $body = substr($token, 3);
 
         switch (substr($token, 0, 3)) {
-            case 'N_:':
+            case 'N+:':
                 $q['ns'][]        = $body; // for backward compatibility
                 break;
+            case 'N-:':
+                $q['notns'][]     = $body; // for backward compatibility
+                break;
+            case 'W_:':
+                $q['words'][]     = $body;
+                break;
             case 'W-:':
                 $q['words'][]     = $body;
+                $q['not'][]       = $body; // for backward compatibility
                 break;
             case 'W+:':
                 $q['words'][]     = $body;
                 $q['highlight'][] = str_replace('*', '', $body);
+                $q['and'][]       = $body; // for backward compatibility
                 break;
-            case 'P_:':
+            case 'P-:':
+                $q['phrases'][]   = $body;
+                break;
+            case 'P+:':
                 $q['phrases'][]   = $body;
                 $q['highlight'][] = str_replace('*', '', $body);
                 break;
         }
     }
-    foreach (array('words', 'phrases', 'highlight', 'ns') as $key) {
+    foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
         $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
     }
 
-    // keep backward compatibility (to some extent)
-    // this part can be deleted if no plugins use ft_queryParser() directly
-    $q['and']   = $q['words'];
-    $q['not']   = array(); // difficult to set: imagine [ aaa -(bbb -ccc) ]
-    $q['notns'] = array(); // same as above
-
     return $q;
 }
 
@@ -677,14 +704,14 @@ function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode
         $term_noparen = str_replace(array('(', ')'), ' ', $term);
         $words = idx_tokenizer($term_noparen, $stopwords, true);
 
-        // W+: needs to be highlighted, W-: no need to highlight
+        // W+: needs to be highlighted, W_: no need to highlight
         if (empty($words)) {
             $parsed = '()'; // important: do not remove
         } elseif ($words[0] === $term) {
             $parsed = '(W+:'.$words[0].')';
         } elseif ($phrase_mode) {
             $term_encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
-            $parsed = '((W-:'.implode(')(W-:', $words).')(P_:'.$term_encoded.'))';
+            $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
         } else {
             $parsed = '((W+:'.implode(')(W+:', $words).'))';
         }
author	Kazutaka Miyasaka <kazmiya@gmail.com>	2009-11-15 13:17:08 +0100
committer	Kazutaka Miyasaka <kazmiya@gmail.com>	2009-11-15 13:17:08 +0100
commit	2f502d705ba80c8914da825ad997ddbdef0772cb (patch)
tree	b8fa2db9c0e300cc27a57e222a7c33be5067dfea /inc
parent	d9baf1a70bff3db65f4443658c00eae23ad11c8a (diff)
download	rpg-2f502d705ba80c8914da825ad997ddbdef0772cb.tar.gz rpg-2f502d705ba80c8914da825ad997ddbdef0772cb.tar.bz2