diff options
Diffstat (limited to 'modules/search/search.module')
-rw-r--r-- | modules/search/search.module | 99 |
1 files changed, 95 insertions, 4 deletions
diff --git a/modules/search/search.module b/modules/search/search.module index 027643485..51db8695d 100644 --- a/modules/search/search.module +++ b/modules/search/search.module @@ -1142,12 +1142,13 @@ function search_excerpt($keys, $text) { array_walk($keys, '_search_excerpt_replace'); $workkeys = $keys; - // Extract a fragment per keyword for at most 4 keywords. + // Extract fragments around keywords. // First we collect ranges of text around each keyword, starting/ending - // at spaces. + // at spaces, trying to get to 256 characters. // If the sum of all fragments is too short, we look for second occurrences. $ranges = array(); $included = array(); + $foundkeys = array(); $length = 0; while ($length < 256 && count($workkeys)) { foreach ($workkeys as $k => $key) { @@ -1164,10 +1165,26 @@ function search_excerpt($keys, $text) { if (!isset($included[$key])) { $included[$key] = 0; } - // Locate a keyword (position $p), then locate a space in front (position - // $q) and behind it (position $s) + // Locate a keyword (position $p, always >0 because $text starts with a + // space). First try bare keyword, but if that doesn't work, try to find a + // derived form from search_simplify(). + $p = 0; if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) { $p = $match[0][1]; + } + else { + $info = search_simplify_excerpt_match($key, $text, $included[$key], $boundary); + if ($info['where']) { + $p = $info['where']; + if ($info['keyword']) { + $foundkeys[] = $info['keyword']; + } + } + } + // Now locate a space in front (position $q) and behind it (position $s), + // leaving about 60 characters extra before and after for context. + // Note that a space was added to the front and end of $text above. + if ($p) { if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) { $end = substr($text . 
' ', $p, 80); if (($s = strrpos($end, ' ')) !== FALSE) { @@ -1233,6 +1250,10 @@ function search_excerpt($keys, $text) { $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2]; $text = check_plain($text); + // Slash-escape quotes in keys found in a derived form and merge with original keys. + array_walk($foundkeys, '_search_excerpt_replace'); + $keys = array_merge($keys, $foundkeys); + // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>'). $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text); return $text; @@ -1250,6 +1271,76 @@ function _search_excerpt_replace(&$text) { } /** + * Find words in the original text that matched via search_simplify(). + * + * This is called in search_excerpt() if an exact match is not found in the + * text, so that we can find the derived form that matches. + * + * @param $key + * The keyword to find. + * @param $text + * The text to search for the keyword. + * @param $offset + * Offset position in $text to start searching at. + * @param $boundary + * Text to include in a regular expression that will match a word boundary. + * + * @return + * FALSE if no match is found. If a match is found, return an associative + * array with element 'where' giving the position of the match, and element + * 'keyword' giving the actual word found in the text at that position. + */ +function search_simplify_excerpt_match($key, $text, $offset, $boundary) { + $pos = NULL; + $simplified_key = search_simplify($key); + $simplified_text = search_simplify($text); + + // Check if we have a match after simplification in the text. + if (!preg_match('/' . $boundary . $simplified_key . $boundary . '/iu', $simplified_text, $match, PREG_OFFSET_CAPTURE, $offset)) { + return FALSE; + } + + // If we get here, we have a match. Now find the exact location of the match + // and the original text that matched. 
Start by splitting up the text by all + // potential starting points of the matching text and iterating through them. + $split = array_filter(preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE), '_search_excerpt_match_filter'); + foreach ($split as $value) { + // Skip starting points before the offset. + if ($value[1] < $offset) { + continue; + } + + // Check a window of 80 characters after the starting point for a match, + // based on the size of the excerpt window. + $window = substr($text, $value[1], 80); + $simplified_window = search_simplify($window); + if (strpos($simplified_window, $simplified_key) === 0) { + // We have a match in this window. Store the position of the match. + $pos = $value[1]; + // Iterate through the text in the window until we find the full original + // matching text. + $length = strlen($window); + for ($i = 1; $i <= $length; $i++) { + $keyfound = substr($text, $value[1], $i); + if ($simplified_key == search_simplify($keyfound)) { + break; + } + } + break; + } + } + + return $pos ? array('where' => $pos, 'keyword' => $keyfound) : FALSE; +} + +/** + * Helper function for array_filter() in search_simplify_excerpt_match(). + */ +function _search_excerpt_match_filter($var) { + return strlen(trim($var[0])); +} + +/** * Implements hook_forms(). */ function search_forms() { |