diff options
author | Dries Buytaert <dries@buytaert.net> | 2004-10-31 03:03:27 +0000 |
---|---|---|
committer | Dries Buytaert <dries@buytaert.net> | 2004-10-31 03:03:27 +0000 |
commit | 8daed9cbf353de947bc3916f103206edd121de33 (patch) | |
tree | 4b51e7bcc748bc53a478ef8149e340ab4966293f /includes | |
parent | 83dc5f9bab88a855ed621804221100e47a55749e (diff) | |
download | brdo-8daed9cbf353de947bc3916f103206edd121de33.tar.gz brdo-8daed9cbf353de947bc3916f103206edd121de33.tar.bz2 |
- Patch #12232 by Steven/UnConed: search module improvements.
1) Clean up the text analyser: make it handle UTF-8 and all sorts of characters. The word splitter now does intelligent splitting into words and supports all Unicode characters. It has smart handling of acronyms, URLs, dates, ...
2) It now indexes the filtered output, which means it can take advantage of HTML tags. Meaningful tags (headers, strong, em, ...) are analysed and used to boost the scores of certain words. This has the side-effect of allowing the indexing of PHP nodes.
3) Link analyser for node links. The HTML analyser also checks for links. If they point to a node on the current site (handles path aliases) then the link's words are counted as part of the target node. This helps bring out commonly linked FAQs and answers to the top of the results.
4) Index comments along with the node. This means that the search can distinguish between a single node/comment about 'X' and a whole thread about 'X'. It also makes the search results much shorter and more relevant (before this patch, comments were even shown first).
5) We now keep track of total counts as well as a per-item count for a word. This allows us to divide the word score by the total before adding up the scores for different words, and automatically makes noisewords have less influence than rare words. This dramatically improves the relevancy of multiword searches. This also makes the disadvantage of now using OR searching instead of AND searching less problematic.
6) Includes support for text preprocessors through a hook. This is required to index Chinese and Japanese, because these languages do not use spaces between words. An external utility can be used to split these into words through a simple wrapper module. Other uses could be spell checking (although it would have no UI).
7) Indexing is now regulated: only a certain number of items will be indexed per cron run. This prevents PHP from running out of memory or timing out. This also makes the reindexing required for this patch automatic. I also added an index coverage estimate to the search admin screen.
8) Code cleanup! Moved all the search stuff from common.inc into search.module, rewired some hooks and simplified the functions used. The search form and results now also use valid XHTML and form_ functions. The search admin was moved from search/configure to admin/search for consistency.
9) Improved search output: we also show much more info per item: date, author, node type, number of comments and a cool dynamic excerpt à la Google. The search form is now much simpler and the help is only displayed as tips when no search results are found.
10) By moving all search logic to SQL, I was able to add a pager to the search results. This improves usability and performance dramatically.
Diffstat (limited to 'includes')
-rw-r--r-- | includes/common.inc | 183 |
1 files changed, 54 insertions, 129 deletions
diff --git a/includes/common.inc b/includes/common.inc index 36e3ac3df..7a5f8e961 100644 --- a/includes/common.inc +++ b/includes/common.inc @@ -668,134 +668,6 @@ function valid_input_data($data) { * @} End of "defgroup validation". */ -/** - * @defgroup search Search interface - * @{ - * The Drupal search interface manages a global search mechanism. - * - * Modules may plug into this system to provide searches of different types of - * data. Most of the system is handled by search.module, so this must be enabled - * for all of the search features to work. - */ - -/** - * Format a single result entry of a search query. - * - * Modules may implement hook_search_item() in order to override this default - * function to display search results. - * - * @param $item - * A single search result as returned by hook_search(). The result should be - * an array with keys "count", "link", "title", "user", "date", and "keywords". - * @param $type - * The type of item found, such as "user" or "comment". - */ -function search_item($item, $type) { - if (module_hook($type, 'search_item')) { - $output = module_invoke($type, 'search_item', $item); - } - else { - $output = ' <dt class="title"><a href="'. $item['link'] .'">'. $item['title'] .'</a></dt>'; - $output .= ' <dd class="small">' . t($type) . ($item['user'] ? ' - '. $item['user'] : '') .''. ($item['date'] ? ' - '. format_date($item['date'], 'small') : '') .'</dd>'; - } - - return $output; -} - -/** - * Render a generic search form. - * - * This form must be usable not only within "http://example.com/search", but also - * as a simple search box (without "Restrict search to", help text, etc.), in the - * theme's header, and so forth. This means we must provide options to - * conditionally render certain parts of this form. - * - * @param $action - * Form action. Defaults to "search". - * @param $keys - * The search string entered by the user, containing keywords for the search. 
- * @param $options - * Whether to render the optional form fields and text ("Restrict search - * to", help text, etc.). - * @return - * An HTML string containing the search form. - */ -function search_form($action = '', $keys = '', $options = FALSE) { - $edit = $_POST['edit']; - - if (!$action) { - $action = url('search'); - } - - $output = ' <div class="search-form"><br /><input type="text" class="form-text" size="50" value="'. check_form($keys) .'" name="keys" />'; - $output .= ' <input type="submit" class="form-submit" value="'. t('Search') ."\" />\n"; - - if ($options) { - $output .= '<br />'; - $output .= t('Restrict search to') .': '; - - foreach (module_list() as $name) { - if (module_hook($name, 'search')) { - $output .= ' <input type="checkbox" name="edit[type]['. $name .']" '. ($edit['type'][$name] ? ' checked="checked"' : '') .' /> '. t($name); - } - } - } - $output .= '</div>'; - - return form($output, 'post', $action); -} - -/** - * Perform a global search on the given keys, and return the formatted results. - */ -function search_data($keys = NULL) { - $edit = $_POST['edit']; - $output = ''; - - if (isset($keys)) { - foreach (module_list() as $name) { - if (module_hook($name, 'search') && (!$edit['type'] || $edit['type'][$name])) { - list($title, $results) = module_invoke($name, 'search', $keys); - if ($results) { - $output .= '<h2>'. $title .'</h2>'; - $output .= '<dl class="search-results">'; - foreach ($results as $entry) { - $output .= search_item($entry, $name); - } - $output .= '</dl>'; - } - } - } - } - - return $output; -} - -/** - * Display a search form for a particular type of data. - * - * @param $type - * The type of content to search within. - * @param $action - * Form action. Defaults to "search". - * @param $keys - * The search string entered by the user, containing keywords for the search. - * @param $options - * Whether to render the optional form fields and text ("Restrict search - * to", help text, etc.). 
- * @return - * An HTML string containing the search form and results. - */ -function search_type($type, $action = '', $keys = '', $options = FALSE) { - $_POST['edit']['type'][$type] = 'on'; - - return search_form($action, $keys, $options) . '<br />'. search_data($keys); -} - -/** - * @} End of "defgroup search". - */ - function check_form($text) { return drupal_specialchars($text, ENT_QUOTES); } @@ -1840,7 +1712,7 @@ function truncate_utf8($string, $len) { /** * Encodes MIME/HTTP header values that contain non US-ASCII characters. * - * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=". + * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=". * * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information. * @@ -1863,6 +1735,59 @@ function mime_header_encode($string, $charset = 'UTF-8') { } /** + * Decode all HTML entities (including numerical ones) to regular UTF-8 bytes. + */ +function decode_entities($text) { + static $table; + // We store named entities in a table for quick processing. + if (!isset($table)) { + // Get all named HTML entities. + $table = array_flip(get_html_translation_table(HTML_ENTITIES, $special)); + // PHP gives us Windows-1252/ISO-8859-1 data, we need UTF-8. + $table = array_map('utf8_encode', $table); + } + $text = strtr($text, $table); + + // Any remaining entities are numerical. Use a regexp to replace them. + return preg_replace('/&#(x?)([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2")', $text); +} + +/** + * Helper function for decode_entities + */ +function _decode_entities($hex, $codepoint) { + if ($hex != '') { + $codepoint = base_convert($codepoint, 16, 10); + } + if ($codepoint < 0x80) { + return chr($codepoint); + } + else if ($codepoint < 0x800) { + return chr(0xC0 | ($codepoint >> 6)) + . chr(0x80 | ($codepoint & 0x3F)); + } + else if ($codepoint < 0x10000) { + return chr(0xE0 | ( $codepoint >> 12)) + . chr(0x80 | (($codepoint >> 6) & 0x3F)) + . 
chr(0x80 | ( $codepoint & 0x3F)); + } + else if ($codepoint < 0x200000) { + return chr(0xF0 | ( $codepoint >> 18)) + . chr(0x80 | (($codepoint >> 12) & 0x3F)) + . chr(0x80 | (($codepoint >> 6) & 0x3F)) + . chr(0x80 | ( $codepoint & 0x3F)); + } +} + +/** + * Count the amount of characters in a UTF-8 string. This is less than or + * equal to the byte count. + */ +function string_length(&$text) { + return strlen(preg_replace("/[\x80-\xBF]/", '', $text)); +} + +/** * Evaluate a string of PHP code. * * This is a wrapper around PHP's eval(). It uses output buffering to capture both |