summaryrefslogtreecommitdiff
path: root/modules/search.module
diff options
context:
space:
mode:
authorSteven Wittens <steven@10.no-reply.drupal.org>2005-10-18 14:41:27 +0000
committerSteven Wittens <steven@10.no-reply.drupal.org>2005-10-18 14:41:27 +0000
commit909d6928acb47cb9b50740e589b5e7447353e19c (patch)
tree6662a983847628472dbd1140d20a9b1e30311d0d /modules/search.module
parent782d5c98c9b37f1d22152af53c6b0604674c38c8 (diff)
downloadbrdo-909d6928acb47cb9b50740e589b5e7447353e19c.tar.gz
brdo-909d6928acb47cb9b50740e589b5e7447353e19c.tar.bz2
- #28159: Advanced search features (hello from DrupalCon)
Presentation about it: http://www.acko.net/files/drupal-search-slim.pdf
Diffstat (limited to 'modules/search.module')
-rw-r--r--modules/search.module558
1 files changed, 411 insertions, 147 deletions
diff --git a/modules/search.module b/modules/search.module
index adc020eb5..e26f7045e 100644
--- a/modules/search.module
+++ b/modules/search.module
@@ -15,32 +15,80 @@
* Lu Letter, Uppercase
* Ll Letter, Lowercase
* Lt Letter, Titlecase
- * Lm Letter, Modifier
* Lo Letter, Other
- * Mn Mark, Nonspacing
- * Mc Mark, Spacing Combining
* Nd Number, Decimal Digit
- * Nl Number, Letter
* No Number, Other
- * Sm Symbol, Math
- * Sc Symbol, Currency
- * Sk Symbol, Modifier
- * So Symbol, Other
- *
- * All character classes not in the list above (enclosing marks, punctuation, control codes and spacers):
- * 'Me', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Cs', 'Co'
*/
-define('PREG_CLASS_SEARCH_EXCLUDE', '\x{0}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{7f}-\x{a1}\x{ab}\x{ad}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{488}\x{489}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{600}-\x{603}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{6dd}\x{6de}\x{700}-\x{70d}\x{70f}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17b4}\x{17b5}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{180e}\x{1944}\x{1945}\x{2000}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{205f}-\x{2063}\x{206a}-\x{206f}\x{207d}\x{207e}\x{208d}\x{208e}\x{20dd}-\x{20e0}\x{20e2}-\x{20e4}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3000}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{d800}\x{db7f}\x{db80}\x{dbff}\x{dc00}\x{dfff}\x{e000}\x{f8ff}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{feff}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{fff9}-\x{fffb}\x{10100}\x{10101}\x{1039f}\x{1d173}-\x{1d17a}\x{e0001}\x{e0020}-\x{e007f}\x{f0000}\x{ffffd}\x{100000}');
+define('PREG_CLASS_SEARCH_EXCLUDE',
+'\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
+'\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
+'\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
+'\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
+'\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
+'\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
+'\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
+'\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
+'\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
+'\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
+'\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
+'\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
+'\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
+'\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
+'\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
+'\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
+'\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
+'\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
+'\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
+'\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
+'\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
+'\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
+'\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
+'\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
+'\x{a80b}\x{a823}-\x{a82b}\x{d800}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
+'\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
+'\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
/**
* Matches all 'N' Unicode character classes (numbers)
*/
-define('PREG_CLASS_NUMBERS', '\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}\x{10107}-\x{10133}\x{10320}-\x{10323}\x{1034a}\x{104a0}-\x{104a9}\x{1d7ce}-\x{1d7ff}');
+define('PREG_CLASS_NUMBERS',
+'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
+'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
+'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
+'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
+'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
+'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
+'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
+'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
+'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
/**
* Matches all 'P' Unicode character classes (punctuation)
*/
-define('PREG_CLASS_PUNCTUATION', '\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{10100}\x{10101}\x{1039f}');
+define('PREG_CLASS_PUNCTUATION',
+'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
+'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
+'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
+'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
+'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
+'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
+'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
+'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
+'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
+'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
+'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
+'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
+'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
+'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
+'\x{ff65}');
+
+/**
+ * Matches all CJK characters that are candidates for auto-splitting
+ * (Chinese, Japanese, Korean).
+ * Contains kana and BMP ideographs.
+ */
+define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
+'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
/**
* Implementation of hook_help().
@@ -56,9 +104,9 @@ function search_help($section = 'admin/help#search') {
case 'search#noresults':
return t('<p><ul>
<li>Check if your spelling is correct.</li>
-<li>Try using wildcards: <em>walk*</em> matches <em>walker</em>, <em>walking</em>, ...</li>
-<li>Use longer words (words shorter than %number letters are ignored).</li>
-</ul></p>', array('%number' => variable_get('minimum_word_size', 3)));
+<li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
+<li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
+</ul></p>');
}
}
@@ -120,7 +168,7 @@ function search_menu($may_cache) {
*/
function search_settings_form_validate($form_id, &$form) {
// If the word length settings change, the index needs to be rebuilt.
- if (variable_get('minimum_word_size', 3) != $form['minimum_word_size']) {
+ if (variable_get('minimum_word_size', 4) != $form['minimum_word_size']) {
drupal_set_message(t('The index will be rebuilt.'));
search_wipe();
}
@@ -154,9 +202,11 @@ function search_settings() {
// Indexing settings:
$form['indexing_settings'] = array('#type' => 'fieldset', '#title' => t('Indexing settings'));
$form['indexing_settings']['info'] = array('#type' => 'markup', '#value' => '<em>'. t('<p>Changing the setting below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>');
- $form['indexing_settings']['minimum_word_size'] = array('#type' => 'textfield', '#title' => t('Minimum word length to index'), '#default_value' => variable_get('minimum_word_size', 3), '#size' => 5, '#maxlength' => 3, '#description' => t('The number of characters a word has to be to be indexed. Words shorter than this will not be searchable.'));
+ $form['indexing_settings']['minimum_word_size'] = array('#type' => 'textfield', '#title' => t('Minimum word length to index'), '#default_value' => variable_get('minimum_word_size', 4), '#size' => 5, '#maxlength' => 3, '#description' => t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'));
$form['indexing_settings']['remove_short'] = array('#type' => 'textfield', '#title' => t('Minimum word length to search for'), '#default_value' => variable_get('remove_short', 3), '#size' => 5, '#maxlength' => 3, '#description' => t('The number of characters a word has to be to be searched for, including wildcard characters.'));
+ // Per module settings
+ $form = array_merge($form, module_invoke_all('search', 'admin'));
return $form;
}
@@ -174,6 +224,7 @@ function search_wipe($sid = NULL, $type = NULL) {
module_invoke_all('search', 'reset');
}
else {
+ db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
}
@@ -205,12 +256,15 @@ function search_cron() {
foreach (module_list() as $module) {
module_invoke($module, 'update_index');
}
- // Update word counts for new/changed words
+ // Update word IDF (Inverse Document Frequency) counts for new/changed words
foreach (search_dirty() as $word => $dummy) {
+ // Get total count
$total = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
- db_query("UPDATE {search_total} SET count = %d WHERE word = '%s'", $total, $word);
+ // Apply Zipf's law to equalize the probability distribution
+ $total = log10(1 + 1/(max(1, $total)));
+ db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $total, $word);
if (!db_affected_rows()) {
- db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %d)", $word, $total);
+ db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $total);
}
}
// Find words that were deleted from search_index, but are still in
@@ -223,22 +277,21 @@ function search_cron() {
}
/**
- * Splits a string into component words according to indexing rules.
+ * Simplifies a string according to indexing rules.
*/
-function search_keywords_split($text) {
- static $last = null;
- static $lastsplit = null;
-
- if ($last == $text) {
- return $lastsplit;
- }
-
+function search_simplify($text) {
// Decode entities to UTF-8
$text = decode_entities($text);
+ // Lowercase
+ $text = drupal_strtolower($text);
+
// Call an external processor for word handling.
search_preprocess($text);
+ // Baseline CJK handling
+ $text = preg_replace_callback('/['. PREG_CLASS_CJK .']+/u', 'search_expand_cjk', $text);
+
// To improve searching for numerical data such as dates, IP addresses
// or version numbers, we consider a group of numerical characters
// separated only by punctuation characters to be one piece.
@@ -255,9 +308,44 @@ function search_keywords_split($text) {
// marks, spacers, etc, to be a word boundary.
$text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);
+ return $text;
+}
+
+/**
+ * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
+ * pairs of characters.
+ *
+ * Has the signature of a preg_replace_callback() callback: it receives the
+ * match array and returns the replacement text.
+ *
+ * @param $matches
+ *   Match array; only $matches[0] (the full matched run of characters) is
+ *   used.
+ *
+ * @return
+ *   A string that starts with a space and contains every overlapping pair of
+ *   adjacent characters, each followed by a space. A run consisting of a
+ *   single character contains no pairs, so only ' ' is returned.
+ */
+function search_expand_cjk($matches) {
+  $tokens = ' ';
+  // Split off first character. The character is read with drupal_substr()
+  // (presumably the multibyte-aware variant) and then removed from the
+  // front of the string by its byte length.
+  $last = drupal_substr($matches[0], 0, 1);
+  $str = substr($matches[0], strlen($last));
+  // Begin loop
+  $l = drupal_strlen($str);
+  for ($i = 0; $i < $l; ++$i) {
+    // Grab next character
+    $current = drupal_substr($str, 0, 1);
+    // Advance by the byte length of the character that was just read, not of
+    // the previous one; otherwise the remainder is cut at the wrong offset
+    // whenever two adjacent characters differ in encoded length.
+    $str = substr($str, strlen($current));
+    $tokens .= $last . $current .' ';
+    $last = $current;
+  }
+  return $tokens;
+}
+
+/**
+ * Splits a string into tokens for indexing.
+ */
+function search_index_split($text) {
+ static $last = null;
+ static $lastsplit = null;
+
+ if ($last == $text) {
+ return $lastsplit;
+ }
// Process words
+ $text = search_simplify($text);
$words = explode(' ', $text);
- array_walk($words, '_search_keywords_truncate');
+ array_walk($words, '_search_index_truncate');
// Save last keyword result
$last = $text;
@@ -267,28 +355,13 @@ function search_keywords_split($text) {
}
/**
- * Helper function for array_walk in search_keywords_split.
+ * Helper function for array_walk in search_index_split.
*/
-function _search_keywords_truncate(&$text) {
+function _search_index_truncate(&$text) {
$text = truncate_utf8($text, 50);
}
/**
- * Loosens up a set of search keywords by adding wildcards, if possible.
- *
- * @param $text
- * The keywords as entered by the user.
- * @return
- * If more wildcards can be added, the adjusted keywords are returned.
- * If the query is already as loose as possible, NULL is returned.
- */
-function search_keywords_variation($text) {
- $text = trim($text);
- $new = preg_replace('/\*+/', '*', '*'. implode('* *', explode(' ', trim($text))) .'*');
- return ($new != $text) ? $new : NULL;
-}
-
-/**
* Invokes hook_search_preprocess() in modules.
*/
function search_preprocess(&$text) {
@@ -297,7 +370,6 @@ function search_preprocess(&$text) {
}
}
-
/**
* Update the full-text search index for a particular item.
*
@@ -313,23 +385,25 @@ function search_preprocess(&$text) {
* @ingroup search
*/
function search_index($sid, $type, $text) {
- $minimum_word_size = variable_get('minimum_word_size', 3);
+ $minimum_word_size = variable_get('minimum_word_size', 4);
+ // Link matching
global $base_url;
- $node_regexp = '!href=[\'"]?(?:'. preg_quote($base_url) .'/)?(?:\?q=)?([^\'">]+)[\'">]!i';
+ $node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/)?(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';
// Multipliers for scores of words inside certain HTML tags.
// Note: 'a' must be included for link ranking to work.
- $tags = array('h1' => 21,
+ $tags = array('h1' => 25,
'h2' => 18,
'h3' => 15,
'h4' => 12,
'h5' => 9,
'h6' => 6,
- 'u' => 5,
- 'b' => 5,
- 'strong' => 5,
- 'em' => 5,
+ 'u' => 3,
+ 'b' => 3,
+ 'i' => 3,
+ 'strong' => 3,
+ 'em' => 3,
'a' => 10);
// Strip off all ignored tags to speed up processing, but insert space before/after
@@ -345,24 +419,46 @@ function search_index($sid, $type, $text) {
$tag = false; // Odd/even counter. Tag or no tag.
$link = false; // State variable for link analyser
$score = 1; // Starting score per word
+ $accum = ' '; // Accumulator for cleaned up data
+ $tagstack = array(); // Stack with open tags
+ $tagwords = 0; // Counter for consecutive words
+ $focus = 1; // Focus state
- $results = array(0 => array());
+ $results = array(0 => array()); // Accumulator for words for index
foreach ($split as $value) {
if ($tag) {
// Increase or decrease score per word based on tag
list($tagname) = explode(' ', $value, 2);
$tagname = drupal_strtolower($tagname);
+ // Closing or opening tag?
if ($tagname{0} == '/') {
- $score -= $tags[substr($tagname, 1)];
- if ($score < 1) { // possible due to bad HTML
+ $tagname = substr($tagname, 1);
+ // If we encounter unexpected tags, reset score to avoid incorrect boosting.
+ if (!count($tagstack) || $tagstack[0] != $tagname) {
+ $tagstack = array();
$score = 1;
}
- if ($tagname == '/a') {
+ else {
+ // Remove from tag stack and decrement score
+ $score = max(1, $score - $tags[array_shift($tagstack)]);
+ }
+ if ($tagname == 'a') {
$link = false;
}
}
else {
+ if ($tagstack[0] == $tagname) {
+ // None of the tags we look for make sense when nested identically.
+ // If they are, it's probably broken HTML.
+ $tagstack = array();
+ $score = 1;
+ }
+ else {
+ // Add to open tag stack and increment score
+ array_unshift($tagstack, $tagname);
+ $score += $tags[$tagname];
+ }
if ($tagname == 'a') {
// Check if link points to a node on this site
if (preg_match($node_regexp, $value, $match)) {
@@ -370,32 +466,60 @@ function search_index($sid, $type, $text) {
if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
$linknid = $match[1];
if ($linknid > 0) {
- $link = true;
+ // Note: ignore links to uncachable nodes to avoid redirect bugs.
+ $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
+ if (filter_format_allowcache($node->format)) {
+ $link = true;
+ $linktitle = $node->title;
+ }
}
}
}
}
- $score += $tags[$tagname];
}
+ // A tag change occurred, reset counter.
+ $tagwords = 0;
}
else {
// Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
if ($value != '') {
- $words = search_keywords_split($value);
+ if ($link) {
+ // Check to see if the node link text is its URL. If so, we use the target node title instead.
+ if (preg_match('!^https?://!i', $value)) {
+ $value = $linktitle;
+ }
+ }
+ $words = search_index_split($value);
foreach ($words as $word) {
+ // Add word to accumulator
+ $accum .= $word .' ';
+ $num = is_numeric($word);
// Check wordlength
- if (drupal_strlen($word) >= $minimum_word_size) {
- $word = drupal_strtolower($word);
+ if ($num || drupal_strlen($word) >= $minimum_word_size) {
+ // Normalize numbers
+ if ($num) {
+ $word = (int)ltrim($word, '-0');
+ }
+
if ($link) {
if (!isset($results[$linknid])) {
$results[$linknid] = array();
}
- $results[$linknid][$word] += $score;
+ $results[$linknid][$word] += $score * $focus;
}
else {
- $results[0][$word] += $score;
+ $results[0][$word] += $score * $focus;
+ // Focus is a decaying value in terms of the amount of unique words up to this point.
+ // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
+ $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
}
}
+ $tagwords++;
+ // Too many words inside a single tag probably mean a tag was accidentally left open.
+ if (count($tagstack) && $tagwords >= 15) {
+ $tagstack = array();
+ $score = 1;
+ }
}
}
}
@@ -404,6 +528,9 @@ function search_index($sid, $type, $text) {
search_wipe($sid, $type);
+ // Insert cleaned up data into dataset
+ db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);
+
// Insert results into search index
foreach ($results[0] as $word => $score) {
db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %d)", $word, $sid, $type, $score);
@@ -421,20 +548,172 @@ function search_index($sid, $type, $text) {
}
/**
+ * Extract a module-specific search option from a search query. e.g. 'type:book'
+ *
+ * @param $keys
+ *   The search query string.
+ *
+ * @param $option
+ *   The option name to look for (e.g. 'type'). It is matched literally and
+ *   case-insensitively, and must be preceded by a space or the start of the
+ *   string.
+ *
+ * @return
+ *   The text between the colon and the next space (possibly an empty
+ *   string), or NULL if the option does not occur in the query.
+ */
+function search_query_extract($keys, $option) {
+  // Quote the option name so that regex metacharacters in it are matched
+  // literally instead of altering the pattern.
+  if (preg_match('/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i', $keys, $matches)) {
+    return $matches[2];
+  }
+  return NULL;
+}
+
+/**
+ * Return a query with the given module-specific search option inserted in.
+ * e.g. 'type:book'.
+ *
+ * Any existing occurrence of the option is removed first, so the returned
+ * query carries the option at most once.
+ *
+ * @param $keys
+ *   The search query string.
+ *
+ * @param $option
+ *   The option name (e.g. 'type'); matched literally and case-insensitively.
+ *
+ * @param $value
+ *   The value to set. When it compares equal to '' the option is only
+ *   removed and nothing is appended.
+ *
+ * @return
+ *   The modified query string.
+ */
+function search_query_insert($keys, $option, $value = '') {
+  // Compare against NULL rather than testing truthiness: an existing option
+  // whose value is '0' or empty must still be stripped, otherwise the option
+  // would end up in the query twice.
+  if (search_query_extract($keys, $option) !== NULL) {
+    $keys = trim(preg_replace('/(^| )'. preg_quote($option, '/') .':[^ ]*/i', '', $keys));
+  }
+  if ($value != '') {
+    $keys .= ' '. $option .':'. $value;
+  }
+  return $keys;
+}
+
+/**
+ * Parse a search query into SQL conditions.
+ *
+ * We build a query that matches the dataset bodies
+ *
+ * The query is tokenized on spaces. A token may be a double-quoted phrase,
+ * may be prefixed with '-' to make it a negative match, and the bare token
+ * OR (uppercase) joins the keyword before it with the keyword after it.
+ *
+ * @param $text
+ *   The search query string.
+ *
+ * @return
+ *   NULL if the query contains no tokens. Otherwise an array with, in order:
+ *   - the dataset conditions (d.data LIKE ... snippets joined with AND),
+ *   - the arguments for those conditions,
+ *   - the word-index conditions (i.word = '%s' snippets joined with OR),
+ *   - the arguments for the word-index conditions,
+ *   - the number of positive keywords/OR-groups in the query.
+ */
+function search_parse_query($text) {
+  $keys = array('positive' => array(), 'negative' => array());
+
+  // Tokenize query string
+  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);
+
+  if (count($matches) < 1) {
+    return NULL;
+  }
+
+  // Classify tokens
+  $or = false;
+  foreach ($matches as $match) {
+    // Strip off quotes
+    if ($match[2]{0} == '"') {
+      $match[2] = substr($match[2], 1, -1);
+    }
+    // Detect the OR operator before simplifying: search_simplify() lowercases
+    // its input, so afterwards the token could never equal 'OR'.
+    $is_or = ($match[2] == 'OR');
+    // Simplify keyword according to indexing rules
+    $match[2] = search_simplify($match[2]);
+    // Negative matches
+    if ($match[1] == '-') {
+      $keys['negative'][] = $match[2];
+    }
+    // OR operator: instead of a single keyword, we store an array of all
+    // OR'd keywords.
+    elseif ($is_or && count($keys['positive'])) {
+      $previous = array_pop($keys['positive']);
+      // Only start a new group if the previous entry is not already one, so
+      // that 'a OR b OR c' extends the same group instead of nesting arrays.
+      if (!is_array($previous)) {
+        $previous = array($previous);
+      }
+      $keys['positive'][] = $previous;
+      $or = true;
+      continue;
+    }
+    // Plain keyword
+    else {
+      if ($or) {
+        $keys['positive'][count($keys['positive']) - 1][] = $match[2];
+      }
+      else {
+        $keys['positive'][] = $match[2];
+      }
+    }
+    $or = false;
+  }
+
+  // Convert keywords into SQL statements.
+  $scorewords = array();
+  $query = array();
+  $query2 = array();
+  $arguments = array();
+  $arguments2 = array();
+  $matches = 0; // Counts the minimal number of words per item that must match in the index.
+  // Positive matches
+  foreach ($keys['positive'] as $key) {
+    // Group of ORed terms
+    if (is_array($key) && count($key)) {
+      $queryor = array();
+      foreach ($key as $term) {
+        $q = _search_parse_query($term, $scorewords);
+        if ($q) {
+          $queryor[] = $q;
+          $arguments[] = $term;
+        }
+      }
+      if (count($queryor)) {
+        $query[] = '('. implode(' OR ', $queryor) .')';
+      }
+    }
+    // Single ANDed term
+    else {
+      $q = _search_parse_query($key, $scorewords);
+      if ($q) {
+        $query[] = $q;
+        $arguments[] = $key;
+      }
+    }
+    $matches++;
+  }
+  // Negative matches
+  foreach ($keys['negative'] as $key) {
+    $q = _search_parse_query($key, $scorewords, true);
+    if ($q) {
+      $query[] = $q;
+      $arguments[] = $key;
+    }
+  }
+  // We separate word-index conditions because they are not needed in the
+  // counting query.
+  foreach ($scorewords as $word) {
+    $query2[] = "i.word = '%s'";
+    $arguments2[] = $word;
+  }
+  $query = implode(' AND ', $query);
+  $query2 = implode(' OR ', $query2);
+  return array($query, $arguments, $query2, $arguments2, $matches);
+}
+
+/**
+ * Helper function for search_parse_query();
+ *
+ * Collects the index words ("scorewords") of a keyword or phrase and returns
+ * the SQL condition snippet that matches it against the dataset.
+ *
+ * @param $word
+ *   A simplified keyword or phrase; a phrase consists of words separated by
+ *   single spaces. Taken by reference but not modified here.
+ *
+ * @param $scores
+ *   Array to which the scorewords are appended (by reference). Nothing is
+ *   appended for negative matches.
+ *
+ * @param $not
+ *   TRUE to build a negative (NOT LIKE) condition.
+ *
+ * @return
+ *   A condition snippet on d.data containing one %s placeholder for the
+ *   keyword or phrase.
+ */
+function _search_parse_query(&$word, &$scores, $not = false) {
+  // Determine the scorewords of this word/phrase
+  if (!$not) {
+    $minimum_word_size = variable_get('minimum_word_size', 4);
+    $split = explode(' ', $word);
+    foreach ($split as $s) {
+      $num = is_numeric($s);
+      if ($num || drupal_strlen($s) >= $minimum_word_size) {
+        // Normalize the individual token, not the whole phrase: casting the
+        // complete phrase would yield a wrong number (e.g. 0) for any
+        // numeric word that is not at the very start of the phrase.
+        $scores[] = $num ? ((int)ltrim($s, '-0')) : $s;
+      }
+    }
+  }
+  // Return matching snippet. The doubled percent signs are presumably
+  // escapes for the printf-style substitution of the database layer.
+  return "d.data ". ($not ? 'NOT ' : '') ."LIKE '%% %s %%'";
+}
+
+/**
* Do a query on the full-text search index for a word or words.
*
* This function is normally only called by each module that support the
* indexed search (and thus, implements hook_update_index()).
*
- * The final query is an SQL select on the search_index table. As a guide for
- * writing the optional extra SQL fragments (see below), use this query:
+ * Two queries are performed which can be extended by the caller.
+ *
+ * The first query selects a set of possible matches based on the search index
+ * and any extra given restrictions. This is the classic "OR" search.
*
- * SELECT i.type, i.sid, i.word, SUM(i.score/t.count) AS score
+ * SELECT i.type, i.sid, SUM(i.score*t.count) AS relevance
* FROM {search_index} i
- * $join INNER JOIN {search_total} t ON i.word = t.word
- * WHERE $where AND (i.word = '...' OR ...)
+ * INNER JOIN {search_total} t ON i.word = t.word
+ * $join1
+ * WHERE $where1 AND (...)
* GROUP BY i.type, i.sid
- * ORDER BY score DESC";
+ *
+ * The second query further refines this set by verifying advanced text
+ * conditions (such as AND, negative or phrase matches), and orders the results
+ * on the column or expression 'score':
+ *
+ * SELECT i.type, i.sid, $select2
+ * FROM temp_search_sids i
+ * INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type
+ * $join2
+ * WHERE (...)
+ * ORDER BY score DESC
*
* @param $keywords
* A search string as entered by the user.
@@ -442,85 +721,69 @@ function search_index($sid, $type, $text) {
* @param $type
* A string identifying the calling module.
*
- * @param $join
- * (optional) A string to be inserted into the JOIN part of the SQL query.
+ * @param $join1
+ * (optional) Inserted into the JOIN part of the first SQL query.
* For example "INNER JOIN {node} n ON n.nid = i.sid".
*
- * @param $where
- * (optional) A string to be inserted into the WHERE part of the SQL query.
- * For example "(n.status > 0)".
+ * @param $where1
+ * (optional) Inserted into the WHERE part of the first SQL query.
+ * For example "(n.status > %d)".
+ *
+ * @param $arguments1
+ * (optional) Extra SQL arguments belonging to the first query.
+ *
+ * @param $select2
+ * (optional) Inserted into the SELECT part of the second query. Must contain
+ * a column selected as 'score'.
+ * defaults to 'i.relevance AS score'
*
- * @param $variation
- * Used internally. Must not be specified.
+ * @param $join2
+ * (optional) Inserted into the JOIN part of the second SQL query.
+ * For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
+ *
+ * @param $arguments2
+ * (optional) Extra SQL arguments belonging to the second query parameter.
*
* @return
* An array of SIDs for the search results.
*
* @ingroup search
*/
-function do_search($keywords, $type, $join = '', $where = '1', $variation = true) {
- // Note, we replace the wildcards with U+FFFD (Replacement character) to pass
- // through the keyword extractor. Multiple wildcards are collapsed into one.
- $keys = preg_replace('!\*+!', '�', $keywords);
-
- // Split into words
- $keys = search_keywords_split($keys);
+function do_search($keywords, $type, $join1 = '', $where1 = '1', $arguments1 = array(), $select2 = 'i.relevance AS score', $join2 = '', $arguments2 = array()) {
+ $query = search_parse_query($keywords);
- $words = array();
- $arguments = array();
- $refused = array();
- // Build WHERE clause
- foreach ($keys as $word) {
- if (drupal_strlen($word) < variable_get('remove_short', 3)) {
- if ($word != '') {
- $refused[] = str_replace('�', '*', $word);
- }
- continue;
- }
- if (strpos($word, '�') !== false) {
- $words[] = "i.word LIKE '%s'";
- $arguments[] = str_replace('�', '%', drupal_strtolower($word));
- }
- else {
- $words[] = "i.word = '%s'";
- $arguments[] = drupal_strtolower($word);
- }
- }
- // Tell the user which words were excluded
- if (count($refused) && $variation) {
- $message = format_plural(count($refused),
- 'The word %words was not included because it is too short.',
- 'The words %words were not included because they were too short.');
- drupal_set_message(strtr($message, array('%words' => theme('placeholder', implode(', ', $refused)))));
+ if ($query === NULL || $query[0] == '' || $query[2] == '') {
+ return array();
}
- if (count($words) == 0) {
+ // First pass: select all possible matching sids, doing a simple index-based OR matching on the keywords.
+ // 'matches' is used to reject those items that cannot possibly match the query.
+ $conditions = $where1 .' AND ('. $query[2] .") AND i.type = '%s'";
+ $arguments = array_merge($arguments1, $query[3], array($type, $query[4]));
+ $result = db_query_temporary("SELECT i.type, i.sid, SUM(i.score * t.count) AS relevance, COUNT(*) AS matches FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join1 WHERE $conditions GROUP BY i.type, i.sid HAVING matches >= %d", $arguments, 'temp_search_sids');
+
+ // Calculate maximum relevance, to normalize it
+ $normalize = db_result(db_query('SELECT MAX(relevance) FROM temp_search_sids'));
+ if (!$normalize) {
return array();
}
- $conditions = $where .' AND ('. implode(' OR ', $words) .')';
+ $select2 = str_replace('i.relevance', '('. (1.0 / $normalize) .' * i.relevance)', $select2);
- // Get result count (for pager)
- $count = db_num_rows(db_query("SELECT DISTINCT i.sid, i.type FROM {search_index} i $join WHERE $conditions", $arguments));
- if ($count == 0) {
- // Try out a looser search query if nothing was found.
- if ($variation && $loose = search_keywords_variation($keywords)) {
- return do_search($loose, $type, $join, $where, false);
- }
- else {
- return array();
- }
+ // Second pass: only keep items that match the complicated keywords conditions (phrase search, negative keywords, ...)
+ $conditions = '('. $query[0] .')';
+ $arguments = array_merge($arguments2, $query[1]);
+ $result = db_query_temporary("SELECT i.type, i.sid, $select2 FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type $join2 WHERE $conditions ORDER BY score DESC", $arguments, 'temp_search_results');
+ if (($count = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'))) == 0) {
+ return array();
}
$count_query = "SELECT $count";
- // Do pager query
- $query = "SELECT i.type, i.sid, SUM(i.score/t.count) AS score FROM {search_index} i $join INNER JOIN {search_total} t ON i.word = t.word WHERE $conditions GROUP BY i.type, i.sid ORDER BY score DESC";
- $result = pager_query($query, 15, 0, $count_query, $arguments);
-
+ // Do actual search query
+ $result = pager_query("SELECT * FROM temp_search_results", 10, 0, $count_query, $arguments);
$results = array();
while ($item = db_fetch_object($result)) {
$results[] = $item->sid;
}
-
return $results;
}
@@ -543,11 +806,12 @@ function search_view() {
// Search form submits with POST but redirects to GET. This way we can keep
// the search query URL clean as a whistle:
// search/type/keyword+keyword
- if ($_POST['edit']['keys']) {
+ if (isset($_POST['op'])) {
if ($type == '') {
$type = 'node';
}
- drupal_goto('search/'. urlencode($type) .'/'. urlencode($_POST['edit']['keys']));
+ $keys = module_invoke($type, 'search', 'post', $_POST['edit']['keys']);
+ drupal_goto('search/'. urlencode($type) .'/'. urlencode(is_null($keys) ? $_POST['edit']['keys'] : $keys));
}
else if ($type == '') {
// Note: search/node can not be a default tab because it would take on the
@@ -647,21 +911,17 @@ function search_form($action = '', $keys = '', $type = null, $prompt = null) {
$prompt = t('Enter your keywords');
}
+ $form = array();
$form['#action'] = $action;
- $form['prompt'] = array('#type' => 'item', '#title' => $prompt);
- $form['keys'] = array('#type' => 'textfield', '#title' => '', '#default_value' => $keys, '#size' => $prompt ? 40 : 30, '#maxlength' => 255);
- $form['submit'] = array('#type' => 'submit', '#value' => t('Search'));
$form['#attributes'] = array('class' => 'search-form');
+ $form['basic'] = array('#type' => 'item', '#title' => $prompt);
+ $form['basic']['inline'] = array('#type' => 'markup', '#prefix' => '<div class="container-inline">', '#suffix' => '</div>');
+ $form['basic']['inline']['keys'] = array('#type' => 'textfield', '#title' => '', '#default_value' => $keys, '#size' => $prompt ? 40 : 30, '#maxlength' => 255);
+ $form['basic']['inline']['submit'] = array('#type' => 'submit', '#value' => t('Search'));
- return drupal_get_form('search_form', $form);
-}
+ $form = array_merge($form, module_invoke($type, 'search', 'form', $keys));
-function theme_search_form($form) {
- $output = form_render($form['prompt']);
- $output .= '<div class="container-inline">';
- $output .= form_render($form);
- $output .= '</div>';
- return $output;
+ return drupal_get_form('search_form', $form);
}
/**
@@ -692,8 +952,7 @@ function search_data($keys = NULL, $type = 'node') {
* Used for formatting search results.
*
* @param $keys
- * A string containing keywords. They are split into words using the same
- * rules as search indexing.
+ * A string containing a search query.
*
* @param $text
* The text to extract fragments from.
@@ -702,7 +961,11 @@ function search_data($keys = NULL, $type = 'node') {
* A string containing HTML for the excerpt.
*/
function search_excerpt($keys, $text) {
- $keys = search_keywords_split($keys);
+ // Extract positive keywords and phrases
+ preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' '. $keys, $matches);
+ $keys = array_merge($matches[2], $matches[3]);
+
+ // Prepare text
$text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
array_walk($keys, '_search_excerpt_replace');
$workkeys = $keys;
@@ -718,6 +981,7 @@ function search_excerpt($keys, $text) {
foreach ($workkeys as $k => $key) {
if (strlen($key) == 0) {
unset($workkeys[$k]);
+ unset($keys[$k]);
continue;
}
if ($length >= 256) {