summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--inc/indexer.php5
-rw-r--r--inc/lang/en/stopwords.txt1
2 files changed, 4 insertions, 2 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index c5faa5756..7ca870526 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -37,19 +37,20 @@ function idx_getPageWords($page){
$words = array();
foreach ($tokens as $word => $count) {
- $word = utf8_strtolower($word);
// simple filter to restrict use of utf8_stripspecials
- if (preg_match('/\W/', $word)) {
+ if (preg_match('/[^0-9A-Za-z]/u', $word)) {
$arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
$arr = array_count_values($arr);
foreach ($arr as $w => $c) {
if (!is_numeric($w) && strlen($w) < 3) continue;
+ $w = utf8_strtolower($w);
$words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
}
} else {
if (!is_numeric($w) && strlen($w) < 3) continue;
+ $word = strtolower($word);
$words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
}
}
diff --git a/inc/lang/en/stopwords.txt b/inc/lang/en/stopwords.txt
index 478fb33ef..bc6eb48ae 100644
--- a/inc/lang/en/stopwords.txt
+++ b/inc/lang/en/stopwords.txt
@@ -12,6 +12,7 @@ their
com
for
from
+into
how
that
the