diff options
author | chris <chris@teacherscpd.co.uk> | 2005-08-16 05:24:08 +0200 |
---|---|---|
committer | chris <chris@teacherscpd.co.uk> | 2005-08-16 05:24:08 +0200 |
commit | bc54ab520f404e26a95af051e9082aa8fad07d98 (patch) | |
tree | 9e367ae12dc8416906d345d4208c154801d5192d /inc/indexer.php | |
parent | a99d323627999434881d8f05a493ebdbfaa4eadf (diff) | |
download | rpg-bc54ab520f404e26a95af051e9082aa8fad07d98.tar.gz rpg-bc54ab520f404e26a95af051e9082aa8fad07d98.tar.bz2 |
indexer improvements & fix for underscores
darcs-hash:20050816032408-50fdc-6e41585c9b97d70a218877b8ad169df9117d9965.gz
Diffstat (limited to 'inc/indexer.php')
-rw-r--r-- | inc/indexer.php | 5 |
1 file changed, 3 insertions, 2 deletions
```diff
diff --git a/inc/indexer.php b/inc/indexer.php
index c5faa5756..7ca870526 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -37,19 +37,20 @@ function idx_getPageWords($page){
 
     $words = array();
     foreach ($tokens as $word => $count) {
-        $word = utf8_strtolower($word);
 
         // simple filter to restrict use of utf8_stripspecials
-        if (preg_match('/\W/', $word)) {
+        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
             $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
             $arr = array_count_values($arr);
 
             foreach ($arr as $w => $c) {
                 if (!is_numeric($w) && strlen($w) < 3) continue;
+                $w = utf8_strtolower($w);
                 $words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
             }
         } else {
             if (!is_numeric($w) && strlen($w) < 3) continue;
+            $word = strtolower($word);
             $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
         }
     }
```