summaryrefslogtreecommitdiff
path: root/inc/indexer.php
diff options
context:
space:
mode:
authorAndreas Gohr <andi@splitbrain.org>2005-10-09 14:48:33 +0200
committerAndreas Gohr <andi@splitbrain.org>2005-10-09 14:48:33 +0200
commit91bb5faaff4ff41771606c58f608afd76263b8c7 (patch)
tree2182f0f0147053a05880d40beaea92eb8d5e7fcc /inc/indexer.php
parent037f7611417e32cbedc686d949b2ed6f25653868 (diff)
downloadrpg-91bb5faaff4ff41771606c58f608afd76263b8c7.tar.gz
rpg-91bb5faaff4ff41771606c58f608afd76263b8c7.tar.bz2
ignore regexp failures when handling asian chars
The new handling of Asian chars as single words needs a recent PCRE library (PHP 4.3.10 is known to work). If this support isn't available, the regexp compilation will fail. This patch adds a workaround: the failure is ignored, which means the search will not work as expected with Asian words on older PHP versions. darcs-hash:20051009124833-7ad00-1319829be5cb73246e13eb65e4c950d43c6ce5bf.gz
Diffstat (limited to 'inc/indexer.php')
-rw-r--r--inc/indexer.php22
1 files changed, 12 insertions, 10 deletions
diff --git a/inc/indexer.php b/inc/indexer.php
index a8511b1ee..22bd8566b 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -17,12 +17,12 @@
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
-define(IDX_ASIAN,'['.
- '\x{0E00}-\x{0E7F}'. // Thai
- '\x{2E80}-\x{D7AF}'. // CJK -> Hangul
- '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs
- '\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms
- ']');
+define('IDX_ASIAN','['.
+ '\x{0E00}-\x{0E7F}'. // Thai
+ '\x{2E80}-\x{D7AF}'. // CJK -> Hangul
+ '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs
+ '\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms
+ ']');
/**
@@ -52,8 +52,9 @@ function idx_getPageWords($page){
foreach ($tokens as $word => $count) {
// simple filter to restrict use of utf8_stripspecials
if (preg_match('/[^0-9A-Za-z]/u', $word)) {
- // handle asian chars as single words
- $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
+ // handle asian chars as single words (may fail on older PHP version)
+ $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
+ if(!is_null($asia)) $word = $asia; //recover from regexp failure
$arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
$arr = array_count_values($arr);
@@ -326,8 +327,9 @@ function idx_tokenizer($string,&$stopwords){
$words = array();
if(preg_match('/[^0-9A-Za-z]/u', $string)){
- #handle asian chars as single words
- $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+ // handle asian chars as single words (may fail on older PHP version)
+ $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+ if(!is_null($asia)) $string = $asia; //recover from regexp failure
$arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
foreach ($arr as $w) {