diff options
author | Andreas Gohr <andi@splitbrain.org> | 2005-10-09 14:48:33 +0200 |
---|---|---|
committer | Andreas Gohr <andi@splitbrain.org> | 2005-10-09 14:48:33 +0200 |
commit | 91bb5faaff4ff41771606c58f608afd76263b8c7 (patch) | |
tree | 2182f0f0147053a05880d40beaea92eb8d5e7fcc /inc | |
parent | 037f7611417e32cbedc686d949b2ed6f25653868 (diff) | |
download | rpg-91bb5faaff4ff41771606c58f608afd76263b8c7.tar.gz rpg-91bb5faaff4ff41771606c58f608afd76263b8c7.tar.bz2 |
ignore regexp failures when handling asian chars
The new handling of Asian chars as single words needs a recent PCRE library
(PHP 4.3.10 is known to work). If this support isn't available the regexp
compilation will fail. This patch adds a workaround that ignores the failure,
so the search will not work as expected with Asian words on older PHP versions.
darcs-hash:20051009124833-7ad00-1319829be5cb73246e13eb65e4c950d43c6ce5bf.gz
Diffstat (limited to 'inc')
-rw-r--r-- | inc/fulltext.php | 2 | ||||
-rw-r--r-- | inc/indexer.php | 22 |
2 files changed, 13 insertions, 11 deletions
diff --git a/inc/fulltext.php b/inc/fulltext.php index 89fa5b259..4d4b8138c 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -265,7 +265,7 @@ function ft_queryParser($query){ if(count($token)) $q['not'] = array_merge($q['not'],$token); }else{ // asian "words" need to be searched as phrases - if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){ + if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){ $q['phrases'] = array_merge($q['phrases'],$matches[1]); } diff --git a/inc/indexer.php b/inc/indexer.php index a8511b1ee..22bd8566b 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -17,12 +17,12 @@ // Ranges taken from http://en.wikipedia.org/wiki/Unicode_block // I'm no language expert. If you think some ranges are wrongly chosen or // a range is missing, please contact me -define(IDX_ASIAN,'['. - '\x{0E00}-\x{0E7F}'. // Thai - '\x{2E80}-\x{D7AF}'. // CJK -> Hangul - '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs - '\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms - ']'); +define('IDX_ASIAN','['. + '\x{0E00}-\x{0E7F}'. // Thai + '\x{2E80}-\x{D7AF}'. // CJK -> Hangul + '\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs + '\x{FE30}-\x{FE4F}'. 
// CJK Compatibility Forms + ']'); /** @@ -52,8 +52,9 @@ function idx_getPageWords($page){ foreach ($tokens as $word => $count) { // simple filter to restrict use of utf8_stripspecials if (preg_match('/[^0-9A-Za-z]/u', $word)) { - // handle asian chars as single words - $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word); + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word); + if(!is_null($asia)) $word = $asia; //recover from regexp failure $arr = explode(' ', utf8_stripspecials($word,' ','._\-:')); $arr = array_count_values($arr); @@ -326,8 +327,9 @@ function idx_tokenizer($string,&$stopwords){ $words = array(); if(preg_match('/[^0-9A-Za-z]/u', $string)){ - #handle asian chars as single words - $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string); + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string); + if(!is_null($asia)) $string = $asia; //recover from regexp failure $arr = explode(' ', utf8_stripspecials($string,' ','._\-:')); foreach ($arr as $w) { |