From 4b9792c696658fe0cbedc187198fa463b6ff83fc Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Sun, 14 Nov 2010 14:22:08 -0500 Subject: Measure length of multi-character Asian words --- inc/indexer.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'inc') diff --git a/inc/indexer.php b/inc/indexer.php index 7a8bb3ff8..d9eccac76 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -52,8 +52,10 @@ function wordlen($w){ $l = strlen($w); // If left alone, all chinese "words" will get put into w3.idx // So the "length" of a "word" is faked - if(preg_match('/'.IDX_ASIAN2.'/u',$w)) - $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF + if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) { + foreach($leadbytes[0] as $b) + $l += ord($b) - 0xE1; + } return $l; } -- cgit v1.2.3