From d5b233026fa2930e4c6ee362ebdfd3586b2f3fb8 Mon Sep 17 00:00:00 2001 From: Tom N Harris Date: Fri, 17 Nov 2006 13:30:32 +0100 Subject: Indexer asian language fixes and speed-ups Make Chinese and Japanese work better with the new indexer. Some missing punctuation added to utf8_stripspecials. Misc. other changes to make indexing faster. The indexes will expire on backend upgrades, so you don't have to delete *.indexed darcs-hash:20061117123032-6942e-774b38e08234928c49b37e40addba375acf67ac0.gz --- inc/utf8.php | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'inc/utf8.php') diff --git a/inc/utf8.php b/inc/utf8.php index e32b64b17..8da981fd9 100644 --- a/inc/utf8.php +++ b/inc/utf8.php @@ -1027,11 +1027,20 @@ $UTF8_SPECIAL_CHARS = array( 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, - 0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, + 0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, + 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017, + 0x3018, 0x3019, 0x301a, 0x301b, 0x3036, + 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, + 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, + 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, + 0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, + 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, + 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, + 0xffeb, 0xffec, 0xffed, 0xffee, ); // utf8 version of above data @@ -1051,8 +1060,12 @@ $UTF8_SPECIAL_CHARS2 = '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'. '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'. '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'. - '➷➸➹➺➻➼➽➾�'. - '�ﹼﹽ'; + '➷➸➹➺➻➼➽➾'. + ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'. + '�'. + '�ﹼﹽ'. + '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'. + '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'; /** * Romanization lookup table -- cgit v1.2.3