diff options
author | Andreas Gohr <andi@splitbrain.org> | 2006-08-26 10:29:19 +0200 |
---|---|---|
committer | Andreas Gohr <andi@splitbrain.org> | 2006-08-26 10:29:19 +0200 |
commit | 0eac1afbfcac819df65253478138374667f9b8df (patch) | |
tree | 9b7e64f8d57e51f4984d7eb87a04b602ef13a479 | |
parent | fd49f8df8d347af8d355ec7b8cb2261a17e2ae5c (diff) | |
download | rpg-0eac1afbfcac819df65253478138374667f9b8df.tar.gz rpg-0eac1afbfcac819df65253478138374667f9b8df.tar.bz2 |
code to remove bad UTF-8 bytes added
This adds code to remove or replace invalid UTF-8 bytes and uses it
in the ft_snippets function.
darcs-hash:20060826082919-7ad00-a94004de159ae93ff5b7270fd3e631ff467233cd.gz
-rw-r--r-- | inc/fulltext.php | 2 | ||||
-rw-r--r-- | inc/utf8.php | 39 |
2 files changed, 40 insertions, 1 deletions
/**
 * Remove or replace bytes that are not part of a well-formed UTF-8 sequence
 *
 * The string is scanned from left to right. Every well-formed UTF-8
 * character is copied to the result unchanged; every byte that does not
 * belong to a well-formed character is substituted by $replace. With the
 * default (empty) replacement the bad bytes are simply dropped.
 *
 * An ASCII character is recommended as replacement char.
 *
 * The PCRE pattern used to locate the bad bytes comes from the W3 FAQ
 * "Multilingual Forms". Note: modified to include the full ASCII range
 * including control chars.
 *
 * Called by the fulltext snippet code (inc/fulltext.php) on the snippet it
 * returns.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string each bad byte is replaced with (defaults to
 *                         '', i.e. bad bytes are removed) - use ASCII
 * @return string the input with all bad bytes replaced
 */
function utf8_bad_replace($str, $replace = '') {
    // Group 1 wraps the whole alternation, group 2 only takes part in the
    // match when the fallback branch (a single invalid byte) was used.
    $UTF8_BAD =
        '([\x00-\x7F]'.                         // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.              // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.          // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.          // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.           // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       // plane 16
        '|(.{1}))';                             // invalid byte

    // \G pins every match to the current offset, so the string can be walked
    // in place instead of re-copying the remainder after each character.
    $pattern = '/\G'.$UTF8_BAD.'/S';

    $str    = (string) $str;
    $len    = strlen($str);
    $pos    = 0;
    $result = '';

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        if (!isset($matches[2])) {
            // well-formed character: keep it as is
            $result .= $matches[0];
        } else {
            // stray byte: substitute it
            $result .= $replace;
        }
        // advance by the number of bytes consumed (1 to 4)
        $pos += strlen($matches[0]);
    }

    return $result;
}