summaryrefslogtreecommitdiff
path: root/inc/utf8.php
diff options
context:
space:
mode:
authorAndreas Gohr <andi@splitbrain.org>2006-08-26 10:29:19 +0200
committerAndreas Gohr <andi@splitbrain.org>2006-08-26 10:29:19 +0200
commit0eac1afbfcac819df65253478138374667f9b8df (patch)
tree9b7e64f8d57e51f4984d7eb87a04b602ef13a479 /inc/utf8.php
parentfd49f8df8d347af8d355ec7b8cb2261a17e2ae5c (diff)
downloadrpg-0eac1afbfcac819df65253478138374667f9b8df.tar.gz
rpg-0eac1afbfcac819df65253478138374667f9b8df.tar.bz2
code to remove bad UTF-8 bytes added
This adds code to remove or replace invalid UTF-8 bytes and uses it in the ft_snippets function. darcs-hash:20060826082919-7ad00-a94004de159ae93ff5b7270fd3e631ff467233cd.gz
Diffstat (limited to 'inc/utf8.php')
-rw-r--r--inc/utf8.php39
1 files changed, 39 insertions, 0 deletions
diff --git a/inc/utf8.php b/inc/utf8.php
index ee8b4fc13..16722ab2e 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -722,6 +722,45 @@ function utf16be_to_utf8(&$str) {
return unicode_to_utf8($uni);
}
+/**
+ * Replace bad bytes with an alternative character
+ *
+ * Walks the string one UTF-8 sequence at a time: well-formed sequences are
+ * copied through unchanged, any other byte is swapped for $replace.
+ * The PCRE pattern comes from the W3 FAQ: Multilingual Forms, modified to
+ * include the full ASCII range including control chars.
+ *
+ * @author Harry Fuecks <hfuecks@gmail.com>
+ * @see http://www.w3.org/International/questions/qa-forms-utf-8
+ * @param string $str     string to search
+ * @param string $replace replacement for each bad byte (defaults to '', i.e.
+ *                        bad bytes are removed) - an ASCII char is recommended
+ * @return string the input with every invalid byte replaced
+ */
+function utf8_bad_replace($str, $replace = '') {
+    $UTF8_BAD =
+     '([\x00-\x7F]'. # ASCII (including control chars)
+     '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
+     '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
+     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
+     '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
+     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
+     '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
+     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
+     '|(.{1}))'; # invalid byte
+    // 'A' anchors each match at $pos; matching by offset avoids re-copying the
+    // remainder of the string on every step and needs no output buffering.
+    $pattern = '/'.$UTF8_BAD.'/AS';
+    $len = strlen($str);
+    $pos = 0;
+    $result = '';
+    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
+        // group 2 is only set when the catch-all "invalid byte" branch matched
+        $result .= isset($matches[2]) ? $replace : $matches[0];
+        $pos += strlen($matches[0]);
+    }
+    return $result;
+}
// only needed if no mb_string available
if(!UTF8_MBSTRING){