diff options
-rw-r--r-- | conf/dokuwiki.php | 1 | ||||
-rw-r--r-- | inc/SafeFN.class.php | 210 | ||||
-rw-r--r-- | inc/load.php | 1 | ||||
-rw-r--r-- | inc/pageutils.php | 50 | ||||
-rw-r--r-- | inc/utf8.php | 39 | ||||
-rw-r--r-- | lib/plugins/config/lang/en/lang.php | 1 | ||||
-rw-r--r-- | lib/plugins/config/settings/config.class.php | 2 | ||||
-rw-r--r-- | lib/plugins/config/settings/config.metadata.php | 1 |
8 files changed, 265 insertions, 40 deletions
// Companion changes carried by the same patch (kept here for reference):
//   conf/dokuwiki.php : $conf['fnencode'] = 'url'; //encode filenames (url|safe|utf-8)
//   inc/load.php      : 'SafeFN' => DOKU_INC.'inc/SafeFN.class.php',  (entry in load_autoload())

/**
 * Class to safely store UTF-8 in a filename
 *
 * Encodes a UTF-8 string using only the characters 0-9 a-z / _ - . %
 *
 * - The characters listed in $plain are kept as they are ("plain").
 * - Every other character is written as a "converted" substring that
 *   starts with the pre_indicator '%' followed by a base36 number.
 * - The transition from a converted substring back to plain characters
 *   is marked with the post_indicator '.'
 *
 * All methods are static; the class keeps no per-instance state.
 *
 * @author Christopher Smith
 * @date   2010-04-02
 */
class SafeFN {

    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';
    private static $post_indicator = '.'; // this character can be included in "plain" set
    private static $adjustments = array(); // must be initialized, use getAdjustments()

    /**
     * Convert an UTF-8 string to a safe ASCII string
     *
     * Conversion process, per codepoint:
     *   - if the codepoint is a plain character,
     *     - if the previous character was "converted", append the
     *       post_indicator to the output
     *     - append the ASCII byte for the character (continue with the
     *       next character)
     *   - if the codepoint is the pre_indicator itself, append it as is
     *   - otherwise
     *     - reduce the codepoint value to fill the holes left by "plain"
     *     - choose the marker character by taking the modulus
     *       (number of possible pre_indicators) of the modified codepoint
     *     - calculate the value for base36 conversion by integer division
     *       (number of possible pre_indicators) of the modified codepoint
     *     - append the marker character followed by the base36 string
     *
     * Codepoints below 0x20 are not representable (the adjustment in
     * adjustForPlain() would become negative); validate_printable_utf8()
     * can be used to detect them beforehand.
     *
     * @param  string $utf8 UTF-8 encoded input
     * @return string       encoded string
     */
    public static function encode($utf8) {
        return self::unicode_safe(self::utf8_unicode($utf8));
    }

    /**
     * Convert a string produced by encode() back to UTF-8
     *
     * Decoding process:
     *   - lowercase the input, then split it into substrings in front of
     *     every pre_indicator and post_indicator character
     *   - check the first character of each substring
     *     - if it is not a pre_indicator, drop the leading post_indicator
     *       when the previous substring was "converted", then append the
     *       codepoint of every remaining character to the output
     *     - if it is a lone pre_indicator, append its own codepoint
     *     - otherwise get the position of the pre_indicator in the
     *       pre_indicator string (order is important), convert the
     *       remainder from base36 to an int, multiply it by the number
     *       of pre_indicator characters and add the position
     *     - reverse the adjustment for the holes left by "plain"
     *       characters and append the resulting codepoint
     *
     * @param  string $safe encoded string
     * @return string       UTF-8 string
     */
    public static function decode($safe) {
        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
    }

    /**
     * Check that a string contains no ASCII control characters
     *
     * Rejects the whole range 0x00-0x1f; these codepoints lie below the
     * first character (SPACE) that the encoding can represent.
     *
     * @param  string $printable_utf8
     * @return bool   true when no control character was found
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('/[\x00-\x1f]/',$printable_utf8);
    }

    /**
     * Check that a string consists only of characters encode() can emit
     *
     * Implemented as a byte whitelist instead of a regular expression:
     * the allowed set contains '/', '-' and '.', which cannot be pasted
     * unescaped into a character class.
     *
     * @param  string $safe
     * @return bool   true when every byte is plain, '.' or '%'
     */
    public static function validate_safe($safe) {
        $safe    = (string) $safe;
        $allowed = self::$plain.self::$post_indicator.self::$pre_indicator;
        return strspn($safe,$allowed) === strlen($safe);
    }

    /**
     * Thin wrapper around the global utf8_to_unicode()
     */
    private static function utf8_unicode($utf8) {
        return utf8_to_unicode($utf8);
    }

    /**
     * Thin wrapper around the global unicode_to_utf8()
     */
    private static function unicode_utf8($unicode) {
        return unicode_to_utf8($unicode);
    }

    /**
     * Turn an array of codepoints into the encoded string
     *
     * @param  array $unicode list of integer codepoints
     * @return string
     */
    private static function unicode_safe($unicode) {

        $safe = '';
        $converted = false;
        $markers = strlen(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if (self::isPlain($codepoint)) {
                // leaving a converted run: mark the transition
                if ($converted) {
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if (self::isPreIndicator($codepoint)) {
                // a literal pre_indicator is emitted on its own
                $converted = true;
                $safe .= chr($codepoint);

            } else {
                $converted = true;
                $adjusted = self::adjustForPlain($codepoint);

                $marker = $adjusted % $markers;
                $base   = (int) ($adjusted / $markers);

                $safe .= substr(self::$pre_indicator,$marker,1);
                $safe .= base_convert((string)$base,10,36);
            }
        }
        return $safe;
    }

    /**
     * Turn an encoded (already lowercased) string into an array of codepoints
     *
     * @param  string $safe
     * @return array  list of integer codepoints
     */
    private static function safe_unicode($safe) {
        $unicode = array();
        // split in front of every indicator; quote them so the pattern
        // stays valid whatever characters are configured
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator,'/');
        $split = preg_split('/(?=['.$indicators.'])/',$safe,-1,PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            if (($marker = strpos(self::$pre_indicator,$sub[0])) === false) {
                if ($converted) {
                    // strip post_indicator
                    $sub = (string) substr($sub,1);
                    $converted = false;
                }
                $length = strlen($sub);
                for ($i=0; $i < $length; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
            } else if (strlen($sub)==1) {
                // a lone pre_indicator stands for itself
                $converted = true;
                $unicode[] = ord($sub);
            } else {
                // a single codepoint in our base
                $converted = true;
                $base = (int)base_convert(substr($sub,1),36,10);
                $adjusted = ($base*strlen(self::$pre_indicator)) + $marker;

                $unicode[] = self::reverseForPlain($adjusted);
            }
        }

        return $unicode;
    }

    /**
     * Is the codepoint one of the characters copied through unchanged?
     * The post_indicator counts as plain here.
     */
    private static function isPlain($codepoint) {
        return ($codepoint < 127 && (strpos(self::$plain.self::$post_indicator,chr($codepoint))!==false));
    }

    /**
     * Is the codepoint one of the pre_indicator characters?
     */
    private static function isPreIndicator($codepoint) {
        return ($codepoint < 127 && (strpos(self::$pre_indicator,chr($codepoint)) !== false));
    }

    /**
     * adjust for plain and non-printable (ascii 0-31)
     * this makes SPACE (0x20) the first character we allow
     *
     * The codepoint is lowered by the number of reserved characters
     * (plain + indicators) that sort below it, then by ord(' ').
     *
     * @param  int $codepoint
     * @return int adjusted value
     */
    private static function adjustForPlain($codepoint) {
        $adjustment = self::getAdjustments();
        $total = count($adjustment);

        // codepoint is higher than that of the plain character with the highest codepoint
        if ($codepoint > ord($adjustment[$total-1])) {
            $adjusted = $codepoint - $total;
        } else if ($codepoint > ord($adjustment[0])) {
            for ($i=1; $i < $total; $i++) {
                if ($codepoint < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted = $codepoint - $i;
        } else {
            $adjusted = $codepoint;
        }

        // subtract number of non-printable characters and return
        return $adjusted - ord(' ');
    }

    /**
     * Inverse of adjustForPlain()
     *
     * @param  int $adjusted value produced by adjustForPlain()
     * @return int codepoint
     */
    private static function reverseForPlain($adjusted) {
        $adjustment = self::getAdjustments();
        $total = count($adjustment);

        // reverse adjustment for non-printable characters
        $adjusted += ord(' ');

        if ($adjusted + $total > ord($adjustment[$total-1])) {
            $adjusted += $total;
        } else if ($adjusted > ord($adjustment[0])) {
            for ($i=1; $i < $total; $i++) {
                if ($adjusted + $i < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted += $i;
        }

        return $adjusted;
    }

    /**
     * Sorted list of all reserved characters (plain + both indicators),
     * built on first use and cached in self::$adjustments
     *
     * @return array list of single-character strings
     */
    private static function getAdjustments() {
        if (empty(self::$adjustments)) {
            self::$adjustments = str_split(self::$plain.self::$pre_indicator.self::$post_indicator);
            sort(self::$adjustments);
        }

        return self::$adjustments;
    }
}

/**
 * Encode a UTF-8 filename to use on any filesystem
 *
 * Uses the 'fnencode' option to determine encoding:
 *   'utf-8' - the name is returned untouched
 *   'safe'  - the name is encoded with SafeFN::encode()
 *   other   - the name is urlencoded, slashes are kept
 *
 * When the second parameter is true the string will
 * be encoded only if characters outside a-zA-Z0-9/_-.% are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 *
 * @param  string $file filename or path
 * @param  bool   $safe skip encoding for names that already look encoded
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    global $conf;
    if($conf['fnencode'] == 'utf-8') return $file;

    // D modifier: '$' must not match in front of a trailing newline
    if($safe && preg_match('#^[a-zA-Z0-9/_\-\.%]+$#D',$file)){
        return $file;
    }

    if($conf['fnencode'] == 'safe'){
        return SafeFN::encode($file);
    }

    $file = urlencode($file);
    $file = str_replace('%2F','/',$file);
    return $file;
}

/**
 * Decode a filename back to UTF-8
 *
 * Uses the 'fnencode' option to determine encoding
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 *
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
    global $conf;
    if($conf['fnencode'] == 'utf-8') return $file;

    if($conf['fnencode'] == 'safe'){
        return SafeFN::decode($file);
    }

    return urldecode($file);
}
b078540d2..c10e33ffa 100644 --- a/inc/utf8.php +++ b/inc/utf8.php @@ -19,45 +19,6 @@ if(!defined('UTF8_MBSTRING')){ if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); } -if(!function_exists('utf8_encodeFN')){ - /** - * URL-Encode a filename to allow unicodecharacters - * - * Slashes are not encoded - * - * When the second parameter is true the string will - * be encoded only if non ASCII characters are detected - - * This makes it safe to run it multiple times on the - * same string (default is true) - * - * @author Andreas Gohr <andi@splitbrain.org> - * @see urlencode - */ - function utf8_encodeFN($file,$safe=true){ - if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){ - return $file; - } - $file = urlencode($file); - $file = str_replace('%2F','/',$file); - return $file; - } -} - -if(!function_exists('utf8_decodeFN')){ - /** - * URL-Decode a filename - * - * This is just a wrapper around urldecode - * - * @author Andreas Gohr <andi@splitbrain.org> - * @see urldecode - */ - function utf8_decodeFN($file){ - $file = urldecode($file); - return $file; - } -} - if(!function_exists('utf8_isASCII')){ /** * Checks if a string contains 7bit ASCII only diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php index 2bcd17c12..dd13464fe 100644 --- a/lib/plugins/config/lang/en/lang.php +++ b/lib/plugins/config/lang/en/lang.php @@ -116,6 +116,7 @@ $lang['useslash'] = 'Use slash as namespace separator in URLs'; $lang['usedraft'] = 'Automatically save a draft while editing'; $lang['sepchar'] = 'Page name word separator'; $lang['canonical'] = 'Use fully canonical URLs'; +$lang['fnencode'] = 'Method for encoding non-ASCII filenames.'; $lang['autoplural'] = 'Check for plural forms in links'; $lang['compression'] = 'Compression method for attic files'; $lang['cachetime'] = 'Maximum age for cache (sec)'; diff --git a/lib/plugins/config/settings/config.class.php b/lib/plugins/config/settings/config.class.php index b7428bf6c..2a1d3a28f 100644 --- 
a/lib/plugins/config/settings/config.class.php +++ b/lib/plugins/config/settings/config.class.php @@ -343,7 +343,7 @@ if (!class_exists('setting')) { var $_cautionList = array( 'basedir' => 'danger', 'baseurl' => 'danger', 'savedir' => 'danger', 'useacl' => 'danger', 'authtype' => 'danger', 'superuser' => 'danger', 'userewrite' => 'danger', 'start' => 'warning', 'camelcase' => 'warning', 'deaccent' => 'warning', 'sepchar' => 'warning', 'compression' => 'warning', 'xsendfile' => 'warning', 'renderer_xhtml' => 'warning', - 'allowdebug' => 'security', 'htmlok' => 'security', 'phpok' => 'security', 'iexssprotect' => 'security', 'xmlrpc' => 'security' + 'allowdebug' => 'security', 'htmlok' => 'security', 'phpok' => 'security', 'iexssprotect' => 'security', 'xmlrpc' => 'security', 'fnencode' => 'warning' ); function setting($key, $params=NULL) { diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php index cea191f56..316b4d1c5 100644 --- a/lib/plugins/config/settings/config.metadata.php +++ b/lib/plugins/config/settings/config.metadata.php @@ -171,6 +171,7 @@ $meta['userewrite'] = array('multichoice','_choices' => array(0,1,2)); $meta['useslash'] = array('onoff'); $meta['sepchar'] = array('sepchar'); $meta['canonical'] = array('onoff'); +$meta['fnencode'] = array('multichoice','_choices' => array('url','safe','utf-8')); $meta['autoplural'] = array('onoff'); $meta['mailfrom'] = array('richemail'); $meta['compress'] = array('onoff'); |