summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--conf/dokuwiki.php1
-rw-r--r--inc/SafeFN.class.php210
-rw-r--r--inc/load.php1
-rw-r--r--inc/pageutils.php50
-rw-r--r--inc/utf8.php39
-rw-r--r--lib/plugins/config/lang/en/lang.php1
-rw-r--r--lib/plugins/config/settings/config.class.php2
-rw-r--r--lib/plugins/config/settings/config.metadata.php1
8 files changed, 265 insertions, 40 deletions
diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php
index d3823eb94..f2a843f96 100644
--- a/conf/dokuwiki.php
+++ b/conf/dokuwiki.php
@@ -89,6 +89,7 @@ $conf['usedraft'] = 1; //automatically save a draft while edit
$conf['sepchar'] = '_'; //word separator character in page names; may be a
// letter, a digit, '_', '-', or '.'.
$conf['canonical'] = 0; //Should all URLs use full canonical http://... style?
+$conf['fnencode'] = 'url'; //encode filenames (url|safe|utf-8)
$conf['autoplural'] = 0; //try (non)plural form of nonexisting files?
$conf['compression'] = 'gz'; //compress old revisions: (0: off) ('gz': gnuzip) ('bz2': bzip)
// bz2 generates smaller files, but needs more cpu-power
diff --git a/inc/SafeFN.class.php b/inc/SafeFN.class.php
new file mode 100644
index 000000000..4ce5c94a2
--- /dev/null
+++ b/inc/SafeFN.class.php
@@ -0,0 +1,210 @@
+<?php
+
+/**
+ * Class to safely store UTF-8 in a Filename
+ *
+ * Encodes a UTF-8 string using only the characters 0-9 a-z / _ - . %
+ *
+ * - The characters listed in $plain (and the post indicator '.') are kept
+ *   as they are; they are called "plain".
+ * - Every other character is "converted": it is written as the pre
+ *   indicator '%' followed by a base36 number derived from its codepoint.
+ * - The transition from a converted substring back to plain characters is
+ *   marked with the post indicator '.'.
+ *
+ * All methods are static, so the class is used as SafeFN::encode($name)
+ * and SafeFN::decode($name) without creating an instance.
+ *
+ * @author Christopher Smith
+ * @date 2010-04-02
+ */
+class SafeFN {
+
+    // characters that are copied to the output without conversion
+    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz';
+    // introduces a converted codepoint (one marker character per position)
+    private static $pre_indicator = '%';
+    // marks the switch from converted back to plain; is itself handled as plain
+    private static $post_indicator = '.';
+    // sorted list of all reserved characters; filled lazily, read it through getAdjustments()
+    private static $adjustments = array();
+
+    /**
+     * Convert a UTF-8 string to a safe ASCII string
+     *
+     * conversion process, per codepoint
+     *   - if the codepoint is a plain character,
+     *     - if the previous character was "converted", append the
+     *       post_indicator to the output
+     *     - append the ascii byte for the character to the output
+     *       (continue with the next character)
+     *
+     *   - otherwise reduce the codepoint value to fill the holes left by
+     *     the "plain" characters
+     *   - choose the marker character by taking the modulus
+     *     (number of possible pre_indicators) of the modified codepoint
+     *   - calculate the value to convert by integer division
+     *     (number of possible pre_indicators) of the modified codepoint
+     *   - convert that value to a base36 string
+     *   - append the marker character followed by the base36 string to
+     *     the output (continue with the next character)
+     *
+     * @param  string $utf8 the string to encode, handed to utf8_to_unicode()
+     * @return string       the encoded string
+     */
+    public static function encode($utf8) {
+        return self::unicode_safe(self::utf8_unicode($utf8));
+    }
+
+    /**
+     * Convert a safe ASCII string back to UTF-8
+     *
+     * The input is lower cased before it is decoded.
+     *
+     * decoding process
+     *   - split the string into substrings at the marker characters,
+     *     discarding the post_indicator character but keeping the
+     *     pre_indicator characters (along with their following
+     *     base36 string)
+     *   - check the first character of the substring
+     *     - if it is not a pre_indicator character, convert each
+     *       character in the substring into its codepoint value
+     *       and append it to the output (continue with the next substring)
+     *     - if it is a pre_indicator character, get its position in the
+     *       pre_indicator string (order is important)
+     *   - convert the remainder of the substring from base36 to base10
+     *     and then to an (int)
+     *   - multiply the converted int by the number of pre_indicator
+     *     characters and add the pre_indicator position
+     *   - reverse the conversion adjustment for the codepoint holes left
+     *     by the "plain" characters
+     *   - append the resulting codepoint value to the output (continue
+     *     with the next substring)
+     *
+     * @param  string $safe a string as produced by encode()
+     * @return string       the result of unicode_to_utf8() on the decoded codepoints
+     */
+    public static function decode($safe) {
+        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
+    }
+
+    /**
+     * Check that a string contains no byte in the range 0x01-0x1F
+     *
+     * @param  string $printable_utf8 the string to check
+     * @return bool   true when no such byte is present
+     */
+    public static function validate_printable_utf8($printable_utf8) {
+        return !preg_match('/[\x01-\x1f]/',$printable_utf8);
+    }
+
+    /**
+     * Check that a string consists only of characters encode() may emit
+     *
+     * The allowed characters are compared literally with strspn(). Putting
+     * them into a regular expression character class is not safe here: the
+     * set contains '/' (the usual delimiter) and the sequence '_-0', which
+     * a character class reads as an (invalid) range.
+     *
+     * @param  string $safe the string to check
+     * @return bool   true when $safe holds only plain and indicator
+     *                characters (the empty string is accepted)
+     */
+    public static function validate_safe($safe) {
+        $allowed = self::$plain.self::$post_indicator.self::$pre_indicator;
+        return strspn($safe,$allowed) === strlen($safe);
+    }
+
+    /**
+     * Thin wrapper around the global utf8_to_unicode() helper
+     *
+     * unicode_safe() iterates over the result and treats every element as
+     * an integer codepoint.
+     */
+    private static function utf8_unicode($utf8) {
+        return utf8_to_unicode($utf8);
+    }
+
+    /**
+     * Thin wrapper around the global unicode_to_utf8() helper
+     *
+     * Receives the list of integer codepoints built by safe_unicode().
+     */
+    private static function unicode_utf8($unicode) {
+        return unicode_to_utf8($unicode);
+    }
+
+    /**
+     * Build the safe string from a list of codepoints
+     *
+     * @param  array  $unicode list of integer codepoints
+     * @return string          the encoded string
+     */
+    private static function unicode_safe($unicode) {
+
+        $safe = '';
+        $converted = false; // true while the last emitted item was not plain
+        $markers = strlen(self::$pre_indicator);
+
+        foreach ($unicode as $codepoint) {
+            if (self::isPlain($codepoint)) {
+                if ($converted) {
+                    // close the converted run before going back to plain
+                    $safe .= self::$post_indicator;
+                    $converted = false;
+                }
+                $safe .= chr($codepoint);
+
+            } else if (self::isPreIndicator($codepoint)) {
+                // a literal marker character is emitted bare, without a
+                // base36 number; it still counts as "converted" so that a
+                // following plain character gets a post_indicator
+                $converted = true;
+                $safe .= chr($codepoint);
+
+            } else {
+                $converted = true;
+                $adjusted = self::adjustForPlain($codepoint);
+
+                $marker = $adjusted % $markers;
+                $base = (int) ($adjusted / $markers);
+
+                $safe .= self::$pre_indicator[$marker];
+                $safe .= base_convert((string)$base,10,36);
+            }
+        }
+        return $safe;
+    }
+
+    /**
+     * Rebuild the list of codepoints from a safe string
+     *
+     * @param  string $safe lower case string as produced by unicode_safe()
+     * @return array        list of integer codepoints
+     */
+    private static function safe_unicode($safe) {
+        $unicode = array();
+
+        // split in front of every indicator, so each piece starts with
+        // the indicator that introduced it
+        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator,'/');
+        $split = preg_split('/(?=['.$indicators.'])/',$safe,-1,PREG_SPLIT_NO_EMPTY);
+
+        $converted = false;
+        foreach ($split as $sub) {
+            if (($marker = strpos(self::$pre_indicator,$sub[0])) === false) {
+                // plain run
+                if ($converted) {
+                    // strip post_indicator
+                    $sub = substr($sub,1);
+                    $converted = false;
+                }
+                for ($i=0; $i < strlen($sub); $i++) {
+                    $unicode[] = ord($sub[$i]);
+                }
+            } else if (strlen($sub)==1) {
+                // a bare marker stands for the marker character itself
+                $converted = true;
+                $unicode[] = ord($sub);
+            } else {
+                // a single codepoint in our base
+                $converted = true;
+                $base = (int)base_convert(substr($sub,1),36,10);
+                $adjusted = ($base*strlen(self::$pre_indicator)) + $marker;
+
+                $unicode[] = self::reverseForPlain($adjusted);
+            }
+        }
+
+        return $unicode;
+    }
+
+    /**
+     * Is the codepoint one of the characters that are kept unchanged?
+     *
+     * The post_indicator counts as plain here.
+     */
+    private static function isPlain($codepoint) {
+        return ($codepoint < 127 && (strpos(self::$plain.self::$post_indicator,chr($codepoint))!==false));
+    }
+
+    /**
+     * Is the codepoint one of the pre_indicator (marker) characters?
+     */
+    private static function isPreIndicator($codepoint) {
+        return ($codepoint < 127 && (strpos(self::$pre_indicator,chr($codepoint)) !== false));
+    }
+
+    /**
+     * adjust for plain and non-printable (ascii 0-31)
+     * this makes SPACE (0x20) the first character we allow
+     *
+     * The codepoint is lowered by the number of reserved characters that
+     * sort below it and then by ord(' '). Codepoints below 0x20 therefore
+     * give a negative result; validate_printable_utf8() can be used to
+     * detect most of them up front.
+     *
+     * @param  int $codepoint codepoint of a character that is not reserved
+     * @return int            the adjusted value
+     */
+    private static function adjustForPlain($codepoint) {
+        $adjustment = self::getAdjustments();
+        $total = count($adjustment);
+
+        // codepoint is higher than that of the plain character with the highest codepoint
+        if ($codepoint > ord($adjustment[$total-1])) {
+            $adjusted = $codepoint - $total;
+        } else if ($codepoint > ord($adjustment[0])) {
+            // count the reserved characters that sort below the codepoint
+            for ($i=1; $i < $total; $i++) {
+                if ($codepoint < ord($adjustment[$i])) {
+                    break;
+                }
+            }
+            $adjusted = $codepoint - $i;
+        } else {
+            $adjusted = $codepoint;
+        }
+
+        // subtract number of non-printable characters and return
+        return $adjusted - ord(' ');
+    }
+
+    /**
+     * Undo adjustForPlain()
+     *
+     * @param  int $adjusted value as computed by adjustForPlain()
+     * @return int           the codepoint
+     */
+    private static function reverseForPlain($adjusted) {
+        $adjustment = self::getAdjustments();
+        $total = count($adjustment);
+
+        // reverse adjustment for non-printable characters
+        $adjusted += ord(' ');
+
+        if ($adjusted + $total > ord($adjustment[$total-1])) {
+            $adjusted += $total;
+        } else if ($adjusted > ord($adjustment[0])) {
+            // re-open the holes of the reserved characters that sort below
+            for ($i=1; $i < $total; $i++) {
+                if ($adjusted + $i < ord($adjustment[$i])) {
+                    break;
+                }
+            }
+            $adjusted += $i;
+        }
+
+        return $adjusted;
+    }
+
+    /**
+     * Sorted list of all reserved characters (plain and both indicators)
+     *
+     * The list is built on first use and cached in self::$adjustments.
+     *
+     * @return array list of single character strings in sort() order
+     */
+    private static function getAdjustments() {
+        if (empty(self::$adjustments)) {
+            self::$adjustments = str_split(self::$plain.self::$pre_indicator.self::$post_indicator);
+            sort(self::$adjustments);
+        }
+
+        return self::$adjustments;
+    }
+}
diff --git a/inc/load.php b/inc/load.php
index faf4e9570..2f5be6d63 100644
--- a/inc/load.php
+++ b/inc/load.php
@@ -73,6 +73,7 @@ function load_autoload($name){
'ZipLib' => DOKU_INC.'inc/ZipLib.class.php',
'DokuWikiFeedCreator' => DOKU_INC.'inc/feedcreator.class.php',
'Doku_Parser_Mode' => DOKU_INC.'inc/parser/parser.php',
+ 'SafeFN' => DOKU_INC.'inc/SafeFN.class.php',
'DokuWiki_Action_Plugin' => DOKU_PLUGIN.'action.php',
'DokuWiki_Admin_Plugin' => DOKU_PLUGIN.'admin.php',
diff --git a/inc/pageutils.php b/inc/pageutils.php
index cd3cf1fce..43c84038f 100644
--- a/inc/pageutils.php
+++ b/inc/pageutils.php
@@ -543,3 +543,53 @@ function prettyprint_id($id) {
}
return hsc($id);
}
+
+/**
+ * Turn a UTF-8 filename into one that can be stored on any filesystem
+ *
+ * The 'fnencode' option selects the encoding:
+ *   'utf-8' - the name is returned as it is
+ *   'safe'  - the name is passed to SafeFN::encode()
+ *   other   - the name is URL-encoded, slashes are kept
+ *
+ * With $safe set to true (the default) a name made up only of the
+ * characters a-z A-Z 0-9 / _ - . % is returned as it is. That makes
+ * it safe to call the function several times on the same string.
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @see    urlencode
+ */
+function utf8_encodeFN($file,$safe=true){
+    global $conf;
+
+    // nothing to do: either no encoding is wanted, or the name looks
+    // like it is plain ASCII or was encoded before
+    $untouched = ($conf['fnencode'] == 'utf-8')
+              || ($safe && preg_match('#^[a-zA-Z0-9/_\-\.%]+$#',$file));
+    if($untouched) return $file;
+
+    if($conf['fnencode'] == 'safe') return SafeFN::encode($file);
+
+    // URL-encode, but keep the directory separators readable
+    return str_replace('%2F','/',urlencode($file));
+}
+
+/**
+ * Turn a stored filename back into UTF-8
+ *
+ * The 'fnencode' option selects the decoding:
+ *   'utf-8' - the name is returned as it is
+ *   'safe'  - the name is passed to SafeFN::decode()
+ *   other   - the name is URL-decoded
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @see    urldecode
+ */
+function utf8_decodeFN($file){
+    global $conf;
+
+    if($conf['fnencode'] != 'utf-8'){
+        $file = ($conf['fnencode'] == 'safe')
+              ? SafeFN::decode($file)
+              : urldecode($file);
+    }
+
+    return $file;
+}
+
diff --git a/inc/utf8.php b/inc/utf8.php
index b078540d2..c10e33ffa 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -19,45 +19,6 @@ if(!defined('UTF8_MBSTRING')){
if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }
-if(!function_exists('utf8_encodeFN')){
- /**
- * URL-Encode a filename to allow unicodecharacters
- *
- * Slashes are not encoded
- *
- * When the second parameter is true the string will
- * be encoded only if non ASCII characters are detected -
- * This makes it safe to run it multiple times on the
- * same string (default is true)
- *
- * @author Andreas Gohr <andi@splitbrain.org>
- * @see urlencode
- */
- function utf8_encodeFN($file,$safe=true){
- if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
- return $file;
- }
- $file = urlencode($file);
- $file = str_replace('%2F','/',$file);
- return $file;
- }
-}
-
-if(!function_exists('utf8_decodeFN')){
- /**
- * URL-Decode a filename
- *
- * This is just a wrapper around urldecode
- *
- * @author Andreas Gohr <andi@splitbrain.org>
- * @see urldecode
- */
- function utf8_decodeFN($file){
- $file = urldecode($file);
- return $file;
- }
-}
-
if(!function_exists('utf8_isASCII')){
/**
* Checks if a string contains 7bit ASCII only
diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php
index 2bcd17c12..dd13464fe 100644
--- a/lib/plugins/config/lang/en/lang.php
+++ b/lib/plugins/config/lang/en/lang.php
@@ -116,6 +116,7 @@ $lang['useslash'] = 'Use slash as namespace separator in URLs';
$lang['usedraft'] = 'Automatically save a draft while editing';
$lang['sepchar'] = 'Page name word separator';
$lang['canonical'] = 'Use fully canonical URLs';
+$lang['fnencode'] = 'Method for encoding non-ASCII filenames.';
$lang['autoplural'] = 'Check for plural forms in links';
$lang['compression'] = 'Compression method for attic files';
$lang['cachetime'] = 'Maximum age for cache (sec)';
diff --git a/lib/plugins/config/settings/config.class.php b/lib/plugins/config/settings/config.class.php
index b7428bf6c..2a1d3a28f 100644
--- a/lib/plugins/config/settings/config.class.php
+++ b/lib/plugins/config/settings/config.class.php
@@ -343,7 +343,7 @@ if (!class_exists('setting')) {
var $_cautionList = array(
'basedir' => 'danger', 'baseurl' => 'danger', 'savedir' => 'danger', 'useacl' => 'danger', 'authtype' => 'danger', 'superuser' => 'danger', 'userewrite' => 'danger',
'start' => 'warning', 'camelcase' => 'warning', 'deaccent' => 'warning', 'sepchar' => 'warning', 'compression' => 'warning', 'xsendfile' => 'warning', 'renderer_xhtml' => 'warning',
- 'allowdebug' => 'security', 'htmlok' => 'security', 'phpok' => 'security', 'iexssprotect' => 'security', 'xmlrpc' => 'security'
+ 'allowdebug' => 'security', 'htmlok' => 'security', 'phpok' => 'security', 'iexssprotect' => 'security', 'xmlrpc' => 'security', 'fnencode' => 'warning'
);
function setting($key, $params=NULL) {
diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php
index cea191f56..316b4d1c5 100644
--- a/lib/plugins/config/settings/config.metadata.php
+++ b/lib/plugins/config/settings/config.metadata.php
@@ -171,6 +171,7 @@ $meta['userewrite'] = array('multichoice','_choices' => array(0,1,2));
$meta['useslash'] = array('onoff');
$meta['sepchar'] = array('sepchar');
$meta['canonical'] = array('onoff');
+$meta['fnencode'] = array('multichoice','_choices' => array('url','safe','utf-8'));
$meta['autoplural'] = array('onoff');
$meta['mailfrom'] = array('richemail');
$meta['compress'] = array('onoff');