8 files changed, 265 insertions, 40 deletions
diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php
index d3823eb94..f2a843f96 100644
--- a/conf/dokuwiki.php
+++ b/conf/dokuwiki.php
@@ -89,6 +89,7 @@ $conf['usedraft']    = 1;                //automatically save a draft while edit
 $conf['sepchar']     = '_';              //word separator character in page names; may be a
                                          //  letter, a digit, '_', '-', or '.'.
 $conf['canonical']   = 0;                //Should all URLs use full canonical http://... style?
+$conf['fnencode']    = 'url';            //filename encoding: 'url' (urlencode), 'safe' (SafeFN class), 'utf-8' (no encoding)
 $conf['autoplural']  = 0;                //try (non)plural form of nonexisting files?
 $conf['compression'] = 'gz';             //compress old revisions: (0: off) ('gz': gnuzip) ('bz2': bzip)
                                          //  bz2 generates smaller files, but needs more cpu-power
diff --git a/inc/SafeFN.class.php b/inc/SafeFN.class.php
new file mode 100644
index 000000000..4ce5c94a2
--- /dev/null
+++ b/inc/SafeFN.class.php
@@ -0,0 +1,210 @@
+<?php
+
+/**
+ *  Class to safely store UTF-8 in a Filename
+ *
+ *  Encodes a utf8 string using only the following characters 0-9a-z_.-/%
+ *  characters from the "plain" set (0-9a-z_-/ and '.') are preserved,
+ *  all other characters are "converted": each one is represented by a
+ *  substring that starts with '%' followed by a base36 number.
+ *  The transition from converted substrings to plain characters is
+ *  marked with a '.'
+ *
+ *  @author   Christopher Smith
+ *  @date     2010-04-02
+ */
+class SafeFN {
+
+    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
+    private static $pre_indicator = '%';
+    private static $post_indicator = '.';                             // this character can be included in "plain" set
+    private static $adjustments = array();                            // must be initialized, use getAdjustments()
+
+    /**
+     * Convert an UTF-8 string to a safe ASCII String
+     *
+     *  conversion process, for each codepoint
+     *    - plain character: append post_indicator first if the previous character
+     *      was "converted", then append the character itself
+     *    - pre_indicator character: append it as is, it counts as "converted"
+     *    - anything else: reduce the codepoint value to fill the holes left by
+     *      "plain", take it modulo the number of possible pre_indicators to choose
+     *      the marker character, integer-divide it by that number and convert the
+     *      result to base36; append the marker followed by the base36 string
+     *
+     * @param  string $utf8  UTF-8 encoded string
+     * @return string        the encoded string
+     */
+    public static function encode($utf8) {
+        return self::unicode_safe(self::utf8_unicode($utf8));
+    }
+
+    /**
+     * Convert a string produced by encode() back to UTF-8
+     *
+     *  decoding process (the input is lower-cased first)
+     *    - split the string in front of every marker character, so that both
+     *      post_indicator and pre_indicator start a new substring
+     *    - substring not starting with a pre_indicator: drop the leading
+     *      post_indicator if the previous substring was "converted", then append
+     *      the codepoint of each remaining character
+     *    - a lone pre_indicator stands for itself
+     *    - otherwise take the position of the pre_indicator in the pre_indicator
+     *      string (order is important), convert the rest from base36 to (int),
+     *      multiply by the number of pre_indicator characters, add the position
+     *      and reverse the adjustment for the holes left by "plain" characters
+     *
+     * @param  string $safe  string as returned by encode()
+     * @return string        the decoded UTF-8 string
+     */
+    public static function decode($safe) {
+        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
+    }
+
+    /** @return bool true unless the string contains a control character 0x01-0x1f */
+    public static function validate_printable_utf8($printable_utf8) {
+        return !preg_match('/[\x01-\x1f]/',$printable_utf8);
+    }
+
+    /**
+     * Check that a string only contains characters encode() can produce; strspn()
+     * is used as '/' and '-' from the plain set are special in a regular expression
+     */
+    public static function validate_safe($safe) {
+        return strspn($safe,self::$plain.self::$post_indicator.self::$pre_indicator) === strlen($safe);
+    }
+
+    /** Wrapper around utf8_to_unicode(); the result is consumed as a list of codepoints */
+    private static function utf8_unicode($utf8) {
+        return utf8_to_unicode($utf8);
+    }
+
+    /** Wrapper around unicode_to_utf8(); receives the list of codepoints built by safe_unicode() */
+    private static function unicode_utf8($unicode) {
+        return unicode_to_utf8($unicode);
+    }
+
+    /** Build the safe string from a list of codepoints, see encode() for the process */
+    private static function unicode_safe($unicode) {
+        $safe = '';
+        $converted = false;
+        $markers = strlen(self::$pre_indicator);   // number of possible pre_indicators
+        foreach ($unicode as $codepoint) {
+            if (self::isPlain($codepoint)) {
+                if ($converted) {
+                    $safe .= self::$post_indicator;
+                    $converted = false;
+                }
+                $safe .= chr($codepoint);
+            } else if (self::isPreIndicator($codepoint)) {
+                // literal pre_indicator; safe_unicode() tells it apart by its length of 1
+                $converted = true;
+                $safe .= chr($codepoint);
+            } else {
+                $converted = true;
+                $adjusted = self::adjustForPlain($codepoint);
+                $safe .= self::$pre_indicator[$adjusted % $markers];
+                $safe .= base_convert((string)(int)($adjusted / $markers),10,36);
+            }
+        }
+        return $safe;
+    }
+
+    /** Turn a (lower case) safe string into a list of codepoints, see decode() for the process */
+    private static function safe_unicode($safe) {
+        $unicode = array();
+        $split = preg_split('/(?=['.self::$post_indicator.self::$pre_indicator.'])/',$safe,-1,PREG_SPLIT_NO_EMPTY);
+        $converted = false;
+        foreach ($split as $sub) {
+            if (($marker = strpos(self::$pre_indicator,$sub[0])) === false) {
+                if ($converted) {
+                    // strip post_indicator
+                    $sub = substr($sub,1);
+                    $converted = false;
+                }
+                for ($i=0; $i < strlen($sub); $i++) {
+                    $unicode[] = ord($sub[$i]);
+                }
+            } else if (strlen($sub)==1) {
+                // a literal pre_indicator character
+                $converted =  true;
+                $unicode[] = ord($sub);
+            } else {
+                // a single codepoint in our base
+                $converted = true;
+                $base = (int)base_convert(substr($sub,1),36,10);
+                $adjusted = ($base*strlen(self::$pre_indicator)) + $marker;
+                $unicode[] = self::reverseForPlain($adjusted);
+            }
+        }
+        return $unicode;
+    }
+
+    /** Is the codepoint kept as is? (member of "plain" or the post_indicator) */
+    private static function isPlain($codepoint) {
+        return ($codepoint < 127 && (strpos(self::$plain.self::$post_indicator,chr($codepoint))!==false));
+    }
+
+    /** Is the codepoint one of the pre_indicator characters? */
+    private static function isPreIndicator($codepoint) {
+        return ($codepoint < 127 && (strpos(self::$pre_indicator,chr($codepoint)) !== false));
+    }
+
+    /**
+     * adjust for plain and non-printable (ascii 0-31)
+     * this makes SPACE (0x20) the first character we allow
+     *
+     * @param  int $codepoint  codepoint that is neither plain nor a pre_indicator
+     * @return int             codepoint minus the number of lower reserved characters, minus 0x20
+     */
+    private static function adjustForPlain($codepoint) {
+        $adjustment = self::getAdjustments();
+        $total = count($adjustment);
+
+        if ($codepoint > ord($adjustment[$total-1])) {
+            // higher than the reserved character with the highest codepoint
+            $adjusted = $codepoint - $total;
+        } else if ($codepoint > ord($adjustment[0])) {
+            // $i ends up as the number of reserved characters below $codepoint
+            for ($i=1; $i < $total; $i++) {
+                if ($codepoint < ord($adjustment[$i])) break;
+            }
+            $adjusted = $codepoint - $i;
+        } else {
+            $adjusted = $codepoint;
+        }
+        // subtract number of non-printable characters and return
+        return $adjusted - ord(' ');
+    }
+
+    /**
+     * reverse the adjustment done by adjustForPlain()
+     * @return int the codepoint adjustForPlain() was called with
+     */
+    private static function reverseForPlain($adjusted) {
+        $adjustment = self::getAdjustments();
+        $total = count($adjustment);
+        // reverse adjustment for non-printable characters
+        $adjusted += ord(' ');
+        if ($adjusted + $total > ord($adjustment[$total-1])) {
+            $adjusted += $total;
+        } else if ($adjusted >= ord($adjustment[0])) {
+            // ">=": the codepoint right after the lowest reserved character
+            // was adjusted down to exactly ord($adjustment[0])
+            for ($i=1; $i < $total; $i++) {
+                if ($adjusted + $i < ord($adjustment[$i])) break;
+            }
+            $adjusted += $i;
+        }
+        return $adjusted;
+    }
+
+    /** Sorted list of all reserved characters (plain, pre_ and post_indicator), built on first use */
+    private static function getAdjustments() {
+        if (empty(self::$adjustments)) {
+            self::$adjustments = str_split(self::$plain.self::$pre_indicator.self::$post_indicator);
+            sort(self::$adjustments);
+        }
+        return self::$adjustments;
+    }
+}
diff --git a/inc/load.php b/inc/load.php
index faf4e9570..2f5be6d63 100644
--- a/inc/load.php
+++ b/inc/load.php
@@ -73,6 +73,7 @@ function load_autoload($name){
         'ZipLib'                => DOKU_INC.'inc/ZipLib.class.php',
         'DokuWikiFeedCreator'   => DOKU_INC.'inc/feedcreator.class.php',
         'Doku_Parser_Mode'      => DOKU_INC.'inc/parser/parser.php',
+        'SafeFN'                => DOKU_INC.'inc/SafeFN.class.php',
 
         'DokuWiki_Action_Plugin' => DOKU_PLUGIN.'action.php',
         'DokuWiki_Admin_Plugin'  => DOKU_PLUGIN.'admin.php',
diff --git a/inc/pageutils.php b/inc/pageutils.php
index cd3cf1fce..43c84038f 100644
--- a/inc/pageutils.php
+++ b/inc/pageutils.php
@@ -543,3 +543,53 @@ function prettyprint_id($id) {
     }
     return hsc($id);
 }
+
+/**
+ * Turn a UTF-8 filename into a name that can be stored on any filesystem
+ *
+ * The 'fnencode' option selects the method: 'utf-8' leaves the name
+ * alone, 'safe' uses SafeFN::encode() and everything else falls back
+ * to urlencode() with slashes kept unencoded.
+ *
+ * Unless the second parameter is false, names made up only of the
+ * characters a-z A-Z 0-9 / _ - . % are returned unchanged, which is
+ * meant to make repeated calls on the same string safe (default is true)
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @see    urlencode
+ */
+function utf8_encodeFN($file,$safe=true){
+    global $conf;
+    $method = $conf['fnencode'];
+
+    // nothing to do when names are stored as UTF-8
+    if($method == 'utf-8') return $file;
+
+    // ASCII only (or encoded before): keep as is
+    $skip = ($safe && preg_match('#^[a-zA-Z0-9/_\-\.%]+$#',$file));
+    if($skip) return $file;
+
+    if($method == 'safe') return SafeFN::encode($file);
+
+    return str_replace('%2F','/',urlencode($file));
+}
+
+/**
+ * Turn an encoded filename back into UTF-8
+ *
+ * The 'fnencode' option selects the method, mirroring utf8_encodeFN()
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @see    urldecode
+ */
+function utf8_decodeFN($file){
+    global $conf;
+    $method = $conf['fnencode'];
+
+    // 'utf-8' names are returned as they are
+    if($method == 'utf-8') return $file;
+    // 'safe' names are handled by SafeFN, everything else is URL decoded
+    if($method == 'safe') return SafeFN::decode($file);
+    return urldecode($file);
+}
+
diff --git a/inc/utf8.php b/inc/utf8.php
index b078540d2..c10e33ffa 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -19,45 +19,6 @@ if(!defined('UTF8_MBSTRING')){
 
 if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }
 
-if(!function_exists('utf8_encodeFN')){
-    /**
-     * URL-Encode a filename to allow unicodecharacters
-     *
-     * Slashes are not encoded
-     *
-     * When the second parameter is true the string will
-     * be encoded only if non ASCII characters are detected -
-     * This makes it safe to run it multiple times on the
-     * same string (default is true)
-     *
-     * @author Andreas Gohr <andi@splitbrain.org>
-     * @see    urlencode
-     */
-    function utf8_encodeFN($file,$safe=true){
-        if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
-            return $file;
-        }
-        $file = urlencode($file);
-        $file = str_replace('%2F','/',$file);
-        return $file;
-    }
-}
-
-if(!function_exists('utf8_decodeFN')){
-    /**
-     * URL-Decode a filename
-     *
-     * This is just a wrapper around urldecode
-     *
-     * @author Andreas Gohr <andi@splitbrain.org>
-     * @see    urldecode
-     */
-    function utf8_decodeFN($file){
-        $file = urldecode($file);
-        return $file;
-    }
-}
-
 if(!function_exists('utf8_isASCII')){
     /**
      * Checks if a string contains 7bit ASCII only
diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php
index 2bcd17c12..dd13464fe 100644
--- a/lib/plugins/config/lang/en/lang.php
+++ b/lib/plugins/config/lang/en/lang.php
@@ -116,6 +116,7 @@ $lang['useslash']    = 'Use slash as namespace separator in URLs';
 $lang['usedraft']    = 'Automatically save a draft while editing';
 $lang['sepchar']     = 'Page name word separator';
 $lang['canonical']   = 'Use fully canonical URLs';
+$lang['fnencode']    = 'Method for encoding non-ASCII filenames.';
 $lang['autoplural']  = 'Check for plural forms in links';
 $lang['compression'] = 'Compression method for attic files';
 $lang['cachetime']   = 'Maximum age for cache (sec)';
diff --git a/lib/plugins/config/settings/config.class.php b/lib/plugins/config/settings/config.class.php
index b7428bf6c..2a1d3a28f 100644
--- a/lib/plugins/config/settings/config.class.php
+++ b/lib/plugins/config/settings/config.class.php
@@ -343,7 +343,7 @@ if (!class_exists('setting')) {
     var $_cautionList = array(
         'basedir' => 'danger', 'baseurl' => 'danger', 'savedir' => 'danger', 'useacl' => 'danger', 'authtype' => 'danger', 'superuser' => 'danger', 'userewrite' => 'danger',
         'start' => 'warning', 'camelcase' => 'warning', 'deaccent' => 'warning', 'sepchar' => 'warning', 'compression' => 'warning', 'xsendfile' => 'warning', 'renderer_xhtml' => 'warning',
-        'allowdebug' => 'security', 'htmlok' => 'security', 'phpok' => 'security', 'iexssprotect' => 'security', 'xmlrpc' => 'security'
+        'allowdebug' => 'security', 'htmlok' => 'security', 'phpok' => 'security', 'iexssprotect' => 'security', 'xmlrpc' => 'security', 'fnencode' => 'warning'
     );
 
     function setting($key, $params=NULL) {
diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php
index cea191f56..316b4d1c5 100644
--- a/lib/plugins/config/settings/config.metadata.php
+++ b/lib/plugins/config/settings/config.metadata.php
@@ -171,6 +171,7 @@ $meta['userewrite']  = array('multichoice','_choices' => array(0,1,2));
 $meta['useslash']    = array('onoff');
 $meta['sepchar']     = array('sepchar');
 $meta['canonical']   = array('onoff');
+$meta['fnencode']    = array('multichoice','_choices' => array('url','safe','utf-8'));
 $meta['autoplural']  = array('onoff');
 $meta['mailfrom']    = array('richemail');
 $meta['compress']    = array('onoff');