summaryrefslogtreecommitdiff
path: root/inc/SafeFN.class.php
blob: 4ce5c94a20ac4d0721ab3d609ddb251420dc8b10 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
<?php

/**
 *  Class to safely store UTF-8 in a filename
 *
 *  Encodes a UTF-8 string using only the following characters: 0-9a-z_.-/%
 *  The characters listed in $plain (0-9, a-z, '_', '-' and '/') are kept
 *  as they are in the original string ("plain").
 *  All other characters are "converted": each one is represented by a
 *  substring that starts with the pre_indicator '%' followed by a base36
 *  number.
 *  The transition from a converted substring back to plain characters is
 *  marked with the post_indicator '.'
 *
 *  All methods are static and hold no per-instance state; calling them on
 *  an instance still works.
 *
 *  @author   Christopher Smith
 *  @date     2010-04-02
 */
class SafeFN {

    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';                              // starts a converted substring; one marker per character
    private static $post_indicator = '.';                             // this character can be included in "plain" set
    private static $adjustments = array();                            // must be initialized, use getAdjustments()

    /**
     * Convert a UTF-8 string to a safe ASCII string
     *
     *  conversion process
     *    - if codepoint is a plain character,
     *      - if previous character was "converted", append post_indicator
     *        to output
     *      - append ascii byte for character to output (continue to
     *        next character)
     *
     *    - if codepoint is a pre_indicator character, append it as is
     *      (a pre_indicator that is not followed by a base36 string
     *      stands for itself)
     *
     *    - reduce codepoint value to fill the holes left by "plain"
     *    - choose marker character for conversion by taking modulus
     *      (number of possible pre_indicators) of modified codepoint
     *    - calculate value for conversion to base36 by integer division
     *      (number of possible pre_indicators) of modified codepoint
     *    - convert above value to a base36 string
     *    - append marker character followed by base36 string to
     *      output (continue to next character)
     *
     * Codepoints below 0x20 are not supported by the conversion, check the
     * input with validate_printable_utf8() first.
     *
     * @param  string $utf8  the UTF-8 string to encode
     * @return string        the encoded string
     */
    public static function encode($utf8) {
        return self::unicode_safe(self::utf8_unicode($utf8));
    }

    /**
     * Convert a safe ASCII string back to the UTF-8 string it encodes
     *
     * The input is lower cased before it is decoded.
     *
     *  decoding process
     *    - split the string into substrings at marker characters,
     *      discarding post_indicator character but keeping
     *      pre_indicator characters (along with their following
     *      base36 string)
     *    - check the first character of the substring
     *      - if it is not a pre_indicator character, convert each
     *        character in the substring into its codepoint value
     *        and append to output (continue to next substring)
     *      - if it is a pre_indicator character, get its position in the
     *        pre_indicator string (order is important)
     *    - convert the remainder of the string from base36 to base10
     *      and then to an (int).
     *    - multiply the converted int by the number of pre_indicator
     *      characters and add the pre_indicator position
     *    - reverse the conversion adjustment for codepoint holes left by
     *      "plain" characters
     *    - append resulting codepoint value to output (continue to next
     *      substring)
     *
     * @param  string $safe  a string produced by encode()
     * @return string        the decoded UTF-8 string
     */
    public static function decode($safe) {
        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
    }

    /**
     * Check that a string contains no bytes in the range 0x01-0x1f
     *
     * @param  string $printable_utf8  the string to check
     * @return bool                    true if no such byte was found
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('/[\x01-\x1f]/',$printable_utf8);
    }

    /**
     * Check that a string consists only of characters encode() can emit
     *
     * The allowed characters are compared literally instead of being put
     * into a regular expression: $plain contains the '/' delimiter and a
     * '-' that would otherwise be read as a character class range.
     *
     * @param  string $safe  the string to check
     * @return bool          true if every character is a plain, pre_indicator
     *                       or post_indicator character (also for '')
     */
    public static function validate_safe($safe) {
        $safe = (string) $safe;
        $allowed = self::$plain.self::$post_indicator.self::$pre_indicator;

        return strspn($safe,$allowed) === strlen($safe);
    }

    /**
     * Wrapper around utf8_to_unicode(), which is not defined in this class
     *
     * @param  string $utf8
     * @return array  presumably a list of integer codepoints, unicode_safe()
     *                iterates over it - confirm against utf8_to_unicode()
     */
    private static function utf8_unicode($utf8) {
        return utf8_to_unicode($utf8);
    }

    /**
     * Wrapper around unicode_to_utf8(), which is not defined in this class
     *
     * @param  array $unicode  list of integer codepoints as built by safe_unicode()
     * @return string          presumably the UTF-8 string - confirm against unicode_to_utf8()
     */
    private static function unicode_utf8($unicode) {
        return unicode_to_utf8($unicode);
    }

    /**
     * Build the safe string for a list of codepoints
     *
     * @param  array $unicode  list of integer codepoints
     * @return string          the encoded string
     */
    private static function unicode_safe($unicode) {

        $safe = '';
        $converted = false;                           // was the previous character written in converted form?
        $markers = strlen(self::$pre_indicator);      // number of available marker characters

        foreach ($unicode as $codepoint) {
            if (self::isPlain($codepoint)) {
                if ($converted) {
                    // close the converted run, otherwise the plain character
                    // could be read as part of the base36 string
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if (self::isPreIndicator($codepoint)) {
                // a literal pre_indicator is written bare, it counts as
                // converted so a following plain character gets separated
                $converted = true;
                $safe .= chr($codepoint);

            } else {
                $converted = true;
                $adjusted = self::adjustForPlain($codepoint);

                $marker = $adjusted % $markers;
                $base = (int) ($adjusted / $markers);

                $safe .= self::$pre_indicator[$marker];
                $safe .= base_convert((string)$base,10,36);
            }
        }
        return $safe;
    }

    /**
     * Turn a safe string back into a list of codepoints
     *
     * @param  string $safe  lower case encoded string
     * @return array         list of integer codepoints
     */
    private static function safe_unicode($safe) {
        $unicode = array();
        $markers = strlen(self::$pre_indicator);

        // split in front of every indicator character, the indicator stays
        // at the start of its substring
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator,'/');
        $split = preg_split('/(?=['.$indicators.'])/',$safe,-1,PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            if (($marker = strpos(self::$pre_indicator,$sub[0])) === false) {
                if ($converted) {
                    // strip post_indicator
                    $sub = (string) substr($sub,1);
                    $converted = false;
                }
                // plain characters stand for themselves
                $length = strlen($sub);
                for ($i=0; $i < $length; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
            } else if (strlen($sub)==1) {
                // a bare pre_indicator is the literal character
                $converted = true;
                $unicode[] = ord($sub);
            } else {
                // a single codepoint in our base
                $converted = true;
                $base = (int)base_convert(substr($sub,1),36,10);
                $adjusted = ($base*$markers) + $marker;

                $unicode[] = self::reverseForPlain($adjusted);
            }
        }

        return $unicode;
    }

    /**
     * Is the codepoint written to the output unchanged?
     *
     * True for the characters in $plain and for the post_indicator.
     *
     * @param  int $codepoint
     * @return bool
     */
    private static function isPlain($codepoint) {
        return ($codepoint < 127 && (strpos(self::$plain.self::$post_indicator,chr($codepoint))!==false));
    }

    /**
     * Is the codepoint one of the pre_indicator characters?
     *
     * @param  int $codepoint
     * @return bool
     */
    private static function isPreIndicator($codepoint) {
        return ($codepoint < 127 && (strpos(self::$pre_indicator,chr($codepoint)) !== false));
    }

    /**
     * adjust for plain and non-printable (ascii 0-31)
     * this makes SPACE (0x20) the first character we allow
     *
     * The codepoint is reduced by the number of reserved characters (plain
     * and indicator characters) that sort below it, then by 0x20.
     * Codepoints below 0x20 give a negative result.
     *
     * @param  int $codepoint  a codepoint that is neither plain nor a pre_indicator
     * @return int             the adjusted value
     */
    private static function adjustForPlain($codepoint) {
        $adjustment = self::getAdjustments();
        $count = count($adjustment);

        // codepoint is higher than that of the plain character with the highest codepoint
        if ($codepoint > ord($adjustment[$count-1])) {
            $adjusted = $codepoint - $count;
        } else if ($codepoint > ord($adjustment[0])) {
            // $i ends up as the number of reserved characters below $codepoint
            for ($i=1; $i < $count; $i++) {
                if ($codepoint < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted = $codepoint - $i;
        } else {
            $adjusted = $codepoint;
        }

        // subtract number of non-printable characters and return
        return $adjusted - ord(' ');
    }

    /**
     * Reverse adjustForPlain()
     *
     * @param  int $adjusted  value produced by adjustForPlain()
     * @return int            the original codepoint
     */
    private static function reverseForPlain($adjusted) {
        $adjustment = self::getAdjustments();
        $count = count($adjustment);

        // reverse adjustment for non-printable characters
        $adjusted += ord(' ');

        if ($adjusted + $count > ord($adjustment[$count-1])) {
            $adjusted += $count;
        } else if ($adjusted >= ord($adjustment[0])) {
            // ">=" is required: the codepoint directly above the lowest
            // reserved character is adjusted down to exactly that value
            for ($i=1; $i < $count; $i++) {
                if ($adjusted + $i < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted += $i;
        }

        return $adjusted;
    }

    /**
     * Get all reserved characters (plain, pre_indicator and post_indicator)
     * sorted by byte value; the list is built on first use and cached
     *
     * @return array  list of single character strings
     */
    private static function getAdjustments() {
        if (empty(self::$adjustments)) {
            self::$adjustments = str_split(self::$plain.self::$pre_indicator.self::$post_indicator);
            sort(self::$adjustments,SORT_STRING);
        }

        return self::$adjustments;
    }
}