2 files changed, 243 insertions, 18 deletions
diff --git a/includes/unicode.inc b/includes/unicode.inc
index 2912cde97..d9b81f062 100644
--- a/includes/unicode.inc
+++ b/includes/unicode.inc
@@ -471,13 +471,13 @@ function drupal_substr($text, $start, $length = NULL) {
   }
   else {
     $strlen = strlen($text);
-    // Find the starting byte offset
+    // Find the starting byte offset.
     $bytes = 0;
     if ($start > 0) {
       // Count all the continuation bytes from the start until we have found
-      // $start characters
+      // $start characters or the end of the string.
       $bytes = -1; $chars = -1;
-      while ($bytes < $strlen && $chars < $start) {
+      while ($bytes < $strlen - 1 && $chars < $start) {
         $bytes++;
         $c = ord($text[$bytes]);
         if ($c < 0x80 || $c >= 0xC0) {
@@ -487,7 +487,7 @@ function drupal_substr($text, $start, $length = NULL) {
     }
     elseif ($start < 0) {
       // Count all the continuation bytes from the end until we have found
-      // abs($start) characters
+      // abs($start) characters.
       $start = abs($start);
       $bytes = $strlen; $chars = 0;
       while ($bytes > 0 && $chars < $start) {
@@ -500,37 +500,48 @@ function drupal_substr($text, $start, $length = NULL) {
     }
     $istart = $bytes;
 
-    // Find the ending byte offset
+    // Find the ending byte offset.
     if ($length === NULL) {
-      $bytes = $strlen - 1;
+      $iend = $strlen;
     }
     elseif ($length > 0) {
       // Count all the continuation bytes from the starting index until we have
-      // found $length + 1 characters. Then backtrack one byte.
-      $bytes = $istart; $chars = 0;
-      while ($bytes < $strlen && $chars < $length) {
-        $bytes++;
-        $c = ord($text[$bytes]);
+      // found $length + 1 characters or reached the end of the string, then
+      // backtrack one byte.
+      $iend = $istart - 1; $chars = -1;
+      while ($iend < $strlen - 1 && $chars < $length) {
+        $iend++;
+        $c = ord($text[$iend]);
         if ($c < 0x80 || $c >= 0xC0) {
           $chars++;
         }
       }
-      $bytes--;
+      // Backtrack one byte if the end of the string was not reached.
+      if ($iend < $strlen - 1) {
+        $iend--;
+      }
     }
     elseif ($length < 0) {
       // Count all the continuation bytes from the end until we have found
-      // abs($length) characters
+      // abs($length) characters, then backtrack one byte.
       $length = abs($length);
-      $bytes = $strlen - 1; $chars = 0;
-      while ($bytes >= 0 && $chars < $length) {
-        $c = ord($text[$bytes]);
+      $iend = $strlen; $chars = 0;
+      while ($iend > 0 && $chars < $length) {
+        $iend--;
+        $c = ord($text[$iend]);
         if ($c < 0x80 || $c >= 0xC0) {
           $chars++;
         }
-        $bytes--;
       }
+      // Backtrack one byte if we are not at the beginning of the string.
+      if ($iend > 0) {
+        $iend--;
+      }
+    }
+    else {
+      // $length == 0, return an empty string.
+      $iend = $istart - 1;
     }
-    $iend = $bytes;
 
     return substr($text, $istart, max(0, $iend - $istart + 1));
   }
diff --git a/modules/simpletest/tests/unicode.test b/modules/simpletest/tests/unicode.test
new file mode 100644
index 000000000..8970fca25
--- /dev/null
+++ b/modules/simpletest/tests/unicode.test
@@ -0,0 +1,214 @@
+<?php
+// $Id$
+
+/**
+ * @file
+ * Various unicode handling tests.
+ */
+
+/**
+ * Test unicode handling features implemented in unicode.inc.
+ */
+class UnicodeUnitTest extends DrupalWebTestCase {
+
+  /**
+   * Whether to run the extended tests, which include non-Latin-1 characters.
+   *
+   * @var boolean
+   */
+  protected $extendedMode = FALSE;
+
+  function getInfo() {
+    return array(
+      'name' => t('Unicode handling'),
+      'description' => t('Tests Drupal Unicode handling.'),
+      'group' => t('System'),
+    );
+  }
+
+  /**
+   * Test the full Unicode features implemented using the mbstring extension.
+   */
+  function testMbStringUnicode() {
+    global $multibyte;
+
+    // mbstring was not detected, so multibyte features cannot be tested: treat
+    // that as an exception. NOTE(review): no return follows; confirm intent.
+    if ($multibyte == UNICODE_SINGLEBYTE) {
+      $this->error(t('Unable to test Multibyte features: mbstring extension was not detected.'));
+    }
+
+    $multibyte = UNICODE_MULTIBYTE;
+
+    $this->extendedMode = TRUE;
+    $this->pass(t('Testing in mbstring mode'));
+
+    $this->helperTestStrToLower();
+    $this->helperTestStrToUpper();
+    $this->helperTestUcFirst();
+    $this->helperTestStrLen();
+    $this->helperTestSubStr();
+  }
+
+  /**
+   * Test the emulated (best-effort) Unicode features.
+   */
+  function testEmulatedUnicode() {
+    global $multibyte;
+
+    $multibyte = UNICODE_SINGLEBYTE;
+
+    $this->extendedMode = FALSE;
+
+    $this->pass(t('Testing in emulated (best-effort) mode'));
+
+    $this->helperTestStrToLower();
+    $this->helperTestStrToUpper();
+    $this->helperTestUcFirst();
+    $this->helperTestStrLen();
+    $this->helperTestSubStr();
+  }
+
+  function helperTestStrToLower() {
+    $testcase = array(
+      'tHe QUIcK bRoWn' => 'the quick brown',
+      'FrançAIS is ÜBER-åwesome' => 'français is über-åwesome',
+    );
+    if ($this->extendedMode) {
+      $testcase['ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΣὨ'] = 'αβγδεζηθικλμνξοσὠ';
+    }
+
+    foreach ($testcase as $input => $output) {
+      $this->assertEqual(drupal_strtolower($input), $output, t('%input is lowercased as %output', array('%input' => $input, '%output' => $output)));
+    }
+  }
+
+  function helperTestStrToUpper() {
+    $testcase = array(
+      'tHe QUIcK bRoWn' => 'THE QUICK BROWN',
+      'FrançAIS is ÜBER-åwesome' => 'FRANÇAIS IS ÜBER-ÅWESOME',
+    );
+    if ($this->extendedMode) {
+      $testcase['αβγδεζηθικλμνξοσὠ'] = 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΣὨ';
+    }
+
+    foreach ($testcase as $input => $output) {
+      $this->assertEqual(drupal_strtoupper($input), $output, t('%input is uppercased as %output', array('%input' => $input, '%output' => $output)));
+    }
+  }
+
+  function helperTestUcFirst() {
+    $testcase = array(
+      'tHe QUIcK bRoWn' => 'THe QUIcK bRoWn',
+      'françAIS' => 'FrançAIS',
+      'über' => 'Über',
+      'åwesome' => 'Åwesome'
+    );
+    if ($this->extendedMode) {
+      $testcase['σion'] = 'Σion';
+    }
+
+    foreach ($testcase as $input => $output) {
+      $this->assertEqual(drupal_ucfirst($input), $output, t('%input is ucfirst-ed as %output', array('%input' => $input, '%output' => $output)));
+    }
+  }
+
+  function helperTestStrLen() {
+    $testcase = array(
+      'tHe QUIcK bRoWn' => 15,
+      'ÜBER-åwesome' => 12,
+    );
+
+    foreach ($testcase as $input => $output) {
+      $this->assertEqual(drupal_strlen($input), $output, t('%input length is %output', array('%input' => $input, '%output' => $output)));
+    }
+  }
+
+  function helperTestSubStr() {
+    $testcase = array(
+      //     012345678901234567890123
+      array('frànçAIS is über-åwesome', 0, 1,
+            'f'),
+      array('frànçAIS is über-åwesome', 0, 8,
+            'frànçAIS'),
+      array('frànçAIS is über-åwesome', 0, 100,
+            'frànçAIS is über-åwesome'),
+      array('frànçAIS is über-åwesome', 4, 4,
+                'çAIS'),
+      array('frànçAIS is über-åwesome', 1, 0,
+            ''),
+      array('frànçAIS is über-åwesome', 100, 0,
+            ''),
+      array('frànçAIS is über-åwesome', -4, 2,
+                                'so'),
+      array('frànçAIS is über-åwesome', -7, 10,
+                             'åwesome'),
+      array('frànçAIS is über-åwesome', 5, -10,
+                 'AIS is üb'),
+
+    );
+
+    foreach ($testcase as $test) {
+      list($input, $start, $length, $output) = $test;
+      $this->assertEqual(drupal_substr($input, $start, $length), $output, t('%input substring-ed at offset %offset for %length characters is %output', array('%input' => $input, '%offset' => $start, '%length' => $length, '%output' => $output)));
+    }
+  }
+
+  /**
+   * Test decode_entities().
+   */
+  function testDecodeEntities() {
+    $testcase = array(
+      'Drupal' => 'Drupal',
+      '<script>' => '<script>',
+      '&lt;script&gt;' => '<script>',
+      '&amp;lt;script&amp;gt;' => '&lt;script&gt;',
+      '"' => '"',
+      '&#34;' => '"',
+      '&amp;#34;' => '&#34;',
+      '&quot;' => '"',
+      '&amp;quot;' => '&quot;',
+      "'" => "'",
+      '&#39;' => "'",
+      '&amp;#39;' => '&#39;',
+      '©' => '©',
+      '&copy;' => '©',
+      '&#169;' => '©',
+      '→' => '→',
+      '&#8594;' => '→',
+      '➼' => '➼',
+      '&#10172;' => '➼',
+    );
+    foreach ($testcase as $input => $output) {
+      $this->assertEqual(decode_entities($input), $output, t('Make sure the decoded entity of @input is @output', array('@input' => $input, '@output' => $output)));
+    }
+  }
+
+  function testDecodeEntitiesExclusion() {
+    $testcase = array(
+      'Drupal' => 'Drupal',
+      '<script>' => '<script>',
+      '&lt;script&gt;' => '&lt;script>',
+      '&amp;lt;script&amp;gt;' => '&amp;lt;script&amp;gt;',
+      '"' => '"',
+      '&#34;' => '&#34;',
+      '&amp;#34;' => '&amp;#34;',
+      '&quot;' => '&quot;',
+      '&amp;quot;' => '&amp;quot;',
+      "'" => "'",
+      '&#39;' => "'",
+      '&amp;#39;' => '&amp;#39;',
+      '©' => '©',
+      '&copy;' => '©',
+      '&#169;' => '©',
+      '→' => '→',
+      '&#8594;' => '→',
+      '➼' => '➼',
+      '&#10172;' => '➼',
+    );
+    $exclude = array('<', '&', '"');
+    foreach ($testcase as $input => $output) {
+      $this->assertIdentical(decode_entities($input, $exclude), $output, t('Make sure the decoded entity of %input, excluding %excludes, is %output', array('%input' => $input, '%excludes' => implode(',', $exclude), '%output' => $output)));
+    }
+  }
+}