Issue #2259 - Reimplement String.prototype.toLocale{Lower,Upper}Case per ECMAScript Intl specification

- Update make_unicode to output SpecialCasing
- Handle special casing
- Use realloc instead of malloc when resizing a newly created string buffer

Based-on: m-c 1318403, 1431957
2026-05-26 13:58:49 +00:00 · 2023-06-29 23:05:33 +02:00
parent 949f69ef4b
commit 8d97bd437a
14 changed files with 3198 additions and 1299 deletions
@@ -82,6 +82,7 @@ included_inclnames_to_ignore = set([
    'unicode/plurrule.h',       # ICU
    'unicode/timezone.h',       # ICU
    'unicode/ucal.h',           # ICU
+    'unicode/uchar.h',          # ICU
    'unicode/uclean.h',         # ICU
    'unicode/ucol.h',           # ICU
    'unicode/udat.h',           # ICU
@@ -731,6 +731,88 @@ function String_localeCompare(that) {
    return intl_CompareStrings(collator, S, That);
 }

+/**
+ * 13.1.2 String.prototype.toLocaleLowerCase ( [ locales ] )
+ *
+ * ES2017 Intl draft rev 94045d234762ad107a3d09bb6f7381a65f1a2f9b
+ */
+function String_toLocaleLowerCase() {
+    // Step 1.
+    RequireObjectCoercible(this);
+
+    // Step 2.
+    var string = ToString(this);
+
+    // Handle the common cases (no locales argument or a single string
+    // argument) first.
+    var locales = arguments.length > 0 ? arguments[0] : undefined;
+    var requestedLocale;
+    if (locales === undefined) {
+        // Steps 3, 6.
+        requestedLocale = undefined;
+    } else if (typeof locales === "string") {
+        // Steps 3, 5.
+        requestedLocale = ValidateAndCanonicalizeLanguageTag(locales);
+    } else {
+        // Step 3.
+        var requestedLocales = CanonicalizeLocaleList(locales);
+
+        // Steps 4-6.
+        requestedLocale = requestedLocales.length > 0 ? requestedLocales[0] : undefined;
+    }
+
+    // Trivial case: When the input is empty, directly return the empty string.
+    if (string.length === 0)
+        return "";
+
+    if (requestedLocale === undefined)
+        requestedLocale = DefaultLocale();
+
+    // Steps 7-16.
+    return intl_toLocaleLowerCase(string, requestedLocale);
+}
+
+/**
+ * 13.1.3 String.prototype.toLocaleUpperCase ( [ locales ] )
+ *
+ * ES2017 Intl draft rev 94045d234762ad107a3d09bb6f7381a65f1a2f9b
+ */
+function String_toLocaleUpperCase() {
+    // Step 1.
+    RequireObjectCoercible(this);
+
+    // Step 2.
+    var string = ToString(this);
+
+    // Handle the common cases (no locales argument or a single string
+    // argument) first.
+    var locales = arguments.length > 0 ? arguments[0] : undefined;
+    var requestedLocale;
+    if (locales === undefined) {
+        // Steps 3, 6.
+        requestedLocale = undefined;
+    } else if (typeof locales === "string") {
+        // Steps 3, 5.
+        requestedLocale = ValidateAndCanonicalizeLanguageTag(locales);
+    } else {
+        // Step 3.
+        var requestedLocales = CanonicalizeLocaleList(locales);
+
+        // Steps 4-6.
+        requestedLocale = requestedLocales.length > 0 ? requestedLocales[0] : undefined;
+    }
+
+    // Trivial case: When the input is empty, directly return the empty string.
+    if (string.length === 0)
+        return "";
+
+    if (requestedLocale === undefined)
+        requestedLocale = DefaultLocale();
+
+    // Steps 7-16.
+    return intl_toLocaleUpperCase(string, requestedLocale);
+}
+
 /* ES6 Draft May 22, 2014 21.1.2.4 */
 function String_static_raw(callSite, ...substitutions) {
    // Step 1 (implicit).
@@ -1014,13 +1096,15 @@ _SetCanonicalName(String_static_trimEnd, "trimEnd");
 function String_static_toLocaleLowerCase(string) {
    if (arguments.length < 1)
        ThrowTypeError(JSMSG_MISSING_FUN_ARG, 0, 'String.toLocaleLowerCase');
-    return callFunction(std_String_toLocaleLowerCase, string);
+    var locales = arguments.length > 1 ? arguments[1] : undefined;
+    return callFunction(String_toLocaleLowerCase, string, locales);
 }

 function String_static_toLocaleUpperCase(string) {
    if (arguments.length < 1)
        ThrowTypeError(JSMSG_MISSING_FUN_ARG, 0, 'String.toLocaleUpperCase');
-    return callFunction(std_String_toLocaleUpperCase, string);
+    var locales = arguments.length > 1 ? arguments[1] : undefined;
+    return callFunction(String_toLocaleUpperCase, string, locales);
 }

 function String_static_normalize(string) {
@@ -446,6 +446,64 @@ function CanonicalizeLanguageTag(locale) {
    return canonical;
 }

+
+/**
+ * Returns true if the input contains only ASCII alphabetical characters.
+ */
+function IsASCIIAlphaString(s) {
+    assert(typeof s === "string", "IsASCIIAlphaString");
+
+    for (var i = 0; i < s.length; i++) {
+        var c = callFunction(std_String_charCodeAt, s, i);
+        if (!((0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A)))
+            return false
+    }
+    return true;
+}
+
+
+/**
+ * Validates and canonicalizes the given language tag.
+ *
+ * |locale| must be a string (asserted). Throws a RangeError with
+ * JSMSG_INVALID_LANGUAGE_TAG when the tag is rejected, otherwise returns the
+ * canonicalized tag.
+ *
+ * Inputs of length two, or of length three whose second character is not
+ * "-", take a fast path that accepts only ASCII letters. Every other input
+ * is checked with IsStructurallyValidLanguageTag and canonicalized with
+ * CanonicalizeLanguageTag.
+ */
+function ValidateAndCanonicalizeLanguageTag(locale) {
+    assert(typeof locale === "string", "ValidateAndCanonicalizeLanguageTag");
+
+    // Handle the common case (a standalone language) first.
+    // Only the following BCP47 subset is accepted:
+    //   Language-Tag  = langtag
+    //   langtag       = language
+    //   language      = 2*3ALPHA ; shortest ISO 639 code
+    // For three character long strings we need to make sure it's not a
+    // private use only language tag, for example "x-x".
+    if (locale.length === 2 || (locale.length === 3 && locale[1] !== "-")) {
+        if (!IsASCIIAlphaString(locale))
+            ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);
+        assert(IsStructurallyValidLanguageTag(locale), "2*3ALPHA is a valid language tag");
+
+        // The language subtag is canonicalized to lower case.
+        locale = callFunction(std_String_toLowerCase, locale);
+
+        // langTagMappings doesn't contain any 2*3ALPHA keys, so we don't need
+        // to check for possible replacements in this map.
+        assert(!callFunction(std_Object_hasOwnProperty, langTagMappings, locale),
+               "langTagMappings contains no 2*3ALPHA mappings");
+
+        // Replace deprecated subtags with their preferred values.
+        locale = callFunction(std_Object_hasOwnProperty, langSubtagMappings, locale)
+                 ? langSubtagMappings[locale]
+                 : locale;
+        assert(locale === CanonicalizeLanguageTag(locale), "expected same canonicalization");
+
+        return locale;
+    }
+
+    // General case: full structural validation and canonicalization.
+    if (!IsStructurallyValidLanguageTag(locale))
+        ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);
+
+    return CanonicalizeLanguageTag(locale);
+}
+
+
 function localeContainsNoUnicodeExtensions(locale) {
    // No "-u-", no possible Unicode extension.
    if (callFunction(std_String_indexOf, locale, "-u-") === -1)
@@ -151,6 +151,10 @@ def readRegistry(registry):
    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

+    # ValidateAndCanonicalizeLanguageTag in Intl.js expects that
+    # langTagMappings contains no 2*3ALPHA keys.
+    assert all(len(lang) > 3 for lang in langTagMappings.iterkeys())
+
    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
@@ -5327,8 +5327,8 @@ JS_ResetDefaultLocale(JSContext* cx);
 * Locale specific string conversion and error message callbacks.
 */
 struct JSLocaleCallbacks {
-    JSLocaleToUpperCase     localeToUpperCase;
-    JSLocaleToLowerCase     localeToLowerCase;
+    JSLocaleToUpperCase     localeToUpperCase; // not used
+    JSLocaleToLowerCase     localeToLowerCase; // not used
    JSLocaleCompare         localeCompare; // not used
    JSLocaleToUnicode       localeToUnicode;
 };
@@ -365,6 +365,7 @@ struct JSContext : public js::ExclusiveContext,
    using ExclusiveContext::permanentAtoms;
    using ExclusiveContext::pod_calloc;
    using ExclusiveContext::pod_malloc;
+    using ExclusiveContext::pod_realloc;
    using ExclusiveContext::staticStrings;
    using ExclusiveContext::updateMallocCounter;
    using ExclusiveContext::wellKnownSymbols;
@@ -31,10 +31,12 @@
 #include "jsutil.h"

 #include "builtin/intl/ICUHeader.h"
+#include "builtin/intl/CommonFunctions.h"
 #include "builtin/RegExp.h"
 #include "jit/InlinableNatives.h"
 #include "js/Conversions.h"
 #include "js/UniquePtr.h"
+#include "unicode/uchar.h"
 #include "unicode/unorm2.h"
 #include "vm/GlobalObject.h"
 #include "vm/Interpreter.h"
@@ -598,19 +600,210 @@ js::SubstringKernel(JSContext* cx, HandleString str, int32_t beginInt, int32_t l
    return NewDependentString(cx, str, begin, len);
 }

+// Resizes the character buffer owned by |chars| through cx->pod_realloc and
+// returns an owning pointer to the resized buffer. |oldLength| and
+// |newLength| are forwarded to pod_realloc<CharT> unchanged; the callers in
+// this file pass lengths that include the null terminator.
+// If reallocation fails, the original buffer is freed here and an empty
+// pointer is returned, so the caller has nothing left to clean up.
+template <typename CharT>
+static auto
+ReallocChars(JSContext* cx, UniquePtr<CharT[], JS::FreePolicy> chars, size_t oldLength,
+             size_t newLength)
+  -> decltype(chars)
+{
+    using AnyCharPtr = decltype(chars);
+
+    // Give up smart-pointer ownership; from here on the raw pointer is
+    // managed by hand until it is wrapped again below.
+    CharT* oldChars = chars.release();
+    CharT* newChars = cx->pod_realloc<CharT>(oldChars, oldLength, newLength);
+    if (!newChars) {
+        // The old buffer is still ours to free when reallocation fails.
+        js_free(oldChars);
+        return AnyCharPtr();
+    }
+
+    return AnyCharPtr(newChars);
+}
+
+/**
+ * U+03A3 GREEK CAPITAL LETTER SIGMA has two different lower case mappings
+ * depending on its context:
+ * When it's preceded by a cased character and not followed by another cased
+ * character, its lower case form is U+03C2 GREEK SMALL LETTER FINAL SIGMA.
+ * Otherwise its lower case mapping is U+03C3 GREEK SMALL LETTER SIGMA.
+ *
+ * Unicode 9.0, §3.13 Default Case Algorithms
+ *
+ * |chars| and |length| describe the whole source string and |index| is the
+ * position of the U+03A3 code unit being lowercased (both asserted below).
+ * Returns the lower case code unit to emit for that position.
+ *
+ * Both scans decode well-formed surrogate pairs into a single code point
+ * before querying ICU; an unpaired surrogate is queried as-is. Characters
+ * with the Case_Ignorable property are skipped, and the first character
+ * that is not Case_Ignorable decides the outcome of each scan.
+ */
+static char16_t
+Final_Sigma(const char16_t* chars, size_t length, size_t index)
+{
+    MOZ_ASSERT(index < length);
+    MOZ_ASSERT(chars[index] == unicode::GREEK_CAPITAL_LETTER_SIGMA);
+    MOZ_ASSERT(unicode::ToLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA) ==
+               unicode::GREEK_SMALL_LETTER_SIGMA);
+
+    // Tell the analysis the BinaryProperty.contains function pointer called by
+    // u_hasBinaryProperty cannot GC.
+    JS::AutoSuppressGCAnalysis nogc;
+
+    // Backward scan: is the sigma preceded by a cased character?
+    bool precededByCased = false;
+    for (size_t i = index; i > 0; ) {
+        char16_t c = chars[--i];
+        uint32_t codePoint = c;
+        if (unicode::IsTrailSurrogate(c) && i > 0) {
+            char16_t lead = chars[i - 1];
+            if (unicode::IsLeadSurrogate(lead)) {
+                codePoint = unicode::UTF16Decode(lead, c);
+                i--;
+            }
+        }
+
+        // Ignore any characters with the property Case_Ignorable.
+        // NB: We need to skip over all Case_Ignorable characters, even when
+        // they also have the Cased binary property.
+        if (u_hasBinaryProperty(codePoint, UCHAR_CASE_IGNORABLE))
+            continue;
+
+        precededByCased = u_hasBinaryProperty(codePoint, UCHAR_CASED);
+        break;
+    }
+    // Without a preceding cased character this cannot be a final sigma.
+    if (!precededByCased)
+        return unicode::GREEK_SMALL_LETTER_SIGMA;
+
+    // Forward scan: is the sigma followed by a cased character?
+    bool followedByCased = false;
+    for (size_t i = index + 1; i < length; ) {
+        char16_t c = chars[i++];
+        uint32_t codePoint = c;
+        if (unicode::IsLeadSurrogate(c) && i < length) {
+            char16_t trail = chars[i];
+            if (unicode::IsTrailSurrogate(trail)) {
+                codePoint = unicode::UTF16Decode(c, trail);
+                i++;
+            }
+        }
+
+        // Ignore any characters with the property Case_Ignorable.
+        // NB: We need to skip over all Case_Ignorable characters, even when
+        // they also have the Cased binary property.
+        if (u_hasBinaryProperty(codePoint, UCHAR_CASE_IGNORABLE))
+            continue;
+
+        followedByCased = u_hasBinaryProperty(codePoint, UCHAR_CASED);
+        break;
+    }
+    // Preceded by a cased character and not followed by one: final sigma.
+    if (!followedByCased)
+        return unicode::GREEK_SMALL_LETTER_FINAL_SIGMA;
+
+    return unicode::GREEK_SMALL_LETTER_SIGMA;
+}
+
+// Latin-1 counterpart of Final_Sigma. It exists only so that the Latin-1
+// instantiation of ToLowerCaseImpl compiles; it asserts if it is ever reached
+// and returns 0 as a placeholder.
+static Latin1Char
+Final_Sigma(const Latin1Char* chars, size_t length, size_t index)
+{
+    MOZ_ASSERT_UNREACHABLE("U+03A3 is not a Latin-1 character");
+    return 0;
+}
+
+// Lower-cases |srcChars[startIndex..srcLength)| into |destChars|, writing the
+// output from |destChars[startIndex]| onwards and null-terminating it at
+// |destChars[destLength]|.
+//
+// If |srcLength == destLength| is true, the destination buffer was allocated
+// with the same size as the source buffer. When we append characters which
+// have special casing mappings, we test |srcLength == destLength| to decide
+// if we need to back out and reallocate a sufficiently large destination
+// buffer. Otherwise the destination buffer was allocated with the correct
+// size to hold all lower case mapped characters, i.e.
+// |destLength == ToLowerCaseLength(srcChars, 0, srcLength)| is true.
+//
+// Returns |srcLength| when the whole input was converted. Otherwise returns
+// the index of the first character that did not fit (U+0130 in a same-sized
+// buffer); the caller is expected to grow the buffer and call again with that
+// index as |startIndex|.
+template <typename CharT>
+static size_t
+ToLowerCaseImpl(CharT* destChars, const CharT* srcChars, size_t startIndex, size_t srcLength,
+                size_t destLength)
+{
+    MOZ_ASSERT(startIndex < srcLength);
+    MOZ_ASSERT(srcLength <= destLength);
+    MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), srcLength == destLength);
+
+    // |i| indexes the source, |j| the destination; they diverge once a
+    // character expands to more than one code unit.
+    size_t j = startIndex;
+    for (size_t i = startIndex; i < srcLength; i++) {
+        char16_t c = srcChars[i];
+        if (!IsSame<CharT, Latin1Char>::value) {
+            // Surrogate pair: only the trail unit is mapped, the lead unit is
+            // copied unchanged.
+            if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
+                char16_t trail = srcChars[i + 1];
+                if (unicode::IsTrailSurrogate(trail)) {
+                    trail = unicode::ToLowerCaseNonBMPTrail(c, trail);
+                    destChars[j++] = c;
+                    destChars[j++] = trail;
+                    i++;
+                    continue;
+                }
+            }
+
+            // Special case: U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
+            // lowercases to <U+0069 U+0307>.
+            if (c == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+                // Return if the output buffer is too small.
+                if (srcLength == destLength)
+                    return i;
+
+                destChars[j++] = CharT('i');
+                destChars[j++] = CharT(unicode::COMBINING_DOT_ABOVE);
+                continue;
+            }
+
+            // Special case: U+03A3 GREEK CAPITAL LETTER SIGMA lowercases to
+            // one of two codepoints depending on context.
+            if (c == unicode::GREEK_CAPITAL_LETTER_SIGMA) {
+                destChars[j++] = Final_Sigma(srcChars, srcLength, i);
+                continue;
+            }
+        }
+
+        // Everything else uses the simple one-to-one lower case mapping.
+        c = unicode::ToLowerCase(c);
+        MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
+        destChars[j++] = c;
+    }
+
+    MOZ_ASSERT(j == destLength);
+    destChars[destLength] = '\0';
+
+    return srcLength;
+}
+
+// Returns the number of code units needed to hold the lower case form of
+// |chars[0..length)|, counting expansions only from |startIndex| onwards.
+// The only expanding character is U+0130, which is lowercased to the
+// two-element sequence <U+0069 U+0307> and therefore needs one extra unit.
+static size_t
+ToLowerCaseLength(const char16_t* chars, size_t startIndex, size_t length)
+{
+    size_t extraUnits = 0;
+    size_t pos = startIndex;
+    while (pos < length) {
+        if (chars[pos++] == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
+            extraUnits++;
+    }
+    return length + extraUnits;
+}
+
+// Latin-1 counterpart of ToLowerCaseLength. It exists only so that the
+// Latin-1 instantiation of ToLowerCase compiles; it asserts if it is ever
+// reached and returns 0 as a placeholder.
+static size_t
+ToLowerCaseLength(const Latin1Char* chars, size_t startIndex, size_t length)
+{
+    MOZ_ASSERT_UNREACHABLE("never called for Latin-1 strings");
+    return 0;
+}
+
 template <typename CharT>
 static JSString*
 ToLowerCase(JSContext* cx, JSLinearString* str)
 {
-    // Unlike toUpperCase, toLowerCase has the nice invariant that if the input
-    // is a Latin1 string, the output is also a Latin1 string.
-    UniquePtr<CharT[], JS::FreePolicy> newChars;
-    size_t length = str->length();
+    // Unlike toUpperCase, toLowerCase has the nice invariant that if the
+    // input is a Latin-1 string, the output is also a Latin-1 string.
+    using AnyCharPtr = UniquePtr<CharT[], JS::FreePolicy>;
+
+    AnyCharPtr newChars;
+    const size_t length = str->length();
+    size_t resultLength;
    {
        AutoCheckCannotGC nogc;
        const CharT* chars = str->chars<CharT>(nogc);

-        // Look for the first upper case character.
+        // We don't need extra special casing checks in the loop below,
+        // because U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+03A3
+        // GREEK CAPITAL LETTER SIGMA already have simple lower case mappings.
+        MOZ_ASSERT(unicode::CanLowerCase(unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE),
+                   "U+0130 has a simple lower case mapping");
+        MOZ_ASSERT(unicode::CanLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA),
+                   "U+03A3 has a simple lower case mapping");
+
+        // Look for the first character that changes when lowercased.
        size_t i = 0;
        for (; i < length; i++) {
            char16_t c = chars[i];
@@ -630,40 +823,35 @@ ToLowerCase(JSContext* cx, JSLinearString* str)
                break;
        }

-        // If all characters are lower case, return the input string.
+        // If no character needs to change, return the input string.
        if (i == length)
            return str;

-        newChars = cx->make_pod_array<CharT>(length + 1);
+        resultLength = length;
+        newChars = cx->make_pod_array<CharT>(resultLength + 1);
        if (!newChars)
            return nullptr;

        PodCopy(newChars.get(), chars, i);

-        for (; i < length; i++) {
-            char16_t c = chars[i];
-            if (!IsSame<CharT, Latin1Char>::value) {
-                if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
-                    char16_t trail = chars[i + 1];
-                    if (unicode::IsTrailSurrogate(trail)) {
-                        trail = unicode::ToLowerCaseNonBMPTrail(c, trail);
-                        newChars[i] = c;
-                        newChars[i + 1] = trail;
-                        i++;
-                        continue;
-                    }
-                }
-            }
+        size_t readChars = ToLowerCaseImpl(newChars.get(), chars, i, length, resultLength);
+        if (readChars < length) {
+            MOZ_ASSERT((!IsSame<CharT, Latin1Char>::value),
+                       "Latin-1 strings don't have special lower case mappings");
+            resultLength = ToLowerCaseLength(chars, readChars, length);

-            c = unicode::ToLowerCase(c);
-            MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
-            newChars[i] = c;
+            AnyCharPtr buf = ReallocChars(cx, Move(newChars), length + 1, resultLength + 1);
+            if (!buf)
+                return nullptr;
+
+            newChars = Move(buf);
+
+            MOZ_ALWAYS_TRUE(length ==
+                ToLowerCaseImpl(newChars.get(), chars, readChars, length, resultLength));
        }
-
-        newChars[length] = 0;
    }

-    JSString* res = NewStringDontDeflate<CanGC>(cx, newChars.get(), length);
+    JSString* res = NewStringDontDeflate<CanGC>(cx, newChars.get(), resultLength);
    if (!res)
        return nullptr;

@@ -671,21 +859,102 @@ ToLowerCase(JSContext* cx, JSLinearString* str)
    return res;
 }

-static inline bool
-ToLowerCaseHelper(JSContext* cx, const CallArgs& args)
+JSString*
+js::StringToLowerCase(JSContext* cx, HandleLinearString string)
 {
+    if (string->hasLatin1Chars())
+        return ToLowerCase<Latin1Char>(cx, string);
+    return ToLowerCase<char16_t>(cx, string);
+}
+
+bool
+js::str_toLowerCase(JSContext* cx, unsigned argc, Value* vp)
+{
+    CallArgs args = CallArgsFromVp(argc, vp);
+
    RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
    if (!str)
        return false;

-    JSLinearString* linear = str->ensureLinear(cx);
+    RootedLinearString linear(cx, str->ensureLinear(cx));
    if (!linear)
        return false;

-    if (linear->hasLatin1Chars())
-        str = ToLowerCase<Latin1Char>(cx, linear);
-    else
-        str = ToLowerCase<char16_t>(cx, linear);
+    JSString* result = StringToLowerCase(cx, linear);
+    if (!result)
+        return false;
+
+    args.rval().setString(result);
+    return true;
+}
+
+// Maps the language tag in |str| to the locale name used for case mapping.
+// Returns a pointer to one of the static strings "lt", "tr" or "az" when the
+// tag's language subtag is exactly one of those, and "" (the ICU root locale)
+// for every other tag. Returns nullptr if |str| cannot be linearized.
+// The returned pointer refers to static storage and must not be freed.
+static const char*
+CaseMappingLocale(JSContext* cx, JSString* str)
+{
+    JSLinearString* locale = str->ensureLinear(cx);
+    if (!locale)
+        return nullptr;
+
+    MOZ_ASSERT(locale->length() >= 2, "locale is a valid language tag");
+
+    // Lithuanian, Turkish, and Azeri have language dependent case mappings.
+    static const char languagesWithSpecialCasing[][3] = { "lt", "tr", "az" };
+
+    // All strings in |languagesWithSpecialCasing| are of length two, so we
+    // only need to compare the first two characters to find a matching locale.
+    // ES2017 Intl, §9.2.2 BestAvailableLocale
+    // The length/'-' test ensures the language subtag itself is two
+    // characters long, so longer subtags sharing a prefix do not match.
+    if (locale->length() == 2 || locale->latin1OrTwoByteChar(2) == '-') {
+        for (const auto& language : languagesWithSpecialCasing) {
+            if (locale->latin1OrTwoByteChar(0) == language[0] &&
+                locale->latin1OrTwoByteChar(1) == language[1])
+            {
+                return language;
+            }
+        }
+    }
+
+    return ""; // ICU root locale
+}
+
+bool
+js::intl_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp)
+{
+    CallArgs args = CallArgsFromVp(argc, vp);
+    MOZ_ASSERT(args.length() == 2);
+    MOZ_ASSERT(args[0].isString());
+    MOZ_ASSERT(args[1].isString());
+
+    RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx));
+    if (!linear)
+        return false;
+
+    const char* locale = CaseMappingLocale(cx, args[1].toString());
+    if (!locale)
+        return false;
+
+    // Call String.prototype.toLowerCase() for language independent casing.
+    if (intl::StringsAreEqual(locale, "")) {
+        JSString* str = StringToLowerCase(cx, linear);
+        if (!str)
+            return false;
+
+        args.rval().setString(str);
+        return true;
+    }
+
+    AutoStableStringChars inputChars(cx);
+    if (!inputChars.initTwoByte(cx, linear))
+        return false;
+    mozilla::Range<const char16_t> input = inputChars.twoByteRange();
+
+    // Maximum case mapping length is three characters.
+    static_assert(JSString::MAX_LENGTH < INT32_MAX / 3,
+                  "Case conversion doesn't overflow int32_t indices");
+
+    JSString* str = intl::CallICU(cx, [&input, locale](UChar* chars, int32_t size, UErrorCode* status) {
+        return u_strToLower(chars, size, Char16ToUChar(input.begin().get()), input.length(),
+                            locale, status);
+    });
    if (!str)
        return false;

@@ -693,82 +962,192 @@ ToLowerCaseHelper(JSContext* cx, const CallArgs& args)
    return true;
 }

-bool
-js::str_toLowerCase(JSContext* cx, unsigned argc, Value* vp)
+static inline bool
+CanUpperCaseSpecialCasing(Latin1Char charCode)
 {
-    return ToLowerCaseHelper(cx, CallArgsFromVp(argc, vp));
+    // Handle U+00DF LATIN SMALL LETTER SHARP S inline, all other Latin-1
+    // characters don't have special casing rules.
+    MOZ_ASSERT_IF(charCode != unicode::LATIN_SMALL_LETTER_SHARP_S,
+                  !unicode::CanUpperCaseSpecialCasing(charCode));
+
+    return charCode == unicode::LATIN_SMALL_LETTER_SHARP_S;
 }

-bool
-js::str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp)
+static inline bool
+CanUpperCaseSpecialCasing(char16_t charCode)
 {
-    CallArgs args = CallArgsFromVp(argc, vp);
-
-    /*
-     * Forcefully ignore the first (or any) argument and return toLowerCase(),
-     * ECMA has reserved that argument, presumably for defining the locale.
-     */
-    if (cx->runtime()->localeCallbacks && cx->runtime()->localeCallbacks->localeToLowerCase) {
-        RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
-        if (!str)
-            return false;
-
-        RootedValue result(cx);
-        if (!cx->runtime()->localeCallbacks->localeToLowerCase(cx, str, &result))
-            return false;
-
-        args.rval().set(result);
-        return true;
-    }
-
-    return ToLowerCaseHelper(cx, args);
+    return unicode::CanUpperCaseSpecialCasing(charCode);
 }

+// Returns the number of code units in the special-cased upper case form of
+// the Latin-1 character |charCode|. Only U+00DF may be passed (asserted).
+static inline size_t
+LengthUpperCaseSpecialCasing(Latin1Char charCode)
+{
+    // U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'.
+    MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);
+
+    return 2;
+}
+
+// Two-byte overload: forwards to the unicode:: helper of the same name.
+// |charCode| must be a character with a special upper case mapping (asserted).
+static inline size_t
+LengthUpperCaseSpecialCasing(char16_t charCode)
+{
+    MOZ_ASSERT(CanUpperCaseSpecialCasing(charCode));
+
+    return unicode::LengthUpperCaseSpecialCasing(charCode);
+}
+
+// Writes the special-cased upper case form of |charCode| into the Latin-1
+// buffer |elements| starting at |*index| and advances |*index| past the
+// written characters. Only U+00DF may be passed (asserted).
+static inline void
+AppendUpperCaseSpecialCasing(char16_t charCode, Latin1Char* elements, size_t* index)
+{
+    // U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'.
+    MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);
+    static_assert('S' <= JSString::MAX_LATIN1_CHAR, "'S' is a Latin-1 character");
+
+    elements[(*index)++] = 'S';
+    elements[(*index)++] = 'S';
+}
+
+// Two-byte overload: forwards to the unicode:: helper of the same name.
+static inline void
+AppendUpperCaseSpecialCasing(char16_t charCode, char16_t* elements, size_t* index)
+{
+    unicode::AppendUpperCaseSpecialCasing(charCode, elements, index);
+}
+
+// See ToLowerCaseImpl for an explanation of the parameters.
 template <typename DestChar, typename SrcChar>
-static void
-ToUpperCaseImpl(DestChar* destChars, const SrcChar* srcChars, size_t firstLowerCase, size_t length)
+static size_t
+ToUpperCaseImpl(DestChar* destChars, const SrcChar* srcChars, size_t startIndex, size_t srcLength,
+                size_t destLength)
 {
-    MOZ_ASSERT(firstLowerCase < length);
+    static_assert(IsSame<SrcChar, Latin1Char>::value || !IsSame<DestChar, Latin1Char>::value,
+                  "cannot write non-Latin-1 characters into Latin-1 string");
+    MOZ_ASSERT(startIndex < srcLength);
+    MOZ_ASSERT(srcLength <= destLength);

-    for (size_t i = 0; i < firstLowerCase; i++)
-        destChars[i] = srcChars[i];
-
-    for (size_t i = firstLowerCase; i < length; i++) {
+    size_t j = startIndex;
+    for (size_t i = startIndex; i < srcLength; i++) {
        char16_t c = srcChars[i];
        if (!IsSame<DestChar, Latin1Char>::value) {
-            if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
+            if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
                char16_t trail = srcChars[i + 1];
                if (unicode::IsTrailSurrogate(trail)) {
                    trail = unicode::ToUpperCaseNonBMPTrail(c, trail);
-                    destChars[i] = c;
-                    destChars[i + 1] = trail;
+                    destChars[j++] = c;
+                    destChars[j++] = trail;
                    i++;
                    continue;
                }
            }
        }
+
+        if (MOZ_UNLIKELY(c > 0x7f && CanUpperCaseSpecialCasing(static_cast<SrcChar>(c)))) {
+            // Return if the output buffer is too small.
+            if (srcLength == destLength)
+                return i;
+
+            AppendUpperCaseSpecialCasing(c, destChars, &j);
+            continue;
+        }
+
        c = unicode::ToUpperCase(c);
        MOZ_ASSERT_IF((IsSame<DestChar, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
-        destChars[i] = c;
+        destChars[j++] = c;
    }

-    destChars[length] = '\0';
+    MOZ_ASSERT(j == destLength);
+    destChars[destLength] = '\0';
+
+    return srcLength;
+}
+
+// Non-template overload for the (Latin-1 destination, two-byte source)
+// combination, so that this combination never instantiates the template
+// above and hits its static_assert. The combination is still named by
+// ToUpperCase<Latin1Char> when it is compiled for two-byte input, but it
+// is never executed.
+//
+// The return type matches the template (size_t: number of source characters
+// consumed) so that callers see one consistent contract; 0 is returned as a
+// placeholder meaning "nothing was converted".
+static size_t
+ToUpperCaseImpl(Latin1Char* destChars, const char16_t* srcChars, size_t startIndex,
+                size_t srcLength, size_t destLength)
+{
+    MOZ_ASSERT_UNREACHABLE("cannot write non-Latin-1 characters into Latin-1 string");
+    return 0;
+}
+
+// Returns the number of code units needed to hold the upper case form of
+// |chars[0..length)|, counting expansions only from |startIndex| onwards.
+// Characters with a special upper case mapping contribute the length of
+// that mapping instead of a single unit.
+template <typename CharT>
+static size_t
+ToUpperCaseLength(const CharT* chars, size_t startIndex, size_t length)
+{
+    size_t total = length;
+    size_t pos = startIndex;
+    while (pos < length) {
+        CharT unit = chars[pos++];
+
+        // ASCII never has a special casing mapping.
+        if (unit <= 0x7f)
+            continue;
+        if (!CanUpperCaseSpecialCasing(unit))
+            continue;
+
+        total += LengthUpperCaseSpecialCasing(unit) - 1;
+    }
+    return total;
+}
+
+// Copies |length| characters from |srcChars| to |destChars| when the two
+// character types differ, converting element by element through assignment.
+template <typename DestChar, typename SrcChar>
+static inline void
+CopyChars(DestChar* destChars, const SrcChar* srcChars, size_t length)
+{
+    static_assert(!IsSame<DestChar, SrcChar>::value, "PodCopy is used for the same type case");
+    for (size_t i = 0; i < length; i++)
+        destChars[i] = srcChars[i];
+}
+
+// Same-type overload: delegates the copy of |length| characters to PodCopy.
+template <typename CharT>
+static inline void
+CopyChars(CharT* destChars, const CharT* srcChars, size_t length)
+{
+    PodCopy(destChars, srcChars, length);
+}
+
+// Upper-cases |chars[startIndex..length)| into a newly allocated,
+// null-terminated DestChar buffer; the first |startIndex| characters are
+// copied unchanged. On success the buffer is returned and |*resultLength|
+// holds its length excluding the terminator. An empty pointer is returned
+// when allocating or growing the buffer fails.
+template <typename DestChar, typename SrcChar>
+static inline UniquePtr<DestChar[], JS::FreePolicy>
+ToUpperCase(JSContext* cx, const SrcChar* chars, size_t startIndex, size_t length,
+            size_t* resultLength)
+{
+    MOZ_ASSERT(startIndex < length);
+
+    using DestCharPtr = UniquePtr<DestChar[], JS::FreePolicy>;
+
+    // First attempt: assume the result is exactly as long as the input.
+    *resultLength = length;
+    DestCharPtr buf = cx->make_pod_array<DestChar>(length + 1);
+    if (!buf)
+        return buf;
+
+    CopyChars(buf.get(), chars, startIndex);
+
+    size_t readChars = ToUpperCaseImpl(buf.get(), chars, startIndex, length, length);
+    if (readChars < length) {
+        // A character with a special casing mapping did not fit. Compute the
+        // exact result length, grow the buffer and resume at |readChars|.
+        size_t actualLength = ToUpperCaseLength(chars, readChars, length);
+
+        *resultLength = actualLength;
+        DestCharPtr buf2 = ReallocChars(cx, Move(buf), length + 1, actualLength + 1);
+        if (!buf2)
+            return buf2;
+
+        buf = Move(buf2);
+
+        MOZ_ALWAYS_TRUE(length ==
+            ToUpperCaseImpl(buf.get(), chars, readChars, length, actualLength));
+    }
+
+    return buf;
+}

 template <typename CharT>
 static JSString*
 ToUpperCase(JSContext* cx, JSLinearString* str)
 {
-    typedef UniquePtr<Latin1Char[], JS::FreePolicy> Latin1CharPtr;
-    typedef UniquePtr<char16_t[], JS::FreePolicy> TwoByteCharPtr;
+    using Latin1CharPtr = UniquePtr<Latin1Char[], JS::FreePolicy>;
+    using TwoByteCharPtr = UniquePtr<char16_t[], JS::FreePolicy>;

    mozilla::MaybeOneOf<Latin1CharPtr, TwoByteCharPtr> newChars;
-    size_t length = str->length();
+    const size_t length = str->length();
+    size_t resultLength;
    {
        AutoCheckCannotGC nogc;
        const CharT* chars = str->chars<CharT>(nogc);

-        // Look for the first lower case character.
+        // Look for the first character that changes when uppercased.
        size_t i = 0;
        for (; i < length; i++) {
            char16_t c = chars[i];
@@ -786,21 +1165,33 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
            }
            if (unicode::CanUpperCase(c))
                break;
+            if (MOZ_UNLIKELY(c > 0x7f && CanUpperCaseSpecialCasing(static_cast<CharT>(c))))
+                break;
        }

-        // If all characters are upper case, return the input string.
+        // If no character needs to change, return the input string.
        if (i == length)
            return str;

-        // If the string is Latin1, check if it contains the MICRO SIGN (0xb5)
-        // or SMALL LETTER Y WITH DIAERESIS (0xff) character. The corresponding
-        // upper case characters are not in the Latin1 range.
+        // The string changes when uppercased, so we must create a new string.
+        // Can it be Latin-1?
+        //
+        // If the original string is Latin-1, it can -- unless the string
+        // contains U+00B5 MICRO SIGN or U+00FF SMALL LETTER Y WITH DIAERESIS,
+        // the only Latin-1 codepoints that don't uppercase within Latin-1.
+        // Search for those codepoints to decide whether the new string can be
+        // Latin-1.
+        // If the original string is a two-byte string, its uppercase form is
+        // so rarely Latin-1 that we don't even consider creating a new
+        // Latin-1 string.
        bool resultIsLatin1;
        if (IsSame<CharT, Latin1Char>::value) {
            resultIsLatin1 = true;
            for (size_t j = i; j < length; j++) {
                Latin1Char c = chars[j];
-                if (c == 0xb5 || c == 0xff) {
+                if (c == unicode::MICRO_SIGN ||
+                    c == unicode::LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)
+                {
                    MOZ_ASSERT(unicode::ToUpperCase(c) > JSString::MAX_LATIN1_CHAR);
                    resultIsLatin1 = false;
                    break;
@@ -813,31 +1204,29 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
        }

        if (resultIsLatin1) {
-            Latin1CharPtr buf = cx->make_pod_array<Latin1Char>(length + 1);
+            Latin1CharPtr buf = ToUpperCase<Latin1Char>(cx, chars, i, length, &resultLength);
            if (!buf)
                return nullptr;

-            ToUpperCaseImpl(buf.get(), chars, i, length);
            newChars.construct<Latin1CharPtr>(Move(buf));
        } else {
-            TwoByteCharPtr buf = cx->make_pod_array<char16_t>(length + 1);
+            TwoByteCharPtr buf = ToUpperCase<char16_t>(cx, chars, i, length, &resultLength);
            if (!buf)
                return nullptr;

-            ToUpperCaseImpl(buf.get(), chars, i, length);
            newChars.construct<TwoByteCharPtr>(Move(buf));
        }
    }

    JSString* res;
    if (newChars.constructed<Latin1CharPtr>()) {
-        res = NewStringDontDeflate<CanGC>(cx, newChars.ref<Latin1CharPtr>().get(), length);
+        res = NewStringDontDeflate<CanGC>(cx, newChars.ref<Latin1CharPtr>().get(), resultLength);
        if (!res)
            return nullptr;

        mozilla::Unused << newChars.ref<Latin1CharPtr>().release();
    } else {
-        res = NewStringDontDeflate<CanGC>(cx, newChars.ref<TwoByteCharPtr>().get(), length);
+        res = NewStringDontDeflate<CanGC>(cx, newChars.ref<TwoByteCharPtr>().get(), resultLength);
        if (!res)
            return nullptr;

@@ -847,57 +1236,79 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
    return res;
 }

-static bool
-ToUpperCaseHelper(JSContext* cx, const CallArgs& args)
+JSString*
+js::StringToUpperCase(JSContext* cx, HandleLinearString string)
 {
-    RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
-    if (!str)
-        return false;
-
-    JSLinearString* linear = str->ensureLinear(cx);
-    if (!linear)
-        return false;
-
-    if (linear->hasLatin1Chars())
-        str = ToUpperCase<Latin1Char>(cx, linear);
-    else
-        str = ToUpperCase<char16_t>(cx, linear);
-    if (!str)
-        return false;
-
-    args.rval().setString(str);
-    return true;
+    if (string->hasLatin1Chars())
+        return ToUpperCase<Latin1Char>(cx, string);
+    return ToUpperCase<char16_t>(cx, string);
 }

 bool
 js::str_toUpperCase(JSContext* cx, unsigned argc, Value* vp)
 {
-    return ToUpperCaseHelper(cx, CallArgsFromVp(argc, vp));
+    CallArgs args = CallArgsFromVp(argc, vp);
+
+    RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
+    if (!str)
+        return false;
+
+    RootedLinearString linear(cx, str->ensureLinear(cx));
+    if (!linear)
+        return false;
+
+    JSString* result = StringToUpperCase(cx, linear);
+    if (!result)
+        return false;
+
+    args.rval().setString(result);
+    return true;
 }

 bool
-js::str_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp)
+js::intl_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp)
 {
    CallArgs args = CallArgsFromVp(argc, vp);
+    MOZ_ASSERT(args.length() == 2);
+    MOZ_ASSERT(args[0].isString());
+    MOZ_ASSERT(args[1].isString());

-    /*
-     * Forcefully ignore the first (or any) argument and return toUpperCase(),
-     * ECMA has reserved that argument, presumably for defining the locale.
-     */
-    if (cx->runtime()->localeCallbacks && cx->runtime()->localeCallbacks->localeToUpperCase) {
-        RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
+    RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx));
+    if (!linear)
+        return false;
+
+    const char* locale = CaseMappingLocale(cx, args[1].toString());
+    if (!locale)
+        return false;
+
+    // Call String.prototype.toUpperCase() for language independent casing.
+    if (intl::StringsAreEqual(locale, "")) {
+        JSString* str = StringToUpperCase(cx, linear);
        if (!str)
            return false;

-        RootedValue result(cx);
-        if (!cx->runtime()->localeCallbacks->localeToUpperCase(cx, str, &result))
-            return false;
-
-        args.rval().set(result);
+        args.rval().setString(str);
        return true;
    }

-    return ToUpperCaseHelper(cx, args);
+    AutoStableStringChars inputChars(cx);
+    if (!inputChars.initTwoByte(cx, linear))
+        return false;
+    mozilla::Range<const char16_t> input = inputChars.twoByteRange();
+
+    // Maximum case mapping length is three characters.
+    static_assert(JSString::MAX_LENGTH < INT32_MAX / 3,
+                  "Case conversion doesn't overflow int32_t indices");
+
+    JSString* str = intl::CallICU(cx, [&input, locale](UChar* chars, int32_t size, UErrorCode* status) {
+        return u_strToUpper(chars, size, Char16ToUChar(input.begin().get()), input.length(),
+                            locale, status);
+    });
+    if (!str)
+        return false;
+
+    args.rval().setString(str);
+    return true;
 }

 /* ES2017 21.1.3.12. */
@@ -944,7 +1355,7 @@ js::str_normalize(JSContext* cx, unsigned argc, Value* vp)
    if (!linear)
        return false;

-    // Latin1 strings are already in Normalization Form C.
+    // Latin-1 strings are already in Normalization Form C.
    if (form == NFC && linear->hasLatin1Chars()) {
        // Step 7.
        args.rval().setString(str);
@@ -1359,7 +1770,7 @@ StringMatch(const TextChar* text, uint32_t textLen, const PatChar* pat, uint32_t
    /*
     * For big patterns with large potential overlap we want the SIMD-optimized
     * speed of memcmp. For small patterns, a simple loop is faster. We also can't
-     * use memcmp if one of the strings is TwoByte and the other is Latin1.
+     * use memcmp if one of the strings is TwoByte and the other is Latin-1.
     *
     * FIXME: Linux memcmp performance is sad and the manual loop is faster.
     */
@@ -1555,7 +1966,7 @@ RopeMatch(JSContext* cx, JSRope* text, JSLinearString* pat, int* match)
     * need to build the list of leaf nodes. Do both here: iterate over the
     * nodes so long as there are not too many.
     *
-     * We also don't use rope matching if the rope contains both Latin1 and
+     * We also don't use rope matching if the rope contains both Latin-1 and
     * TwoByte nodes, to simplify the match algorithm.
     */
    {
@@ -2890,8 +3301,8 @@ static const JSFunctionSpec string_methods[] = {
    JS_FN("trimStart",         str_trimStart,         0,0),
    JS_FN("trimRight",         str_trimEnd,           0,0),
    JS_FN("trimEnd",           str_trimEnd,           0,0),
-    JS_FN("toLocaleLowerCase", str_toLocaleLowerCase, 0,0),
-    JS_FN("toLocaleUpperCase", str_toLocaleUpperCase, 0,0),
+    JS_SELF_HOSTED_FN("toLocaleLowerCase", "String_toLocaleLowerCase", 0,0),
+    JS_SELF_HOSTED_FN("toLocaleUpperCase", "String_toLocaleUpperCase", 0,0),
    JS_SELF_HOSTED_FN("localeCompare", "String_localeCompare", 1,0),
    JS_SELF_HOSTED_FN("repeat", "String_repeat",      1,0),
    JS_FN("normalize",         str_normalize,         0,0),
@@ -3000,7 +3411,7 @@ js::str_fromCharCode(JSContext* cx, unsigned argc, Value* vp)
    // string (thin or fat) and so we don't need to malloc the chars. (We could
    // cover some cases where args.length() goes up to
    // JSFatInlineString::MAX_LENGTH_LATIN1 if we also checked if the chars are
-    // all Latin1, but it doesn't seem worth the effort.)
+    // all Latin-1, but it doesn't seem worth the effort.)
    if (args.length() <= JSFatInlineString::MAX_LENGTH_TWO_BYTE)
        return str_fromCharCode_few_args(cx, args);

@@ -3143,7 +3554,7 @@ js::str_fromCodePoint(JSContext* cx, unsigned argc, Value* vp)
    // string (thin or fat) and so we don't need to malloc the chars. (We could
    // cover some cases where |args.length()| goes up to
    // JSFatInlineString::MAX_LENGTH_LATIN1 / 2 if we also checked if the chars
-    // are all Latin1, but it doesn't seem worth the effort.)
+    // are all Latin-1, but it doesn't seem worth the effort.)
    if (args.length() <= JSFatInlineString::MAX_LENGTH_TWO_BYTE / 2)
        return str_fromCodePoint_few_args(cx, args);

@@ -371,11 +371,24 @@ str_trimStart(JSContext* cx, unsigned argc, Value* vp);
 extern bool
 str_trimEnd(JSContext* cx, unsigned argc, Value* vp);

-extern bool
-str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp);
+/**
+ * Returns the input string converted to lower case based on the language
+ * specific case mappings for the input locale.
+ *
+ * Usage: lowerCase = intl_toLocaleLowerCase(string, locale)
+ */
+extern MOZ_MUST_USE bool
+intl_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp);
+
+/**
+ * Returns the input string converted to upper case based on the language
+ * specific case mappings for the input locale.
+ *
+ * Usage: upperCase = intl_toLocaleUpperCase(string, locale)
+ */
+extern MOZ_MUST_USE bool
+intl_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp);

-extern bool
-str_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp);

 extern bool
 str_normalize(JSContext* cx, unsigned argc, Value* vp);
@@ -480,6 +493,12 @@ JSString*
 str_replaceAll_string_raw(JSContext* cx, HandleString string, HandleString pattern,
                       HandleString replacement);

+extern JSString*
+StringToLowerCase(JSContext* cx, HandleLinearString string);
+
+extern JSString*
+StringToUpperCase(JSContext* cx, HandleLinearString string);
+
 extern bool
 StringConstructor(JSContext* cx, unsigned argc, Value* vp);

@@ -2207,11 +2207,9 @@ static const JSFunctionSpec intrinsic_functions[] = {
    JS_FN("std_String_trimStart",                str_trimStart,                0,0),
    JS_FN("std_String_trimRight",                str_trimEnd,                  0,0),
    JS_FN("std_String_trimEnd",                  str_trimEnd,                  0,0),
-    JS_FN("std_String_toLocaleLowerCase",        str_toLocaleLowerCase,        0,0),
-    JS_FN("std_String_toLocaleUpperCase",        str_toLocaleUpperCase,        0,0),
    JS_FN("std_String_normalize",                str_normalize,                0,0),
    JS_FN("std_String_concat",                   str_concat,                   1,0),
-    
+
    JS_FN("std_TypedArray_buffer",               js::TypedArray_bufferGetter,  1,0),

    JS_FN("std_WeakMap_has",                     WeakMap_has,                  1,0),
@@ -2485,6 +2483,8 @@ static const JSFunctionSpec intrinsic_functions[] = {
    JS_FN("intl_PluralRules_availableLocales", intl_PluralRules_availableLocales, 0,0),
    JS_FN("intl_GetPluralCategories", intl_GetPluralCategories, 2, 0),
    JS_FN("intl_SelectPluralRule", intl_SelectPluralRule, 2,0),
+    JS_FN("intl_toLocaleLowerCase", intl_toLocaleLowerCase, 2,0),
+    JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2,0),
    JS_FN("intl_RelativeTimeFormat_availableLocales", intl_RelativeTimeFormat_availableLocales, 0,0),
    JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 3,0),

@@ -0,0 +1,281 @@
+# SpecialCasing-11.0.0.txt
+# Date: 2018-02-22, 06:16:47 GMT
+# © 2018 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+#   For documentation, see http://www.unicode.org/reports/tr44/
+#
+# Special Casing
+#
+# This file is a supplement to the UnicodeData.txt file. It does not define any
+# properties, but rather provides additional information about the casing of
+# Unicode characters, for situations when casing incurs a change in string length
+# or is dependent on context or locale. For compatibility, the UnicodeData.txt
+# file only contains simple case mappings for characters where they are one-to-one
+# and independent of context and language. The data in this file, combined with
+# the simple case mappings in UnicodeData.txt, defines the full case mappings
+# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
+#
+# Note that the preferred mechanism for defining tailored casing operations is
+# the Unicode Common Locale Data Repository (CLDR). For more information, see the
+# discussion of case mappings and case algorithms in the Unicode Standard.
+#
+# All code points not listed in this file that do not have a simple case mappings
+# in UnicodeData.txt map to themselves.
+# ================================================================================
+# Format
+# ================================================================================
+# The entries in this file are in the following machine-readable format:
+#
+# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
+#
+# <code>, <lower>, <title>, and <upper> provide the respective full case mappings
+# of <code>, expressed as character values in hex. If there is more than one character,
+# they are separated by spaces. Other than as used to separate elements, spaces are
+# to be ignored.
+#
+# The <condition_list> is optional. Where present, it consists of one or more language IDs
+# or casing contexts, separated by spaces. In these conditions:
+# - A condition list overrides the normal behavior if all of the listed conditions are true.
+# - The casing context is always the context of the characters in the original string,
+#   NOT in the resulting string.
+# - Case distinctions in the condition list are not significant.
+# - Conditions preceded by "Not_" represent the negation of the condition.
+# The condition list is not represented in the UCD as a formal property.
+#
+# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
+#
+# A casing context for a character is defined by Section 3.13 Default Case Algorithms
+# of The Unicode Standard.
+#
+# Parsers of this file must be prepared to deal with future additions to this format:
+#  * Additional contexts
+#  * Additional fields
+# ================================================================================
+
+# ================================================================================
+# Unconditional mappings
+# ================================================================================
+
+# The German es-zed is special--the normal mapping is to SS.
+# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))
+
+00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
+
+# Preserve canonical equivalence for I with dot. Turkic is handled below.
+
+0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+# Ligatures
+
+FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
+FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI
+FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL
+FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI
+FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL
+FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T
+FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST
+
+0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN
+FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW
+FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH
+FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI
+FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW
+FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
+
+# No corresponding uppercase precomposed character
+
+0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON
+1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW
+1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS
+1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE
+1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
+1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
+1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
+1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
+1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
+1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI
+1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+
+# IMPORTANT-when iota-subscript (0345) is uppercased or titlecased,
+#  the result will be incorrect unless the iota-subscript is moved to the end
+#  of any sequence of combining marks. Otherwise, the accents will go on the capital iota.
+#  This process can be achieved by first transforming the text to NFC before casing.
+#  E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA>
+
+# The following cases are already in the UnicodeData.txt file, so are only commented here.
+
+# 0345; 0345; 0399; 0399; # COMBINING GREEK YPOGEGRAMMENI
+
+# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
+# have special uppercases.
+# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
+
+1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+
+# Some characters with YPOGEGRAMMENI also have no corresponding titlecases
+
+1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+
+1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+
+# ================================================================================
+# Conditional Mappings
+# The remainder of this file provides conditional casing data used to produce
+# full case mappings.
+# ================================================================================
+# Language-Insensitive Mappings
+# These are characters whose full case mappings do not depend on language, but do
+# depend on context (which characters come before or after). For more information
+# see the header of this file and the Unicode Standard.
+# ================================================================================
+
+# Special case for final form of sigma
+
+03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
+
+# Note: the following cases for non-final are already in the UnicodeData.txt file.
+
+# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
+# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
+# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
+
+# Note: the following cases are not included, since they would case-fold in lowercasing
+
+# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
+# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
+
+# ================================================================================
+# Language-Sensitive Mappings
+# These are characters whose full case mappings depend on language and perhaps also
+# context (which characters come before or after). For more information
+# see the header of this file and the Unicode Standard.
+# ================================================================================
+
+# Lithuanian
+
+# Lithuanian retains the dot in a lowercase i when followed by accents.
+
+# Remove DOT ABOVE after "i" with upper or titlecase
+
+0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
+
+# Introduce an explicit dot above when lowercasing capital I's and J's
+# whenever there are more accents above.
+# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
+
+0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
+004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
+012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
+00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
+00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
+0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
+
+# ================================================================================
+
+# Turkish and Azeri
+
+# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
+# The following rules handle those cases.
+
+0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
+# This matches the behavior of the canonically equivalent I-dot_above
+
+0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
+0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
+
+# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
+
+0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
+0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
+
+# When uppercasing, i turns into a dotted capital I
+
+0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
+0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
+
+# Note: the following case is already in the UnicodeData.txt file.
+
+# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
+
+# EOF
+
@@ -62,8 +62,16 @@ namespace CharFlag {
    const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY;
 }

+const char16_t NO_BREAK_SPACE = 0x00A0;
+const char16_t MICRO_SIGN = 0x00B5;
+const char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
+const char16_t LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF;
+const char16_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = 0x0130;
+const char16_t COMBINING_DOT_ABOVE = 0x0307;
+const char16_t GREEK_CAPITAL_LETTER_SIGMA = 0x03A3;
+const char16_t GREEK_SMALL_LETTER_FINAL_SIGMA = 0x03C2;
+const char16_t GREEK_SMALL_LETTER_SIGMA = 0x03C3;
 const char16_t BYTE_ORDER_MARK2 = 0xFFFE;
-const char16_t NO_BREAK_SPACE  = 0x00A0;

 const char16_t LeadSurrogateMin = 0xD800;
 const char16_t LeadSurrogateMax = 0xDBFF;
@@ -239,6 +247,10 @@ IsSpaceOrBOM2(char16_t ch)
    return CharInfo(ch).isSpace();
 }

+/*
+ * Returns the simple upper case mapping of the given UTF-16 code unit (see
+ * CanUpperCaseSpecialCasing for simple vs. special case mappings).
+ */
 inline char16_t
 ToUpperCase(char16_t ch)
 {
@@ -253,6 +265,10 @@ ToUpperCase(char16_t ch)
    return uint16_t(ch) + info.upperCase;
 }

+/*
+ * Returns the simple lower case mapping of the given UTF-16 code unit (see
+ * CanUpperCaseSpecialCasing for simple vs. special case mappings).
+ */
 inline char16_t
 ToLowerCase(char16_t ch)
 {
@@ -329,6 +345,43 @@ ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail)
    return trail;
 }

+/*
+ * Returns true if the given UTF-16 code unit has a language-independent,
+ * unconditional or conditional special upper case mapping.
+ *
+ * Unicode defines two case mapping modes:
+ * 1. "simple case mappings" for one-to-one mappings which are independent of
+ *    context and language (defined in UnicodeData.txt).
+ * 2. "special case mappings" for mappings which can increase or decrease the
+ *    string length; or are dependent on context or locale (defined in
+ *    SpecialCasing.txt).
+ *
+ * The CanUpperCase() method defined above only supports simple case mappings.
+ * In order to support the full case mappings of all Unicode characters,
+ * callers need to check this method in addition to CanUpperCase().
+ *
+ * NOTE: Language-independent special upper casings are all unconditional in Unicode 11.
+ */
+bool
+CanUpperCaseSpecialCasing(char16_t ch);
+
+/*
+ * Returns the length of the special upper case mapping of |ch|.
+ *
+ * This function asserts if |ch| doesn't have a special upper case mapping.
+ */
+size_t
+LengthUpperCaseSpecialCasing(char16_t ch);
+
+/*
+ * Appends the special upper case mapping of |ch| to the given output buffer,
+ * starting at the provided index.
+ *
+ * This function asserts if |ch| doesn't have a special upper case mapping.
+ */
+void
+AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index);
+
 /*
 * For a codepoint C, CodepointsWithSameUpperCaseInfo stores three offsets
 * from C to up to three codepoints with same uppercase (no codepoint in
@@ -504,7 +557,7 @@ UTF16Encode(uint32_t codePoint, char16_t* lead, char16_t* trail)
    *trail = TrailSurrogate(codePoint);
 }

-static inline void
+inline void
 UTF16Encode(uint32_t codePoint, char16_t* elements, unsigned* index)
 {
    if (!IsSupplementary(codePoint)) {
@@ -19,6 +19,12 @@
 //     DIFF:       the difference between the code point in the range and
 //                 converted code point

+// U+10400 DESERET CAPITAL LETTER LONG I .. U+10427 DESERET CAPITAL LETTER EW
+// U+104B0 OSAGE CAPITAL LETTER A .. U+104D3 OSAGE CAPITAL LETTER ZHA
+// U+10C80 OLD HUNGARIAN CAPITAL LETTER A .. U+10CB2 OLD HUNGARIAN CAPITAL LETTER US
+// U+118A0 WARANG CITI CAPITAL LETTER NGAA .. U+118BF WARANG CITI CAPITAL LETTER VIYO
+// U+16E40 MEDEFAIDRIN CAPITAL LETTER M .. U+16E5F MEDEFAIDRIN CAPITAL LETTER Y
+// U+1E900 ADLAM CAPITAL LETTER ALIF .. U+1E921 ADLAM CAPITAL LETTER SHA
 #define FOR_EACH_NON_BMP_LOWERCASE(macro) \
    macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
    macro(0x104b0, 0x104d3, 0xd801, 0xdcb0, 0xdcd3, 40) \
@@ -27,6 +33,12 @@
    macro(0x16e40, 0x16e5f, 0xd81b, 0xde40, 0xde5f, 32) \
    macro(0x1e900, 0x1e921, 0xd83a, 0xdd00, 0xdd21, 34)

+// U+10428 DESERET SMALL LETTER LONG I .. U+1044F DESERET SMALL LETTER EW
+// U+104D8 OSAGE SMALL LETTER A .. U+104FB OSAGE SMALL LETTER ZHA
+// U+10CC0 OLD HUNGARIAN SMALL LETTER A .. U+10CF2 OLD HUNGARIAN SMALL LETTER US
+// U+118C0 WARANG CITI SMALL LETTER NGAA .. U+118DF WARANG CITI SMALL LETTER VIYO
+// U+16E60 MEDEFAIDRIN SMALL LETTER M .. U+16E7F MEDEFAIDRIN SMALL LETTER Y
+// U+1E922 ADLAM SMALL LETTER ALIF .. U+1E943 ADLAM SMALL LETTER SHA
 #define FOR_EACH_NON_BMP_UPPERCASE(macro) \
    macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
    macro(0x104d8, 0x104fb, 0xd801, 0xdcd8, 0xdcfb, -40) \
@@ -35,6 +47,12 @@
    macro(0x16e60, 0x16e7f, 0xd81b, 0xde60, 0xde7f, -32) \
    macro(0x1e922, 0x1e943, 0xd83a, 0xdd22, 0xdd43, -34)

+// U+10400 DESERET CAPITAL LETTER LONG I .. U+10427 DESERET CAPITAL LETTER EW
+// U+104B0 OSAGE CAPITAL LETTER A .. U+104D3 OSAGE CAPITAL LETTER ZHA
+// U+10C80 OLD HUNGARIAN CAPITAL LETTER A .. U+10CB2 OLD HUNGARIAN CAPITAL LETTER US
+// U+118A0 WARANG CITI CAPITAL LETTER NGAA .. U+118BF WARANG CITI CAPITAL LETTER VIYO
+// U+16E40 MEDEFAIDRIN CAPITAL LETTER M .. U+16E5F MEDEFAIDRIN CAPITAL LETTER Y
+// U+1E900 ADLAM CAPITAL LETTER ALIF .. U+1E921 ADLAM CAPITAL LETTER SHA
 #define FOR_EACH_NON_BMP_CASE_FOLDING(macro) \
    macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
    macro(0x104b0, 0x104d3, 0xd801, 0xdcb0, 0xdcd3, 40) \
@@ -43,6 +61,12 @@
    macro(0x16e40, 0x16e5f, 0xd81b, 0xde40, 0xde5f, 32) \
    macro(0x1e900, 0x1e921, 0xd83a, 0xdd00, 0xdd21, 34)

+// U+10428 DESERET SMALL LETTER LONG I .. U+1044F DESERET SMALL LETTER EW
+// U+104D8 OSAGE SMALL LETTER A .. U+104FB OSAGE SMALL LETTER ZHA
+// U+10CC0 OLD HUNGARIAN SMALL LETTER A .. U+10CF2 OLD HUNGARIAN SMALL LETTER US
+// U+118C0 WARANG CITI SMALL LETTER NGAA .. U+118DF WARANG CITI SMALL LETTER VIYO
+// U+16E60 MEDEFAIDRIN SMALL LETTER M .. U+16E7F MEDEFAIDRIN SMALL LETTER Y
+// U+1E922 ADLAM SMALL LETTER ALIF .. U+1E943 ADLAM SMALL LETTER SHA
 #define FOR_EACH_NON_BMP_REV_CASE_FOLDING(macro) \
    macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
    macro(0x104d8, 0x104fb, 0xd801, 0xdcd8, 0xdcfb, -40) \
@@ -26,6 +26,18 @@ import re
 import os
 import sys
 from contextlib import closing
+from functools import partial
+from itertools import chain, groupby, ifilter, imap, izip_longest, tee
+from operator import is_not, itemgetter
+
+class codepoint_dict(dict):
+    """ Maps a code point to its (upper, lower, name, alias) tuple. """
+
+    def name(self, code_point):
+        """ Returns the character name, followed by its alias in parentheses if present. """
+        (_, _, label, alias) = self[code_point]
+        suffix = ''
+        if alias:
+            suffix = ' (' + alias + ')'
+        return '{}{}'.format(label, suffix)
+
+    def full_name(self, code_point):
+        """ Returns name() prefixed with the code point in U+XXXX notation. """
+        return 'U+{:04X} {}'.format(code_point, self.name(code_point))

 # ECMAScript 2016
 # §11.2 White Space
@@ -132,10 +144,32 @@ def read_derived_core_properties(derived_core_properties):
            for char in range(int(start, 16), int(end, 16) + 1):
                yield (char, char_property)

+def read_special_casing(special_casing):
+    """
+    Yields one (code, lower, upper, languages, contexts) tuple per data line
+    of |special_casing| (an iterable of SpecialCasing.txt lines).
+
+    |code| is the source code point, |lower| and |upper| are lists of code
+    points (empty when the field is empty). Condition tokens starting with a
+    lower case letter are collected in |languages|, all other condition tokens
+    in |contexts|. The title case field is ignored.
+    """
+    # Format:
+    # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
+    for line in special_casing:
+        # Skip blank (including whitespace-only) lines and comment lines.
+        if not line.strip() or line.startswith('#'):
+            continue
+        row = line.split('#')[0].split(';')
+        code = int(row[0].strip(), 16)
+        # str.split() without a separator tolerates repeated spaces and
+        # returns an empty list for an empty field.
+        lower = [int(c, 16) for c in row[1].split()]
+        upper = [int(c, 16) for c in row[3].split()]
+        languages = []
+        contexts = []
+        for cond in row[4].split():
+            if cond[0].islower():
+                languages.append(cond)
+            else:
+                contexts.append(cond)
+        yield (code, lower, upper, languages, contexts)
+
 def int_ranges(ints):
    """ Yields consecutive ranges (inclusive) from integer values. """
-    from itertools import tee, izip_longest
-
    (a, b) = tee(sorted(ints))
    start = next(b)
    for (curr, succ) in izip_longest(a, b):
@@ -153,7 +187,7 @@ def utf16_encode(code):

    return lead, trail

-def make_non_bmp_convert_macro(out_file, name, convert_map):
+def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table):
    # Find continuous range in convert_map.
    convert_list = []
    entry = None
@@ -179,6 +213,7 @@ def make_non_bmp_convert_macro(out_file, name, convert_map):

    # Generate macro call for each range.
    lines = []
+    comment = []
    for entry in convert_list:
        from_code = entry['code']
        to_code = entry['code'] + entry['length'] - 1
@@ -190,29 +225,15 @@ def make_non_bmp_convert_macro(out_file, name, convert_map):

        lines.append('    macro(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})'.format(
            from_code, to_code, lead, from_trail, to_trail, diff))
+        comment.append('// {} .. {}'.format(codepoint_table.full_name(from_code),
+                                            codepoint_table.full_name(to_code)))

+    out_file.write('\n'.join(comment))
+    out_file.write('\n')
    out_file.write('#define FOR_EACH_NON_BMP_{}(macro) \\\n'.format(name))
    out_file.write(' \\\n'.join(lines))
    out_file.write('\n')

-def for_each_non_bmp_group(group_set):
-    # Find continuous range in group_set.
-    group_list = []
-    entry = None
-    for code in sorted(group_set.keys()):
-        if entry and code == entry['code'] + entry['length']:
-            entry['length'] += 1
-            continue
-
-        entry = {
-            'code': code,
-            'length': 1
-        }
-        group_list.append(entry)
-
-    for entry in group_list:
-        yield (entry['code'], entry['code'] + entry['length'] - 1)
-
 def process_derived_core_properties(derived_core_properties):
    id_start = set()
    id_continue = set()
@@ -236,7 +257,7 @@ def process_unicode_data(unicode_data, derived_core_properties):
    same_upper_cache = {same_upper_dummy: 0}
    same_upper_index = [0] * (MAX_BMP + 1)

-    test_table = {}
+    codepoint_table = codepoint_dict()
    test_space_table = []

    non_bmp_lower_map = {}
@@ -254,15 +275,9 @@ def process_unicode_data(unicode_data, derived_core_properties):
        alias = row[-5]
        uppercase = row[-3]
        lowercase = row[-2]
-        flags = 0

        if uppercase:
            upper = int(uppercase, 16)
-
-            if upper not in same_upper_map:
-                same_upper_map[upper] = [code]
-            else:
-                same_upper_map[upper].append(code)
        else:
            upper = code

@@ -271,6 +286,8 @@ def process_unicode_data(unicode_data, derived_core_properties):
        else:
            lower = code

+        codepoint_table[code] = (upper, lower, name, alias)
+
        if code > MAX_BMP:
            if code != lower:
                non_bmp_lower_map[code] = lower
@@ -285,6 +302,16 @@ def process_unicode_data(unicode_data, derived_core_properties):
                non_bmp_id_cont_set[code] = 1
            continue

+        assert lower <= MAX_BMP and upper <= MAX_BMP
+
+        if code != upper:
+            if upper not in same_upper_map:
+                same_upper_map[upper] = [code]
+            else:
+                same_upper_map[upper].append(code)
+
+        flags = 0
+
        # we combine whitespace and lineterminators because in pratice we don't need them separated
        if category == 'Zs' or code in whitespace or code in line_terminator:
            flags |= FLAG_SPACE
@@ -298,8 +325,6 @@ def process_unicode_data(unicode_data, derived_core_properties):
        elif code in id_continue or code in compatibility_identifier_part:
            flags |= FLAG_UNICODE_ID_CONTINUE_ONLY

-        test_table[code] = (upper, lower, name, alias)
-
        up_d = upper - code
        low_d = lower - code

@@ -319,12 +344,12 @@ def process_unicode_data(unicode_data, derived_core_properties):
        index[code] = i

    for code in range(0, MAX_BMP + 1):
-        entry = test_table.get(code)
+        entry = codepoint_table.get(code)

        if not entry:
            continue

-        (upper, lower, name, alias) = entry
+        (upper, _, _, _) = entry

        if upper not in same_upper_map:
            continue
@@ -354,7 +379,7 @@ def process_unicode_data(unicode_data, derived_core_properties):
        non_bmp_lower_map, non_bmp_upper_map,
        non_bmp_space_set,
        non_bmp_id_start_set, non_bmp_id_cont_set,
-        test_table, test_space_table,
+        codepoint_table, test_space_table,
    )

 def process_case_folding(case_folding):
@@ -438,9 +463,149 @@ def process_case_folding(case_folding):
        folding_tests
    )

+def process_special_casing(special_casing, table, index):
+    """
+    Processes SpecialCasing.txt and returns the tuple
+    (unconditional_tolower, unconditional_toupper).
+
+    Both results are dicts mapping a BMP code point to the list of code points
+    of its unconditional special lower/upper case mapping. Conditional
+    mappings are collected as well, but only to verify (via assertions) the
+    assumptions behind the special casing rules inlined in jsstr.cpp.
+
+    |table[index[code]]| must yield an (upper, lower, flags) triple whose
+    first two entries are deltas which are added to |code| modulo 0x10000 to
+    obtain the simple case mappings.
+    """
+    # Unconditional special casing.
+    unconditional_tolower = {}
+    unconditional_toupper = {}
+
+    # Conditional special casing, language independent.
+    conditional_tolower = {}
+    conditional_toupper = {}
+
+    # Conditional special casing, language dependent.
+    lang_conditional_tolower = {}
+    lang_conditional_toupper = {}
+
+    def caseInfo(code):
+        """ Returns the simple (lower, upper) case mappings of |code|. """
+        (upper, lower, flags) = table[index[code]]
+        return ((code + lower) & 0xffff, (code + upper) & 0xffff)
+
+    for (code, lower, upper, languages, contexts) in read_special_casing(special_casing):
+        assert code <= MAX_BMP, 'Unexpected character outside of BMP: %s' % code
+        assert len(languages) <= 1, 'Expected zero or one language ids: %s' % languages
+        assert len(contexts) <= 1, 'Expected zero or one casing contexts: %s' % contexts
+
+        (default_lower, default_upper) = caseInfo(code)
+        special_lower = len(lower) != 1 or lower[0] != default_lower
+        special_upper = len(upper) != 1 or upper[0] != default_upper
+
+        # Invariant: If |code| has casing per UnicodeData.txt, then it also has
+        # casing rules in SpecialCasing.txt.
+        assert code == default_lower or len(lower) != 1 or code != lower[0]
+        assert code == default_upper or len(upper) != 1 or code != upper[0]
+
+        language = languages[0] if languages else None
+        context = contexts[0] if contexts else None
+
+        if not language and not context:
+            if special_lower:
+                unconditional_tolower[code] = lower
+            if special_upper:
+                unconditional_toupper[code] = upper
+        elif not language and context:
+            if special_lower:
+                conditional_tolower[code] = (lower, context)
+            if special_upper:
+                conditional_toupper[code] = (upper, context)
+        else:
+            if language not in lang_conditional_tolower:
+                lang_conditional_tolower[language] = {}
+                lang_conditional_toupper[language] = {}
+            if special_lower:
+                lang_conditional_tolower[language][code] = (lower, context)
+            if special_upper:
+                lang_conditional_toupper[language][code] = (upper, context)
+
+    # Certain special casing rules are inlined in jsstr.cpp, ensure these cases
+    # still match the current SpecialCasing.txt file.
+    def lowerCase(code):
+        (lower, _) = caseInfo(code)
+        return lower
+
+    def upperCase(code):
+        (_, upper) = caseInfo(code)
+        return upper
+
+    def ascii(char_dict):
+        return ifilter(lambda ch: ch <= 0x7f, char_dict.iterkeys())
+
+    def latin1(char_dict):
+        return ifilter(lambda ch: ch <= 0xff, char_dict.iterkeys())
+
+    def is_empty(iterable):
+        return not any(True for _ in iterable)
+
+    def is_equals(iter1, iter2):
+        return all(x == y for (x, y) in izip_longest(iter1, iter2))
+
+    # Ensure no ASCII characters have special case mappings.
+    assert is_empty(ascii(unconditional_tolower))
+    assert is_empty(ascii(unconditional_toupper))
+    assert is_empty(ascii(conditional_tolower))
+    assert is_empty(ascii(conditional_toupper))
+
+    # Ensure no Latin1 characters have special lower case mappings.
+    assert is_empty(latin1(unconditional_tolower))
+    assert is_empty(latin1(conditional_tolower))
+
+    # Ensure no Latin1 characters have conditional special upper case mappings.
+    assert is_empty(latin1(conditional_toupper))
+
+    # Ensure U+00DF is the only Latin1 character with a special upper case mapping.
+    assert is_equals([0x00DF], latin1(unconditional_toupper))
+
+    # Ensure U+0130 is the only character with a special lower case mapping.
+    assert is_equals([0x0130], unconditional_tolower)
+
+    # Ensure no characters have language independent conditional upper case mappings.
+    assert is_empty(conditional_toupper)
+
+    # Ensure U+03A3 is the only character with language independent conditional lower case mapping.
+    assert is_equals([0x03A3], conditional_tolower)
+
+    # Verify U+0130 and U+03A3 have simple lower case mappings.
+    assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])
+
+    # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
+    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.iterkeys()))
+    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.iterkeys()))
+
+    # Maximum case mapping length is three characters.
+    itervals = lambda d: d.itervalues()
+    assert max(imap(len, chain(
+        itervals(unconditional_tolower),
+        itervals(unconditional_toupper),
+        imap(itemgetter(0), itervals(conditional_tolower)),
+        imap(itemgetter(0), itervals(conditional_toupper)),
+        imap(itemgetter(0), chain.from_iterable(imap(itervals, itervals(lang_conditional_tolower)))),
+        imap(itemgetter(0), chain.from_iterable(imap(itervals, itervals(lang_conditional_toupper)))),
+    ))) <= 3
+
+    # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
+    assert set([
+        'After_I', 'After_Soft_Dotted', 'Final_Sigma', 'More_Above', 'Not_Before_Dot',
+    ]).issuperset(set(ifilter(partial(is_not, None), chain(
+        imap(itemgetter(1), itervals(conditional_tolower)),
+        imap(itemgetter(1), itervals(conditional_toupper)),
+        imap(itemgetter(1), chain.from_iterable(imap(itervals, itervals(lang_conditional_tolower)))),
+        imap(itemgetter(1), chain.from_iterable(imap(itervals, itervals(lang_conditional_toupper)))),
+    ))))
+
+    # Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
+    assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [0x0053, 0x0053]
+
+    # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
+    assert unconditional_tolower[0x0130] == [0x0069, 0x0307]
+
+    # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
+    assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == ([0x03C2], 'Final_Sigma')
+
+    return (unconditional_tolower, unconditional_toupper)
+
 def make_non_bmp_file(version,
                      non_bmp_lower_map, non_bmp_upper_map,
-                      non_bmp_folding_map, non_bmp_rev_folding_map):
+                      non_bmp_folding_map, non_bmp_rev_folding_map,
+                      codepoint_table):
    file_name = 'UnicodeNonBMP.h';
    with io.open(file_name, mode='wb') as non_bmp_file:
        non_bmp_file.write(mpl_license)
@@ -463,77 +628,277 @@ def make_non_bmp_file(version,

 """)

-        make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map)
+        make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map, codepoint_table)
        non_bmp_file.write('\n')
-        make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map)
+        make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map, codepoint_table)
        non_bmp_file.write('\n')
-        make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map)
+        make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map, codepoint_table)
        non_bmp_file.write('\n')
-        make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map)
+        make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map, codepoint_table)

        non_bmp_file.write("""
 #endif /* vm_UnicodeNonBMP_h */
 """)

-def make_bmp_mapping_test(version, test_table):
+def write_special_casing_methods(unconditional_toupper, codepoint_table, println):
+    """
+    Emits the C++ definitions of js::unicode::CanUpperCaseSpecialCasing,
+    js::unicode::LengthUpperCaseSpecialCasing and
+    js::unicode::AppendUpperCaseSpecialCasing through |println|.
+
+    |unconditional_toupper| maps each code unit with a special upper case
+    mapping to the list of characters of that mapping; it must not be empty.
+    |codepoint_table| provides the character names used in the generated
+    comments. |println| is called with one or more string pieces per output
+    line.
+    """
+    def hexlit(n):
+        """ Returns C++ hex-literal for |n|. """
+        return '0x{:04X}'.format(n)
+
+    def describe_range(ranges, depth):
+        """ Emits one C++ comment per (start, end) range, naming its first and last character. """
+        indent = depth * '    '
+        for (start, end) in ranges:
+            if start == end:
+                println(indent, '// {}'.format(codepoint_table.full_name(start)))
+            else:
+                println(indent, '// {} .. {}'.format(codepoint_table.full_name(start),
+                                                     codepoint_table.full_name(end)))
+
+    def out_range(start, end):
+        """ Tests if the input character isn't a member of the set {x | start <= x <= end}. """
+        if (start == end):
+            return 'ch != {}'.format(hexlit(start))
+        return 'ch < {} || ch > {}'.format(hexlit(start), hexlit(end))
+
+    def in_range(start, end, parenthesize=False):
+        """ Tests if the input character is in the set {x | start <= x <= end}. """
+        if (start == end):
+            return 'ch == {}'.format(hexlit(start))
+        (left, right) = ('(', ')') if parenthesize else ('', '')
+        return '{}ch >= {} && ch <= {}{}'.format(left, hexlit(start), hexlit(end), right)
+
+    def in_any_range(ranges, spaces):
+        """ Tests if the input character is included in any of the given ranges. """
+        # Greedily pack the range tests into lines; |spaces| is the indentation
+        # of the continuation lines and counts against the 100 column budget.
+        lines = [[]]
+        for (start, end) in ranges:
+            expr = in_range(start, end, parenthesize=True)
+            line = ' || '.join(lines[-1] + [expr])
+            if len(line) < (100 - len(spaces) - len(' ||')):
+                lines[-1].append(expr)
+            else:
+                lines.append([expr])
+        return ' ||\n{}'.format(spaces).join(imap(lambda t: ' || '.join(t), lines))
+
+    def write_range_accept(parent_list, child_list, depth):
+        """ Accepts the input character if it matches any code unit in |child_list|. """
+        # Both lists are expected to be sorted: only their first and last
+        # entries are inspected to determine the covered bounds.
+        (min_parent, max_parent) = (parent_list[0], parent_list[-1])
+        (min_child, max_child) = (child_list[0], child_list[-1])
+        assert min_child >= min_parent
+        assert max_child <= max_parent
+        indent = depth * '    '
+
+        child_ranges = list(int_ranges(child_list))
+        has_successor = max_child != max_parent
+
+        # If |child_list| is a contiguous list of code units, emit a simple
+        # range check: |min_child <= input <= max_child|.
+        if len(child_ranges) == 1:
+            describe_range(child_ranges, depth)
+            if has_successor:
+                println(indent, 'if (ch <= {})'.format(hexlit(max_child)))
+                println(indent, '    return ch >= {};'.format(hexlit(min_child)))
+            else:
+                println(indent, 'return {};'.format(in_range(min_child, max_child)))
+            return
+
+        # Otherwise create a disjunction over the subranges in |child_ranges|.
+        if not has_successor:
+            spaces = indent + len('return ') * ' '
+        else:
+            spaces = indent + len('    return ') * ' '
+        range_test_expr = in_any_range(child_ranges, spaces)
+
+        if min_child != min_parent:
+            println(indent, 'if (ch < {})'.format(hexlit(min_child)))
+            println(indent, '    return false;')
+
+        # If there's no successor block, we can omit the |input <= max_child| check,
+        # because it was already checked when we emitted the parent range test.
+        if not has_successor:
+            describe_range(child_ranges, depth)
+            println(indent, 'return {};'.format(range_test_expr))
+        else:
+            println(indent, 'if (ch <= {}) {{'.format(hexlit(max_child)))
+            describe_range(child_ranges, depth + 1)
+            println(indent, '    return {};'.format(range_test_expr))
+            println(indent, '}')
+
+    def write_CanUpperCaseSpecialCasing():
+        """ Checks if the input has a special upper case mapping. """
+        println('bool')
+        println('js::unicode::CanUpperCaseSpecialCasing(char16_t ch)')
+        println('{')
+
+        assert unconditional_toupper, "|unconditional_toupper| is not empty"
+
+        # Sorted list of code units with special upper case mappings.
+        code_list = sorted(unconditional_toupper.iterkeys())
+
+        # Fail-fast if the input character isn't a special casing character.
+        println('    if ({})'.format(out_range(code_list[0], code_list[-1])))
+        println('        return false;')
+
+        for i in range(0, 16):
+            # Check if the input characters is in the range:
+            # |start_point <= input < end_point|.
+            start_point = i << 12
+            end_point = (i + 1) << 12
+            matches = [cu for cu in code_list if start_point <= cu < end_point]
+
+            # Skip empty ranges.
+            if not matches:
+                continue
+
+            # If |matches| consists of only a few characters, directly check
+            # the input against the characters in |matches|.
+            if len(matches) <= 8:
+                write_range_accept(code_list, matches, depth=1)
+                continue
+
+            # Otherwise split into further subranges.
+
+            # Only enter the if-block if the input is less-or-equals to the
+            # largest value in the current range.
+            is_last_block = matches[-1] == code_list[-1]
+            if not is_last_block:
+                println('    if (ch <= {}) {{'.format(hexlit(matches[-1])))
+            else:
+                println('    if (ch < {})'.format(hexlit(matches[0])))
+                println('        return false;')
+
+            for j in range(0, 16):
+                inner_start = start_point + (j << 8)
+                inner_end = start_point + ((j + 1) << 8)
+                inner_matches = [cu for cu in matches if inner_start <= cu < inner_end]
+
+                if inner_matches:
+                    # The last block isn't wrapped in an if-block (see above),
+                    # so its tests are emitted one indentation level less.
+                    d = 1 if is_last_block else 2
+                    write_range_accept(matches, inner_matches, depth=d)
+
+            if not is_last_block:
+                println('    }')
+
+        println('}')
+
+    def write_LengthUpperCaseSpecialCasing():
+        """ Slow case: Special casing character was found, returns its mapping length. """
+        println('size_t')
+        println('js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)')
+        println('{')
+
+        println('    switch(ch) {')
+        for (code, converted) in sorted(unconditional_toupper.iteritems(), key=itemgetter(0)):
+            println('      case {}: return {}; // {}'.format(hexlit(code), len(converted),
+                                                             codepoint_table.name(code)))
+        println('    }')
+        println('')
+        println('    MOZ_ASSERT_UNREACHABLE("Bad character input.");')
+        println('    return 0;')
+
+        println('}')
+
+    def write_AppendUpperCaseSpecialCasing():
+        """ Slow case: Special casing character was found, append its mapping characters. """
+        println('void')
+        println('js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)')
+        println('{')
+
+        println('    switch(ch) {')
+        for (code, converted) in sorted(unconditional_toupper.iteritems(), key=itemgetter(0)):
+            println('      case {}: // {}'.format(hexlit(code), codepoint_table.name(code)))
+            for ch in converted:
+                println('        elements[(*index)++] = {}; // {}'.format(hexlit(ch),
+                                                                          codepoint_table.name(ch)))
+            println('        return;')
+        println('    }')
+        println('')
+        println('    MOZ_ASSERT_UNREACHABLE("Bad character input.");')
+        println('    return;')
+
+        println('}')
+
+    write_CanUpperCaseSpecialCasing()
+    println('')
+    write_LengthUpperCaseSpecialCasing()
+    println('')
+    write_AppendUpperCaseSpecialCasing()
+
+def make_bmp_mapping_test(version, codepoint_table, unconditional_tolower, unconditional_toupper):
+    def unicodeEsc(n):
+        return '\u{:04X}'.format(n)
+
    file_name = '../tests/ecma_5/String/string-upper-lower-mapping.js'
-    with io.open(file_name, mode='wb') as test_mapping:
-        test_mapping.write(warning_message)
-        test_mapping.write(unicode_version_message.format(version))
-        test_mapping.write(public_domain)
-        test_mapping.write('var mapping = [\n')
+    with io.open(file_name, mode='wb') as output:
+        write = partial(print, file=output, sep='', end='')
+        println = partial(print, file=output, sep='', end='\n')
+
+        write(warning_message)
+        write(unicode_version_message.format(version))
+        write(public_domain)
+        println('var mapping = [')
        for code in range(0, MAX_BMP + 1):
-            entry = test_table.get(code)
+            entry = codepoint_table.get(code)

            if entry:
-                (upper, lower, name, alias) = entry
-                test_mapping.write('  [' + hex(upper) + ', ' + hex(lower) + '], /* ' +
-                        name + (' (' + alias + ')' if alias else '') + ' */\n')
+                (upper, lower, _, _) = entry
+                upper = unconditional_toupper[code] if code in unconditional_toupper else [upper]
+                lower = unconditional_tolower[code] if code in unconditional_tolower else [lower]
+                println('  ["{}", "{}"], /* {} */'.format("".join(imap(unicodeEsc, upper)),
+                                                          "".join(imap(unicodeEsc, lower)),
+                                                          codepoint_table.name(code)))
            else:
-                test_mapping.write('  [' + hex(code) + ', ' + hex(code) + '],\n')
-        test_mapping.write('];')
-        test_mapping.write("""
+                println('  ["{0}", "{0}"],'.format(unicodeEsc(code)))
+        println('];')
+        write("""
 assertEq(mapping.length, 0x10000);
 for (var i = 0; i <= 0xffff; i++) {
    var char = String.fromCharCode(i);
    var info = mapping[i];

-    assertEq(char.toUpperCase().charCodeAt(0), info[0]);
-    assertEq(char.toLowerCase().charCodeAt(0), info[1]);
+    assertEq(char.toUpperCase(), info[0]);
+    assertEq(char.toLowerCase(), info[1]);
 }

 if (typeof reportCompare === "function")
    reportCompare(true, true);
 """)

-def make_non_bmp_mapping_test(version, non_bmp_upper_map, non_bmp_lower_map):
+def make_non_bmp_mapping_test(version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table):
    file_name = '../tests/ecma_6/String/string-code-point-upper-lower-mapping.js'
    with io.open(file_name, mode='wb') as test_non_bmp_mapping:
        test_non_bmp_mapping.write(warning_message)
        test_non_bmp_mapping.write(unicode_version_message.format(version))
        test_non_bmp_mapping.write(public_domain)
+
        for code in sorted(non_bmp_upper_map.keys()):
            test_non_bmp_mapping.write("""\
-assertEq(String.fromCodePoint(0x{:x}).toUpperCase().codePointAt(0), 0x{:x});
-""".format(code, non_bmp_upper_map[code]))
+assertEq(String.fromCodePoint(0x{:04X}).toUpperCase().codePointAt(0), 0x{:04X}); // {}, {}
+""".format(code, non_bmp_upper_map[code],
+           codepoint_table.name(code), codepoint_table.name(non_bmp_upper_map[code])))
+
        for code in sorted(non_bmp_lower_map.keys()):
            test_non_bmp_mapping.write("""\
-assertEq(String.fromCodePoint(0x{:x}).toLowerCase().codePointAt(0), 0x{:x});
-""".format(code, non_bmp_lower_map[code]))
+assertEq(String.fromCodePoint(0x{:04X}).toLowerCase().codePointAt(0), 0x{:04X}); // {}, {}
+""".format(code, non_bmp_lower_map[code],
+           codepoint_table.name(code), codepoint_table.name(non_bmp_lower_map[code])))

        test_non_bmp_mapping.write("""
 if (typeof reportCompare === "function")
    reportCompare(true, true);
 """)

-def make_space_test(version, test_space_table):
+def make_space_test(version, test_space_table, codepoint_table):
+    def hex_and_name(c):
+        return '    0x{:04X} /* {} */'.format(c, codepoint_table.name(c))
+
    file_name = '../tests/ecma_5/String/string-space-trim.js'
    with io.open(file_name, mode='wb') as test_space:
        test_space.write(warning_message)
        test_space.write(unicode_version_message.format(version))
        test_space.write(public_domain)
-        test_space.write('var onlySpace = String.fromCharCode(' +
-                        ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')
+        test_space.write('var onlySpace = String.fromCharCode(\n')
+        test_space.write(',\n'.join(map(hex_and_name, test_space_table)))
+        test_space.write('\n);\n')
        test_space.write("""
 assertEq(onlySpace.trim(), "");
 assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
@@ -544,7 +909,10 @@ if (typeof reportCompare === "function")
    reportCompare(true, true);
 """)

-def make_icase_test(version, folding_tests):
+def make_icase_test(version, folding_tests, codepoint_table):
+    def char_hex(c):
+        return '0x{:04X}'.format(c)
+
    file_name = '../tests/ecma_6/RegExp/unicode-ignoreCase.js'
    with io.open(file_name, mode='wb') as test_icase:
        test_icase.write(warning_message)
@@ -565,7 +933,8 @@ function test(code, ...equivs) {
 }
 """)
        for args in folding_tests:
-            test_icase.write('test(' + ','.join([hex(c) for c in args]) + ');\n')
+            test_icase.write('test({}); // {}\n'.format(', '.join(map(char_hex, args)),
+                                                        ', '.join(map(codepoint_table.name, args))))
        test_icase.write("""
 if (typeof reportCompare === "function")
    reportCompare(true, true);
@@ -576,7 +945,9 @@ def make_unicode_file(version,
                      same_upper_table, same_upper_index,
                      folding_table, folding_index,
                      non_bmp_space_set,
-                      non_bmp_id_start_set, non_bmp_id_cont_set):
+                      non_bmp_id_start_set, non_bmp_id_cont_set,
+                      unconditional_toupper,
+                      codepoint_table):
    index1, index2, shift = splitbins(index)

    # Don't forget to update CharInfo in Unicode.h if you need to change this
@@ -665,8 +1036,8 @@ def make_unicode_file(version,
 *  stop if you found the best shift
 */
 """
-    def dump(data, name, file):
-        file.write('const uint8_t unicode::' + name + '[] = {\n')
+    def dump(data, name, println):
+        println('const uint8_t unicode::{}[] = {{'.format(name))

        line = pad = ' ' * 4
        lines = []
@@ -682,93 +1053,79 @@ def make_unicode_file(version,
                line = line + s + ', '
        lines.append(line.rstrip())

-        file.write('\n'.join(lines))
-        file.write('\n};\n')
+        println('\n'.join(lines))
+        println('};')
+
+    def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println):
+        println('const {} unicode::{}[] = {{'.format(data_type, name))
+        for d in tbl:
+            println('    {{ {} }},'.format(', '.join(str(e) for e in d)))
+        println('};')
+        println('')
+
+        dump(idx1, idx1_name, println)
+        println('')
+        dump(idx2, idx2_name, println)
+        println('')
+
+    def write_supplemental_identifier_method(name, group_set, println):
+        println('bool')
+        println('js::unicode::{}(uint32_t codePoint)'.format(name))
+        println('{')
+        for (from_code, to_code) in int_ranges(group_set.keys()):
+            println('    if (codePoint >= 0x{:X} && codePoint <= 0x{:X}) // {} .. {}'.format(from_code,
+                                                                                             to_code,
+                                                                                             codepoint_table.name(from_code),
+                                                                                             codepoint_table.name(to_code)))
+            println('        return true;')
+        println('    return false;')
+        println('}')
+        println('')

    file_name = 'Unicode.cpp'
    with io.open(file_name, 'wb') as data_file:
-        data_file.write(warning_message)
-        data_file.write(unicode_version_message.format(version))
-        data_file.write(public_domain)
-        data_file.write('#include "vm/Unicode.h"\n\n')
-        data_file.write('using namespace js;\n')
-        data_file.write('using namespace js::unicode;\n')
-        data_file.write(comment)
-        data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')
-        for d in table:
-            data_file.write('    {')
-            data_file.write(', '.join((str(e) for e in d)))
-            data_file.write('},\n')
-        data_file.write('};\n')
-        data_file.write('\n')
+        write = partial(print, file=data_file, sep='', end='')
+        println = partial(print, file=data_file, sep='', end='\n')

-        dump(index1, 'index1', data_file)
-        data_file.write('\n')
-        dump(index2, 'index2', data_file)
-        data_file.write('\n')
+        write(warning_message)
+        write(unicode_version_message.format(version))
+        write(public_domain)
+        println('#include "vm/Unicode.h"')
+        println('')
+        println('using namespace js;')
+        println('using namespace js::unicode;')
+        write(comment)

-        data_file.write('const CodepointsWithSameUpperCaseInfo unicode::js_codepoints_with_same_upper_info[] = {\n')
-        for d in same_upper_table:
-            data_file.write('    {')
-            data_file.write(', '.join((str(e) for e in d)))
-            data_file.write('},\n')
-        data_file.write('};\n')
-        data_file.write('\n')
+        write_table('CharacterInfo',
+                    'js_charinfo', table,
+                    'index1', index1,
+                    'index2', index2,
+                    println)

-        dump(same_upper_index1, 'codepoints_with_same_upper_index1', data_file)
-        data_file.write('\n')
-        dump(same_upper_index2, 'codepoints_with_same_upper_index2', data_file)
-        data_file.write('\n')
+        write_table('CodepointsWithSameUpperCaseInfo',
+                    'js_codepoints_with_same_upper_info', same_upper_table,
+                    'codepoints_with_same_upper_index1', same_upper_index1,
+                    'codepoints_with_same_upper_index2', same_upper_index2,
+                    println)

-        data_file.write('const FoldingInfo unicode::js_foldinfo[] = {\n')
-        for d in folding_table:
-            data_file.write('    {')
-            data_file.write(', '.join((str(e) for e in d)))
-            data_file.write('},\n')
-        data_file.write('};\n')
-        data_file.write('\n')
-
-        dump(folding_index1, 'folding_index1', data_file)
-        data_file.write('\n')
-        dump(folding_index2, 'folding_index2', data_file)
-        data_file.write('\n')
+        write_table('FoldingInfo',
+                    'js_foldinfo', folding_table,
+                    'folding_index1', folding_index1,
+                    'folding_index2', folding_index2,
+                    println)

        # If the following assert fails, it means space character is added to
        # non-BMP area.  In that case the following code should be uncommented
        # and the corresponding code should be added to frontend.
        assert len(non_bmp_space_set.keys()) == 0

-        data_file.write("""\
-bool
-js::unicode::IsIdentifierStartNonBMP(uint32_t codePoint)
-{
-""")
+        write_supplemental_identifier_method('IsIdentifierStartNonBMP', non_bmp_id_start_set,
+                                             println)

-        for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_start_set):
-            data_file.write("""\
-    if (codePoint >= 0x{:x} && codePoint <= 0x{:x})
-        return true;
-""".format(from_code, to_code))
+        write_supplemental_identifier_method('IsIdentifierPartNonBMP', non_bmp_id_cont_set,
+                                             println)

-        data_file.write("""\
-    return false;
-}
-
-bool
-js::unicode::IsIdentifierPartNonBMP(uint32_t codePoint)
-{
-""")
-
-        for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_cont_set):
-            data_file.write("""\
-    if (codePoint >= 0x{:x} && codePoint <= 0x{:x})
-        return true;
-""".format(from_code, to_code))
-
-        data_file.write("""\
-    return false;
-}
-""")
+        write_special_casing_methods(unconditional_toupper, codepoint_table, println)

 def getsize(data):
    """ return smallest possible integer size for the given array """
@@ -842,10 +1199,8 @@ def splitbins(t):
 def make_irregexp_tables(version,
                         table, index,
                         folding_table, folding_index,
-                         test_table):
+                         codepoint_table):
    import string
-    from functools import partial
-    from itertools import chain, ifilter, imap

    MAX_ASCII = 0x7F
    MAX_LATIN1 = 0xFF
@@ -894,13 +1249,13 @@ def make_irregexp_tables(version,

    def char_name(code):
        assert 0 <= code and code <= MAX_BMP
-        if code not in test_table:
+        if code not in codepoint_table:
            return '<Unused>'
        if code == LEAD_SURROGATE_MIN:
            return '<Lead Surrogate Min>'
        if code == TRAIL_SURROGATE_MAX:
            return '<Trail Surrogate Max>'
-        (_, _, name, alias) = test_table[code]
+        (_, _, name, alias) = codepoint_table[code]
        return name if not name.startswith('<') else alias

    def write_character_range(println, name, characters):
@@ -1080,7 +1435,8 @@ def update_unicode(args):

    with download_or_open('UnicodeData.txt') as unicode_data, \
         download_or_open('CaseFolding.txt') as case_folding, \
-         download_or_open('DerivedCoreProperties.txt') as derived_core_properties:
+         download_or_open('DerivedCoreProperties.txt') as derived_core_properties, \
+         download_or_open('SpecialCasing.txt') as special_casing:
        unicode_version = version_from_file(derived_core_properties, 'DerivedCoreProperties')

        print('Processing...')
@@ -1090,13 +1446,16 @@ def update_unicode(args):
            non_bmp_lower_map, non_bmp_upper_map,
            non_bmp_space_set,
            non_bmp_id_start_set, non_bmp_id_cont_set,
-            test_table, test_space_table
+            codepoint_table, test_space_table
        ) = process_unicode_data(unicode_data, derived_core_properties)
        (
            folding_table, folding_index,
            non_bmp_folding_map, non_bmp_rev_folding_map,
            folding_tests
        ) = process_case_folding(case_folding)
+        (
+            unconditional_tolower, unconditional_toupper
+        ) = process_special_casing(special_casing, table, index)

    print('Generating...')
    make_unicode_file(unicode_version,
@@ -1104,19 +1463,23 @@ def update_unicode(args):
                      same_upper_table, same_upper_index,
                      folding_table, folding_index,
                      non_bmp_space_set,
-                      non_bmp_id_start_set, non_bmp_id_cont_set)
+                      non_bmp_id_start_set, non_bmp_id_cont_set,
+                      unconditional_toupper,
+                      codepoint_table)
    make_non_bmp_file(unicode_version,
                      non_bmp_lower_map, non_bmp_upper_map,
-                      non_bmp_folding_map, non_bmp_rev_folding_map)
+                      non_bmp_folding_map, non_bmp_rev_folding_map,
+                      codepoint_table)
    make_irregexp_tables(unicode_version,
                         table, index,
                         folding_table, folding_index,
-                         test_table)
+                         codepoint_table)

-    make_bmp_mapping_test(unicode_version, test_table)
-    make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map)
-    make_space_test(unicode_version, test_space_table)
-    make_icase_test(unicode_version, folding_tests)
+    make_bmp_mapping_test(unicode_version,
+                          codepoint_table, unconditional_tolower, unconditional_toupper)
+    make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table)
+    make_space_test(unicode_version, test_space_table, codepoint_table)
+    make_icase_test(unicode_version, folding_tests, codepoint_table)

 if __name__ == '__main__':
    import argparse