1
0
mirror of https://github.com/roytam1/UXP.git synced 2026-05-26 13:58:49 +00:00

Issue #2259 - Reimplement String.prototype.toLocale{Lower,Upper}Case per ECMAScript Intl specification

- Update make_unicode to output SpecialCasing
- Handle special casing
- Use realloc instead of malloc when resizing a newly created string buffer

Based-on: m-c 1318403, 1431957
This commit is contained in:
Martok
2023-06-29 23:05:33 +02:00
committed by roytam1
parent 949f69ef4b
commit 8d97bd437a
14 changed files with 3198 additions and 1299 deletions
+1
View File
@@ -82,6 +82,7 @@ included_inclnames_to_ignore = set([
'unicode/plurrule.h', # ICU
'unicode/timezone.h', # ICU
'unicode/ucal.h', # ICU
'unicode/uchar.h', # ICU
'unicode/uclean.h', # ICU
'unicode/ucol.h', # ICU
'unicode/udat.h', # ICU
+86 -2
View File
@@ -731,6 +731,88 @@ function String_localeCompare(that) {
return intl_CompareStrings(collator, S, That);
}
/**
 * 13.1.2 String.prototype.toLocaleLowerCase ( [ locales ] )
 *
 * ES2017 Intl draft rev 94045d234762ad107a3d09bb6f7381a65f1a2f9b
 *
 * Lower-cases the receiver using the case mapping of the first requested
 * locale, falling back to DefaultLocale() when no locale was requested.
 * The locales argument is validated before the empty-string shortcut, so an
 * invalid language tag throws even when the receiver is "".
 */
function String_toLocaleLowerCase() {
    // Step 1.
    RequireObjectCoercible(this);

    // Step 2.
    var string = ToString(this);

    // Steps 3-6: resolve the requested locale. A missing argument and a
    // single string argument are the common cases and avoid building a
    // full locale list.
    var locales = arguments.length > 0 ? arguments[0] : undefined;
    var requestedLocale = undefined;
    if (typeof locales === "string") {
        // Steps 3, 5.
        requestedLocale = ValidateAndCanonicalizeLanguageTag(locales);
    } else if (locales !== undefined) {
        // Step 3.
        var requestedLocales = CanonicalizeLocaleList(locales);

        // Steps 4-6.
        if (requestedLocales.length > 0)
            requestedLocale = requestedLocales[0];
    }

    // Trivial case: an empty input maps to the empty string in every locale.
    if (string.length === 0)
        return "";

    if (requestedLocale === undefined)
        requestedLocale = DefaultLocale();

    // Steps 7-16.
    return intl_toLocaleLowerCase(string, requestedLocale);
}
/**
 * 13.1.3 String.prototype.toLocaleUpperCase ( [ locales ] )
 *
 * ES2017 Intl draft rev 94045d234762ad107a3d09bb6f7381a65f1a2f9b
 *
 * Upper-cases the receiver using the case mapping of the first requested
 * locale, falling back to DefaultLocale() when no locale was requested.
 * The locales argument is validated before the empty-string shortcut, so an
 * invalid language tag throws even when the receiver is "".
 */
function String_toLocaleUpperCase() {
    // Step 1.
    RequireObjectCoercible(this);

    // Step 2.
    var string = ToString(this);

    // Steps 3-6: resolve the requested locale. A missing argument and a
    // single string argument are the common cases and avoid building a
    // full locale list.
    var locales = arguments.length > 0 ? arguments[0] : undefined;
    var requestedLocale = undefined;
    if (typeof locales === "string") {
        // Steps 3, 5.
        requestedLocale = ValidateAndCanonicalizeLanguageTag(locales);
    } else if (locales !== undefined) {
        // Step 3.
        var requestedLocales = CanonicalizeLocaleList(locales);

        // Steps 4-6.
        if (requestedLocales.length > 0)
            requestedLocale = requestedLocales[0];
    }

    // Trivial case: an empty input maps to the empty string in every locale.
    if (string.length === 0)
        return "";

    if (requestedLocale === undefined)
        requestedLocale = DefaultLocale();

    // Steps 7-16.
    return intl_toLocaleUpperCase(string, requestedLocale);
}
/* ES6 Draft May 22, 2014 21.1.2.4 */
function String_static_raw(callSite, ...substitutions) {
// Step 1 (implicit).
@@ -1014,13 +1096,15 @@ _SetCanonicalName(String_static_trimEnd, "trimEnd");
/**
 * Static variant reported in errors as 'String.toLocaleLowerCase'.
 *
 * Takes the string to convert as its first argument and an optional locales
 * value as its second, and forwards both to String_toLocaleLowerCase.
 * Throws a TypeError (JSMSG_MISSING_FUN_ARG) when called without arguments.
 */
function String_static_toLocaleLowerCase(string) {
    if (arguments.length < 1)
        ThrowTypeError(JSMSG_MISSING_FUN_ARG, 0, 'String.toLocaleLowerCase');

    // Forward the optional second argument; an earlier unconditional return
    // made this unreachable and silently dropped the requested locale.
    var locales = arguments.length > 1 ? arguments[1] : undefined;
    return callFunction(String_toLocaleLowerCase, string, locales);
}
/**
 * Static variant reported in errors as 'String.toLocaleUpperCase'.
 *
 * Takes the string to convert as its first argument and an optional locales
 * value as its second, and forwards both to String_toLocaleUpperCase.
 * Throws a TypeError (JSMSG_MISSING_FUN_ARG) when called without arguments.
 */
function String_static_toLocaleUpperCase(string) {
    if (arguments.length < 1)
        ThrowTypeError(JSMSG_MISSING_FUN_ARG, 0, 'String.toLocaleUpperCase');

    // Forward the optional second argument; an earlier unconditional return
    // made this unreachable and silently dropped the requested locale.
    var locales = arguments.length > 1 ? arguments[1] : undefined;
    return callFunction(String_toLocaleUpperCase, string, locales);
}
function String_static_normalize(string) {
+58
View File
@@ -446,6 +446,64 @@ function CanonicalizeLanguageTag(locale) {
return canonical;
}
/**
 * Returns true if the input contains only ASCII alphabetical characters
 * (A-Z or a-z). The empty string vacuously satisfies this and yields true.
 */
function IsASCIIAlphaString(s) {
    assert(typeof s === "string", "IsASCIIAlphaString");

    for (var i = 0; i < s.length; i++) {
        var c = callFunction(std_String_charCodeAt, s, i);
        var isUpperAlpha = 0x41 <= c && c <= 0x5A;
        var isLowerAlpha = 0x61 <= c && c <= 0x7A;
        if (!isUpperAlpha && !isLowerAlpha)
            return false;
    }
    return true;
}
/**
 * Validates and canonicalizes the given language tag.
 *
 * Throws a RangeError (JSMSG_INVALID_LANGUAGE_TAG) when the tag is not
 * structurally valid; otherwise returns the canonical form of the tag.
 */
function ValidateAndCanonicalizeLanguageTag(locale) {
    assert(typeof locale === "string", "ValidateAndCanonicalizeLanguageTag");

    // The fast path below only accepts the following BCP47 subset:
    //   Language-Tag = langtag
    //   langtag      = language
    //   language     = 2*3ALPHA            ; shortest ISO 639 code
    // For three character long strings we need to make sure it's not a
    // private use only language tag, for example "x-x".
    var length = locale.length;
    var maybeBareLanguage = length === 2 || (length === 3 && locale[1] !== "-");

    // General case: run the full validation and canonicalization.
    if (!maybeBareLanguage) {
        if (!IsStructurallyValidLanguageTag(locale))
            ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);

        return CanonicalizeLanguageTag(locale);
    }

    // Common case: a standalone language subtag.
    if (!IsASCIIAlphaString(locale))
        ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);
    assert(IsStructurallyValidLanguageTag(locale), "2*3ALPHA is a valid language tag");

    // The language subtag is canonicalized to lower case.
    locale = callFunction(std_String_toLowerCase, locale);

    // langTagMappings doesn't contain any 2*3ALPHA keys, so we don't need
    // to check for possible replacements in this map.
    assert(!callFunction(std_Object_hasOwnProperty, langTagMappings, locale),
           "langTagMappings contains no 2*3ALPHA mappings");

    // Replace deprecated subtags with their preferred values.
    if (callFunction(std_Object_hasOwnProperty, langSubtagMappings, locale))
        locale = langSubtagMappings[locale];

    assert(locale === CanonicalizeLanguageTag(locale), "expected same canonicalization");

    return locale;
}
function localeContainsNoUnicodeExtensions(locale) {
// No "-u-", no possible Unicode extension.
if (callFunction(std_String_indexOf, locale, "-u-") === -1)
+4
View File
@@ -151,6 +151,10 @@ def readRegistry(registry):
# Special case for heploc.
langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"
# ValidateAndCanonicalizeLanguageTag in Intl.js expects that
# langTagMappings contains no 2*3ALPHA keys.
assert all(len(lang) > 3 for lang in langTagMappings.iterkeys())
return {"fileDate": fileDate,
"langTagMappings": langTagMappings,
"langSubtagMappings": langSubtagMappings,
+2 -2
View File
@@ -5327,8 +5327,8 @@ JS_ResetDefaultLocale(JSContext* cx);
* Locale specific string conversion and error message callbacks.
*/
struct JSLocaleCallbacks {
    // Each member is declared exactly once. Hooks marked "not used" stay in
    // the struct so its layout and field names are unchanged for embedders.
    JSLocaleToUpperCase localeToUpperCase; // not used
    JSLocaleToLowerCase localeToLowerCase; // not used
    JSLocaleCompare localeCompare; // not used
    JSLocaleToUnicode localeToUnicode;
};
+1
View File
@@ -365,6 +365,7 @@ struct JSContext : public js::ExclusiveContext,
using ExclusiveContext::permanentAtoms;
using ExclusiveContext::pod_calloc;
using ExclusiveContext::pod_malloc;
using ExclusiveContext::pod_realloc;
using ExclusiveContext::staticStrings;
using ExclusiveContext::updateMallocCounter;
using ExclusiveContext::wellKnownSymbols;
+537 -126
View File
@@ -31,10 +31,12 @@
#include "jsutil.h"
#include "builtin/intl/ICUHeader.h"
#include "builtin/intl/CommonFunctions.h"
#include "builtin/RegExp.h"
#include "jit/InlinableNatives.h"
#include "js/Conversions.h"
#include "js/UniquePtr.h"
#include "unicode/uchar.h"
#include "unicode/unorm2.h"
#include "vm/GlobalObject.h"
#include "vm/Interpreter.h"
@@ -598,19 +600,210 @@ js::SubstringKernel(JSContext* cx, HandleString str, int32_t beginInt, int32_t l
return NewDependentString(cx, str, begin, len);
}
// Resizes the character buffer owned by |chars| from |oldLength| to
// |newLength| elements via cx->pod_realloc and returns the owner of the
// resized buffer. When reallocation fails the original buffer is freed here
// and an empty (null) pointer is returned.
template <typename CharT>
static auto
ReallocChars(JSContext* cx, UniquePtr<CharT[], JS::FreePolicy> chars, size_t oldLength,
             size_t newLength)
    -> decltype(chars)
{
    using AnyCharPtr = decltype(chars);

    // Give up ownership before reallocating so the buffer is never owned by
    // both the UniquePtr and the allocator at the same time.
    CharT* previous = chars.release();
    if (CharT* resized = cx->pod_realloc<CharT>(previous, oldLength, newLength))
        return AnyCharPtr(resized);

    js_free(previous);
    return AnyCharPtr();
}
/**
 * U+03A3 GREEK CAPITAL LETTER SIGMA has two different lower case mappings
 * depending on its context:
 * When it's preceded by a cased character and not followed by another cased
 * character, its lower case form is U+03C2 GREEK SMALL LETTER FINAL SIGMA.
 * Otherwise its lower case mapping is U+03C3 GREEK SMALL LETTER SIGMA.
 *
 * |chars[index]| must be the sigma to map; the returned value is its lower
 * case form for that position.
 *
 * Unicode 9.0, §3.13 Default Case Algorithms
 */
static char16_t
Final_Sigma(const char16_t* chars, size_t length, size_t index)
{
    MOZ_ASSERT(index < length);
    MOZ_ASSERT(chars[index] == unicode::GREEK_CAPITAL_LETTER_SIGMA);
    MOZ_ASSERT(unicode::ToLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA) ==
               unicode::GREEK_SMALL_LETTER_SIGMA);

    // Tell the analysis the BinaryProperty.contains function pointer called by
    // u_hasBinaryProperty cannot GC.
    JS::AutoSuppressGCAnalysis nogc;

    // Walk backwards to the nearest code point which is not Case_Ignorable.
    // NB: All Case_Ignorable characters are skipped, even when they also
    // have the Cased binary property.
    bool precededByCased = false;
    size_t pos = index;
    while (pos > 0) {
        char16_t unit = chars[--pos];
        uint32_t codePoint = unit;

        // Combine a trail surrogate with the lead surrogate in front of it.
        if (unicode::IsTrailSurrogate(unit) && pos > 0 &&
            unicode::IsLeadSurrogate(chars[pos - 1]))
        {
            codePoint = unicode::UTF16Decode(chars[pos - 1], unit);
            pos--;
        }

        if (!u_hasBinaryProperty(codePoint, UCHAR_CASE_IGNORABLE)) {
            precededByCased = u_hasBinaryProperty(codePoint, UCHAR_CASED);
            break;
        }
    }

    // Without a preceding cased character this can't be a final sigma.
    if (!precededByCased)
        return unicode::GREEK_SMALL_LETTER_SIGMA;

    // Walk forwards to the nearest code point which is not Case_Ignorable,
    // applying the same skipping rule as above.
    bool followedByCased = false;
    pos = index + 1;
    while (pos < length) {
        char16_t unit = chars[pos++];
        uint32_t codePoint = unit;

        // Combine a lead surrogate with the trail surrogate following it.
        if (unicode::IsLeadSurrogate(unit) && pos < length &&
            unicode::IsTrailSurrogate(chars[pos]))
        {
            codePoint = unicode::UTF16Decode(unit, chars[pos]);
            pos++;
        }

        if (!u_hasBinaryProperty(codePoint, UCHAR_CASE_IGNORABLE)) {
            followedByCased = u_hasBinaryProperty(codePoint, UCHAR_CASED);
            break;
        }
    }

    return followedByCased
           ? unicode::GREEK_SMALL_LETTER_SIGMA
           : unicode::GREEK_SMALL_LETTER_FINAL_SIGMA;
}
// Latin-1 counterpart of the char16_t Final_Sigma overload. ToLowerCaseImpl
// guards its Final_Sigma call with an ordinary |if| on the character type,
// so the call is still compiled for Latin1Char and needs an overload to
// resolve to. It must never run; the return value is a placeholder.
static Latin1Char
Final_Sigma(const Latin1Char* chars, size_t length, size_t index)
{
MOZ_ASSERT_UNREACHABLE("U+03A3 is not a Latin-1 character");
return 0;
}
// Lower-cases |srcChars[startIndex..srcLength)| into |destChars|, writing the
// output starting at index |startIndex| and terminating it with '\0' at
// |destChars[destLength]|.
//
// Returns |srcLength| when the whole input was converted. Returns the index
// of the first U+0130 character otherwise; this only happens when
// |srcLength == destLength|, and the caller is then expected to grow the
// destination and call again starting from the returned index.
//
// If |srcLength == destLength| is true, the destination buffer was allocated
// with the same size as the source buffer. When we append characters which
// have special casing mappings, we test |srcLength == destLength| to decide
// if we need to back out and reallocate a sufficiently large destination
// buffer. Otherwise the destination buffer was allocated with the correct
// size to hold all lower case mapped characters, i.e.
// |destLength == ToLowerCaseLength(srcChars, 0, srcLength)| is true.
template <typename CharT>
static size_t
ToLowerCaseImpl(CharT* destChars, const CharT* srcChars, size_t startIndex, size_t srcLength,
size_t destLength)
{
MOZ_ASSERT(startIndex < srcLength);
MOZ_ASSERT(srcLength <= destLength);
MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), srcLength == destLength);
// |i| indexes the source, |j| the destination. Both start at |startIndex|
// and only diverge once a character expands to more than one code unit.
size_t j = startIndex;
for (size_t i = startIndex; i < srcLength; i++) {
char16_t c = srcChars[i];
// Surrogate pairs and the two special cases below only exist in two-byte
// strings.
if (!IsSame<CharT, Latin1Char>::value) {
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
char16_t trail = srcChars[i + 1];
if (unicode::IsTrailSurrogate(trail)) {
// Only the trail unit is mapped; the lead unit is copied as is.
trail = unicode::ToLowerCaseNonBMPTrail(c, trail);
destChars[j++] = c;
destChars[j++] = trail;
i++;
continue;
}
}
// Special case: U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
// lowercases to <U+0069 U+0307>.
if (c == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
// Return if the output buffer is too small.
if (srcLength == destLength)
return i;
destChars[j++] = CharT('i');
destChars[j++] = CharT(unicode::COMBINING_DOT_ABOVE);
continue;
}
// Special case: U+03A3 GREEK CAPITAL LETTER SIGMA lowercases to
// one of two codepoints depending on context.
if (c == unicode::GREEK_CAPITAL_LETTER_SIGMA) {
destChars[j++] = Final_Sigma(srcChars, srcLength, i);
continue;
}
}
// Everything else uses the simple one-to-one mapping.
c = unicode::ToLowerCase(c);
MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
destChars[j++] = c;
}
MOZ_ASSERT(j == destLength);
destChars[destLength] = '\0';
return srcLength;
}
// Computes the number of code units needed to hold the lower case form of
// |chars[0..length)|. Only characters at or after |startIndex| are
// inspected; earlier ones are counted as mapping to a single code unit.
static size_t
ToLowerCaseLength(const char16_t* chars, size_t startIndex, size_t length)
{
    // U+0130 is lowercased to the two-element sequence <U+0069 U+0307>, so
    // every occurrence needs one additional code unit.
    size_t extraUnits = 0;
    for (size_t i = startIndex; i < length; i++) {
        if (chars[i] == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
            extraUnits++;
    }
    return length + extraUnits;
}
// Latin-1 counterpart of the char16_t ToLowerCaseLength overload. It exists
// only so that templated callers have an overload to resolve to for
// Latin1Char; it must never run and its return value is a placeholder.
static size_t
ToLowerCaseLength(const Latin1Char* chars, size_t startIndex, size_t length)
{
MOZ_ASSERT_UNREACHABLE("never called for Latin-1 strings");
return 0;
}
template <typename CharT>
static JSString*
ToLowerCase(JSContext* cx, JSLinearString* str)
{
// Unlike toUpperCase, toLowerCase has the nice invariant that if the input
// is a Latin1 string, the output is also a Latin1 string.
UniquePtr<CharT[], JS::FreePolicy> newChars;
size_t length = str->length();
// Unlike toUpperCase, toLowerCase has the nice invariant that if the
// input is a Latin-1 string, the output is also a Latin-1 string.
using AnyCharPtr = UniquePtr<CharT[], JS::FreePolicy>;
AnyCharPtr newChars;
const size_t length = str->length();
size_t resultLength;
{
AutoCheckCannotGC nogc;
const CharT* chars = str->chars<CharT>(nogc);
// Look for the first upper case character.
// We don't need extra special casing checks in the loop below,
// because U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+03A3
// GREEK CAPITAL LETTER SIGMA already have simple lower case mappings.
MOZ_ASSERT(unicode::CanLowerCase(unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE),
"U+0130 has a simple lower case mapping");
MOZ_ASSERT(unicode::CanLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA),
"U+03A3 has a simple lower case mapping");
// Look for the first character that changes when lowercased.
size_t i = 0;
for (; i < length; i++) {
char16_t c = chars[i];
@@ -630,40 +823,35 @@ ToLowerCase(JSContext* cx, JSLinearString* str)
break;
}
// If all characters are lower case, return the input string.
// If no character needs to change, return the input string.
if (i == length)
return str;
newChars = cx->make_pod_array<CharT>(length + 1);
resultLength = length;
newChars = cx->make_pod_array<CharT>(resultLength + 1);
if (!newChars)
return nullptr;
PodCopy(newChars.get(), chars, i);
for (; i < length; i++) {
char16_t c = chars[i];
if (!IsSame<CharT, Latin1Char>::value) {
if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
char16_t trail = chars[i + 1];
if (unicode::IsTrailSurrogate(trail)) {
trail = unicode::ToLowerCaseNonBMPTrail(c, trail);
newChars[i] = c;
newChars[i + 1] = trail;
i++;
continue;
}
}
}
size_t readChars = ToLowerCaseImpl(newChars.get(), chars, i, length, resultLength);
if (readChars < length) {
MOZ_ASSERT((!IsSame<CharT, Latin1Char>::value),
"Latin-1 strings don't have special lower case mappings");
resultLength = ToLowerCaseLength(chars, readChars, length);
c = unicode::ToLowerCase(c);
MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
newChars[i] = c;
AnyCharPtr buf = ReallocChars(cx, Move(newChars), length + 1, resultLength + 1);
if (!buf)
return nullptr;
newChars = Move(buf);
MOZ_ALWAYS_TRUE(length ==
ToLowerCaseImpl(newChars.get(), chars, readChars, length, resultLength));
}
newChars[length] = 0;
}
JSString* res = NewStringDontDeflate<CanGC>(cx, newChars.get(), length);
JSString* res = NewStringDontDeflate<CanGC>(cx, newChars.get(), resultLength);
if (!res)
return nullptr;
@@ -671,21 +859,102 @@ ToLowerCase(JSContext* cx, JSLinearString* str)
return res;
}
static inline bool
ToLowerCaseHelper(JSContext* cx, const CallArgs& args)
// Returns the lower case form of |string|, dispatching on its character
// storage (Latin-1 or two-byte). The result is whatever the matching
// ToLowerCase instantiation returns, including nullptr.
JSString*
js::StringToLowerCase(JSContext* cx, HandleLinearString string)
{
    return string->hasLatin1Chars()
           ? ToLowerCase<Latin1Char>(cx, string)
           : ToLowerCase<char16_t>(cx, string);
}
// Native entry point for toLowerCase: converts the |this| value to a string,
// linearizes it and stores its lower case form in the return value.
// Returns false when any step fails.
bool
js::str_toLowerCase(JSContext* cx, unsigned argc, Value* vp)
{
    CallArgs args = CallArgsFromVp(argc, vp);

    RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
    if (!str)
        return false;

    // |linear| is declared once, as a rooted handle, because
    // StringToLowerCase takes a HandleLinearString.
    RootedLinearString linear(cx, str->ensureLinear(cx));
    if (!linear)
        return false;

    // The Latin-1/two-byte dispatch lives in StringToLowerCase; repeating it
    // here would convert the string twice.
    JSString* result = StringToLowerCase(cx, linear);
    if (!result)
        return false;

    args.rval().setString(result);
    return true;
}
// Maps the language tag in |str| to the locale name to pass to the case
// mapping functions: the two-letter language code when the tag's language
// has language dependent case mappings, otherwise "" (the ICU root locale).
// Returns nullptr when the string can't be linearized.
static const char*
CaseMappingLocale(JSContext* cx, JSString* str)
{
    JSLinearString* locale = str->ensureLinear(cx);
    if (!locale)
        return nullptr;

    MOZ_ASSERT(locale->length() >= 2, "locale is a valid language tag");

    // Lithuanian, Turkish, and Azeri have language dependent case mappings.
    static const char languagesWithSpecialCasing[][3] = { "lt", "tr", "az" };

    // All strings in |languagesWithSpecialCasing| are of length two, so we
    // only need to compare the first two characters to find a matching locale.
    // ES2017 Intl, §9.2.2 BestAvailableLocale
    bool hasTwoLetterLanguage = locale->length() == 2 || locale->latin1OrTwoByteChar(2) == '-';
    if (!hasTwoLetterLanguage)
        return ""; // ICU root locale

    char16_t first = locale->latin1OrTwoByteChar(0);
    char16_t second = locale->latin1OrTwoByteChar(1);
    for (const auto& language : languagesWithSpecialCasing) {
        if (first == language[0] && second == language[1])
            return language;
    }

    return ""; // ICU root locale
}
bool
js::intl_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp)
{
CallArgs args = CallArgsFromVp(argc, vp);
MOZ_ASSERT(args.length() == 2);
MOZ_ASSERT(args[0].isString());
MOZ_ASSERT(args[1].isString());
RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx));
if (!linear)
return false;
const char* locale = CaseMappingLocale(cx, args[1].toString());
if (!locale)
return false;
// Call String.prototype.toLowerCase() for language independent casing.
if (intl::StringsAreEqual(locale, "")) {
JSString* str = StringToLowerCase(cx, linear);
if (!str)
return false;
args.rval().setString(str);
return true;
}
AutoStableStringChars inputChars(cx);
if (!inputChars.initTwoByte(cx, linear))
return false;
mozilla::Range<const char16_t> input = inputChars.twoByteRange();
// Maximum case mapping length is three characters.
static_assert(JSString::MAX_LENGTH < INT32_MAX / 3,
"Case conversion doesn't overflow int32_t indices");
JSString* str = intl::CallICU(cx, [&input, locale](UChar* chars, int32_t size, UErrorCode* status) {
return u_strToLower(chars, size, Char16ToUChar(input.begin().get()), input.length(),
locale, status);
});
if (!str)
return false;
@@ -693,82 +962,192 @@ ToLowerCaseHelper(JSContext* cx, const CallArgs& args)
return true;
}
bool
js::str_toLowerCase(JSContext* cx, unsigned argc, Value* vp)
static inline bool
CanUpperCaseSpecialCasing(Latin1Char charCode)
{
return ToLowerCaseHelper(cx, CallArgsFromVp(argc, vp));
// Handle U+00DF LATIN SMALL LETTER SHARP S inline, all other Latin-1
// characters don't have special casing rules.
MOZ_ASSERT_IF(charCode != unicode::LATIN_SMALL_LETTER_SHARP_S,
!unicode::CanUpperCaseSpecialCasing(charCode));
return charCode == unicode::LATIN_SMALL_LETTER_SHARP_S;
}
bool
js::str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp)
static inline bool
CanUpperCaseSpecialCasing(char16_t charCode)
{
CallArgs args = CallArgsFromVp(argc, vp);
/*
* Forcefully ignore the first (or any) argument and return toLowerCase(),
* ECMA has reserved that argument, presumably for defining the locale.
*/
if (cx->runtime()->localeCallbacks && cx->runtime()->localeCallbacks->localeToLowerCase) {
RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
if (!str)
return false;
RootedValue result(cx);
if (!cx->runtime()->localeCallbacks->localeToLowerCase(cx, str, &result))
return false;
args.rval().set(result);
return true;
}
return ToLowerCaseHelper(cx, args);
return unicode::CanUpperCaseSpecialCasing(charCode);
}
// Latin-1 overload: returns the number of characters in the special upper
// case mapping of |charCode|. The assertion restricts the input to
// U+00DF, so the answer is a constant.
static inline size_t
LengthUpperCaseSpecialCasing(Latin1Char charCode)
{
// U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'.
MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);
return 2;
}
// Two-byte overload: returns the length of the special upper case mapping of
// |charCode| as reported by the unicode tables. Callers must only pass
// characters for which CanUpperCaseSpecialCasing returns true.
static inline size_t
LengthUpperCaseSpecialCasing(char16_t charCode)
{
MOZ_ASSERT(CanUpperCaseSpecialCasing(charCode));
return unicode::LengthUpperCaseSpecialCasing(charCode);
}
// Latin-1 destination overload: writes the special upper case mapping of
// |charCode| into |elements| starting at |*index| and advances |*index| past
// the written characters.
static inline void
AppendUpperCaseSpecialCasing(char16_t charCode, Latin1Char* elements, size_t* index)
{
    // U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'.
    MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);
    static_assert('S' <= JSString::MAX_LATIN1_CHAR, "'S' is a Latin-1 character");

    size_t pos = *index;
    elements[pos] = 'S';
    elements[pos + 1] = 'S';
    *index = pos + 2;
}
// Two-byte destination overload: forwards to the unicode table helper of the
// same name, passing |elements| and |index| through unchanged.
static inline void
AppendUpperCaseSpecialCasing(char16_t charCode, char16_t* elements, size_t* index)
{
unicode::AppendUpperCaseSpecialCasing(charCode, elements, index);
}
// See ToLowerCaseImpl for an explanation of the parameters.
template <typename DestChar, typename SrcChar>
static void
ToUpperCaseImpl(DestChar* destChars, const SrcChar* srcChars, size_t firstLowerCase, size_t length)
static size_t
ToUpperCaseImpl(DestChar* destChars, const SrcChar* srcChars, size_t startIndex, size_t srcLength,
size_t destLength)
{
MOZ_ASSERT(firstLowerCase < length);
static_assert(IsSame<SrcChar, Latin1Char>::value || !IsSame<DestChar, Latin1Char>::value,
"cannot write non-Latin-1 characters into Latin-1 string");
MOZ_ASSERT(startIndex < srcLength);
MOZ_ASSERT(srcLength <= destLength);
for (size_t i = 0; i < firstLowerCase; i++)
destChars[i] = srcChars[i];
for (size_t i = firstLowerCase; i < length; i++) {
size_t j = startIndex;
for (size_t i = startIndex; i < srcLength; i++) {
char16_t c = srcChars[i];
if (!IsSame<DestChar, Latin1Char>::value) {
if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
char16_t trail = srcChars[i + 1];
if (unicode::IsTrailSurrogate(trail)) {
trail = unicode::ToUpperCaseNonBMPTrail(c, trail);
destChars[i] = c;
destChars[i + 1] = trail;
destChars[j++] = c;
destChars[j++] = trail;
i++;
continue;
}
}
}
if (MOZ_UNLIKELY(c > 0x7f && CanUpperCaseSpecialCasing(static_cast<SrcChar>(c)))) {
// Return if the output buffer is too small.
if (srcLength == destLength)
return i;
AppendUpperCaseSpecialCasing(c, destChars, &j);
continue;
}
c = unicode::ToUpperCase(c);
MOZ_ASSERT_IF((IsSame<DestChar, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
destChars[i] = c;
destChars[j++] = c;
}
destChars[length] = '\0';
MOZ_ASSERT(j == destLength);
destChars[destLength] = '\0';
return srcLength;
}
// Non-template overload for the (Latin-1 destination, two-byte source)
// combination. Overload resolution prefers it over the template, so the
// template's static_assert is never instantiated for this combination.
// It must never run; the return type matches the template's size_t so both
// overloads can be used interchangeably by templated callers.
static size_t
ToUpperCaseImpl(Latin1Char* destChars, const char16_t* srcChars, size_t startIndex,
                size_t srcLength, size_t destLength)
{
    MOZ_ASSERT_UNREACHABLE("cannot write non-Latin-1 characters into Latin-1 string");
    return 0;
}
// Computes the number of characters needed to hold the upper case form of
// |chars[0..length)|. Only characters at or after |startIndex| are
// inspected; earlier ones are counted as mapping to a single character.
template <typename CharT>
static size_t
ToUpperCaseLength(const CharT* chars, size_t startIndex, size_t length)
{
    size_t extraChars = 0;
    for (size_t i = startIndex; i < length; i++) {
        CharT unit = chars[i];

        // ASCII characters are excluded up front, before the
        // special-casing lookup.
        if (unit <= 0x7f)
            continue;
        if (!CanUpperCaseSpecialCasing(unit))
            continue;

        // One character is already accounted for in |length|.
        extraChars += LengthUpperCaseSpecialCasing(unit) - 1;
    }
    return length + extraChars;
}
// Copies |length| characters from |srcChars| to |destChars|, converting each
// one from SrcChar to DestChar by assignment. Only used when the two types
// differ; the same-type overload handles the other case.
template <typename DestChar, typename SrcChar>
static inline void
CopyChars(DestChar* destChars, const SrcChar* srcChars, size_t length)
{
static_assert(!IsSame<DestChar, SrcChar>::value, "PodCopy is used for the same type case");
for (size_t i = 0; i < length; i++)
destChars[i] = srcChars[i];
}
// Copies |length| characters between buffers of the same character type by
// delegating to PodCopy.
template <typename CharT>
static inline void
CopyChars(CharT* destChars, const CharT* srcChars, size_t length)
{
PodCopy(destChars, srcChars, length);
}
// Allocates a buffer and fills it with the upper case form of
// |chars[0..length)|. Characters before |startIndex| are copied unchanged.
// On success |*resultLength| holds the number of characters in the result,
// excluding the terminating '\0'. Returns a null pointer when an allocation
// fails.
template <typename DestChar, typename SrcChar>
static inline UniquePtr<DestChar[], JS::FreePolicy>
ToUpperCase(JSContext* cx, const SrcChar* chars, size_t startIndex, size_t length,
            size_t* resultLength)
{
    MOZ_ASSERT(startIndex < length);

    using DestCharPtr = UniquePtr<DestChar[], JS::FreePolicy>;

    // First attempt: assume the result is exactly as long as the input.
    *resultLength = length;
    DestCharPtr result = cx->make_pod_array<DestChar>(length + 1);
    if (!result)
        return result;

    CopyChars(result.get(), chars, startIndex);

    size_t consumed = ToUpperCaseImpl(result.get(), chars, startIndex, length, length);
    if (consumed >= length)
        return result;

    // A character with a special casing mapping was found at |consumed|:
    // compute the real length, grow the buffer and resume from there.
    size_t expandedLength = ToUpperCaseLength(chars, consumed, length);
    *resultLength = expandedLength;

    DestCharPtr grown = ReallocChars(cx, Move(result), length + 1, expandedLength + 1);
    if (!grown)
        return grown;

    MOZ_ALWAYS_TRUE(length ==
                    ToUpperCaseImpl(grown.get(), chars, consumed, length, expandedLength));

    return grown;
}
template <typename CharT>
static JSString*
ToUpperCase(JSContext* cx, JSLinearString* str)
{
typedef UniquePtr<Latin1Char[], JS::FreePolicy> Latin1CharPtr;
typedef UniquePtr<char16_t[], JS::FreePolicy> TwoByteCharPtr;
using Latin1CharPtr = UniquePtr<Latin1Char[], JS::FreePolicy>;
using TwoByteCharPtr = UniquePtr<char16_t[], JS::FreePolicy>;
mozilla::MaybeOneOf<Latin1CharPtr, TwoByteCharPtr> newChars;
size_t length = str->length();
const size_t length = str->length();
size_t resultLength;
{
AutoCheckCannotGC nogc;
const CharT* chars = str->chars<CharT>(nogc);
// Look for the first lower case character.
// Look for the first character that changes when uppercased.
size_t i = 0;
for (; i < length; i++) {
char16_t c = chars[i];
@@ -786,21 +1165,33 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
}
if (unicode::CanUpperCase(c))
break;
if (MOZ_UNLIKELY(c > 0x7f && CanUpperCaseSpecialCasing(static_cast<CharT>(c))))
break;
}
// If all characters are upper case, return the input string.
// If no character needs to change, return the input string.
if (i == length)
return str;
// If the string is Latin1, check if it contains the MICRO SIGN (0xb5)
// or SMALL LETTER Y WITH DIAERESIS (0xff) character. The corresponding
// upper case characters are not in the Latin1 range.
// The string changes when uppercased, so we must create a new string.
// Can it be Latin-1?
//
// If the original string is Latin-1, it can -- unless the string
// contains U+00B5 MICRO SIGN or U+00FF SMALL LETTER Y WITH DIAERESIS,
// the only Latin-1 codepoints that don't uppercase within Latin-1.
// Search for those codepoints to decide whether the new string can be
// Latin-1.
// If the original string is a two-byte string, its uppercase form is
// so rarely Latin-1 that we don't even consider creating a new
// Latin-1 string.
bool resultIsLatin1;
if (IsSame<CharT, Latin1Char>::value) {
resultIsLatin1 = true;
for (size_t j = i; j < length; j++) {
Latin1Char c = chars[j];
if (c == 0xb5 || c == 0xff) {
if (c == unicode::MICRO_SIGN ||
c == unicode::LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)
{
MOZ_ASSERT(unicode::ToUpperCase(c) > JSString::MAX_LATIN1_CHAR);
resultIsLatin1 = false;
break;
@@ -813,31 +1204,29 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
}
if (resultIsLatin1) {
Latin1CharPtr buf = cx->make_pod_array<Latin1Char>(length + 1);
Latin1CharPtr buf = ToUpperCase<Latin1Char>(cx, chars, i, length, &resultLength);
if (!buf)
return nullptr;
ToUpperCaseImpl(buf.get(), chars, i, length);
newChars.construct<Latin1CharPtr>(Move(buf));
} else {
TwoByteCharPtr buf = cx->make_pod_array<char16_t>(length + 1);
TwoByteCharPtr buf = ToUpperCase<char16_t>(cx, chars, i, length, &resultLength);
if (!buf)
return nullptr;
ToUpperCaseImpl(buf.get(), chars, i, length);
newChars.construct<TwoByteCharPtr>(Move(buf));
}
}
JSString* res;
if (newChars.constructed<Latin1CharPtr>()) {
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<Latin1CharPtr>().get(), length);
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<Latin1CharPtr>().get(), resultLength);
if (!res)
return nullptr;
mozilla::Unused << newChars.ref<Latin1CharPtr>().release();
} else {
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<TwoByteCharPtr>().get(), length);
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<TwoByteCharPtr>().get(), resultLength);
if (!res)
return nullptr;
@@ -847,57 +1236,79 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
return res;
}
static bool
ToUpperCaseHelper(JSContext* cx, const CallArgs& args)
JSString*
js::StringToUpperCase(JSContext* cx, HandleLinearString string)
{
RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
if (!str)
return false;
JSLinearString* linear = str->ensureLinear(cx);
if (!linear)
return false;
if (linear->hasLatin1Chars())
str = ToUpperCase<Latin1Char>(cx, linear);
else
str = ToUpperCase<char16_t>(cx, linear);
if (!str)
return false;
args.rval().setString(str);
return true;
if (string->hasLatin1Chars())
return ToUpperCase<Latin1Char>(cx, string);
return ToUpperCase<char16_t>(cx, string);
}
// Native entry point for toUpperCase: converts the |this| value to a string,
// linearizes it and stores its upper case form in the return value.
// Returns false when any step fails.
bool
js::str_toUpperCase(JSContext* cx, unsigned argc, Value* vp)
{
    CallArgs args = CallArgsFromVp(argc, vp);

    RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
    if (!str)
        return false;

    // StringToUpperCase takes a HandleLinearString, hence the rooted handle.
    RootedLinearString linear(cx, str->ensureLinear(cx));
    if (!linear)
        return false;

    JSString* result = StringToUpperCase(cx, linear);
    if (!result)
        return false;

    args.rval().setString(result);
    return true;
}
// Intrinsic called as intl_toLocaleUpperCase(string, locale).
//
// Both arguments must be strings; |locale| is expected to be a language tag
// of at least two characters (asserted in CaseMappingLocale). Stores the
// upper case form of |string| for that locale in the return value and
// returns false when any step fails.
bool
js::intl_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp)
{
    CallArgs args = CallArgsFromVp(argc, vp);
    MOZ_ASSERT(args.length() == 2);
    MOZ_ASSERT(args[0].isString());
    MOZ_ASSERT(args[1].isString());

    RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx));
    if (!linear)
        return false;

    const char* locale = CaseMappingLocale(cx, args[1].toString());
    if (!locale)
        return false;

    // Call String.prototype.toUpperCase() for language independent casing.
    if (intl::StringsAreEqual(locale, "")) {
        JSString* str = StringToUpperCase(cx, linear);
        if (!str)
            return false;

        args.rval().setString(str);
        return true;
    }

    // Language dependent casing is delegated to ICU, which operates on
    // two-byte characters.
    AutoStableStringChars inputChars(cx);
    if (!inputChars.initTwoByte(cx, linear))
        return false;
    mozilla::Range<const char16_t> input = inputChars.twoByteRange();

    // Maximum case mapping length is three characters.
    static_assert(JSString::MAX_LENGTH < INT32_MAX / 3,
                  "Case conversion doesn't overflow int32_t indices");

    JSString* str = intl::CallICU(cx, [&input, locale](UChar* chars, int32_t size, UErrorCode* status) {
        return u_strToUpper(chars, size, Char16ToUChar(input.begin().get()), input.length(),
                            locale, status);
    });
    if (!str)
        return false;

    args.rval().setString(str);
    return true;
}
/* ES2017 21.1.3.12. */
@@ -944,7 +1355,7 @@ js::str_normalize(JSContext* cx, unsigned argc, Value* vp)
if (!linear)
return false;
// Latin1 strings are already in Normalization Form C.
// Latin-1 strings are already in Normalization Form C.
if (form == NFC && linear->hasLatin1Chars()) {
// Step 7.
args.rval().setString(str);
@@ -1359,7 +1770,7 @@ StringMatch(const TextChar* text, uint32_t textLen, const PatChar* pat, uint32_t
/*
* For big patterns with large potential overlap we want the SIMD-optimized
* speed of memcmp. For small patterns, a simple loop is faster. We also can't
* use memcmp if one of the strings is TwoByte and the other is Latin1.
* use memcmp if one of the strings is TwoByte and the other is Latin-1.
*
* FIXME: Linux memcmp performance is sad and the manual loop is faster.
*/
@@ -1555,7 +1966,7 @@ RopeMatch(JSContext* cx, JSRope* text, JSLinearString* pat, int* match)
* need to build the list of leaf nodes. Do both here: iterate over the
* nodes so long as there are not too many.
*
* We also don't use rope matching if the rope contains both Latin1 and
* We also don't use rope matching if the rope contains both Latin-1 and
* TwoByte nodes, to simplify the match algorithm.
*/
{
@@ -2890,8 +3301,8 @@ static const JSFunctionSpec string_methods[] = {
JS_FN("trimStart", str_trimStart, 0,0),
JS_FN("trimRight", str_trimEnd, 0,0),
JS_FN("trimEnd", str_trimEnd, 0,0),
JS_FN("toLocaleLowerCase", str_toLocaleLowerCase, 0,0),
JS_FN("toLocaleUpperCase", str_toLocaleUpperCase, 0,0),
JS_SELF_HOSTED_FN("toLocaleLowerCase", "String_toLocaleLowerCase", 0,0),
JS_SELF_HOSTED_FN("toLocaleUpperCase", "String_toLocaleUpperCase", 0,0),
JS_SELF_HOSTED_FN("localeCompare", "String_localeCompare", 1,0),
JS_SELF_HOSTED_FN("repeat", "String_repeat", 1,0),
JS_FN("normalize", str_normalize, 0,0),
@@ -3000,7 +3411,7 @@ js::str_fromCharCode(JSContext* cx, unsigned argc, Value* vp)
// string (thin or fat) and so we don't need to malloc the chars. (We could
// cover some cases where args.length() goes up to
// JSFatInlineString::MAX_LENGTH_LATIN1 if we also checked if the chars are
// all Latin1, but it doesn't seem worth the effort.)
// all Latin-1, but it doesn't seem worth the effort.)
if (args.length() <= JSFatInlineString::MAX_LENGTH_TWO_BYTE)
return str_fromCharCode_few_args(cx, args);
@@ -3143,7 +3554,7 @@ js::str_fromCodePoint(JSContext* cx, unsigned argc, Value* vp)
// string (thin or fat) and so we don't need to malloc the chars. (We could
// cover some cases where |args.length()| goes up to
// JSFatInlineString::MAX_LENGTH_LATIN1 / 2 if we also checked if the chars
// are all Latin1, but it doesn't seem worth the effort.)
// are all Latin-1, but it doesn't seem worth the effort.)
if (args.length() <= JSFatInlineString::MAX_LENGTH_TWO_BYTE / 2)
return str_fromCodePoint_few_args(cx, args);
+23 -4
View File
@@ -371,11 +371,24 @@ str_trimStart(JSContext* cx, unsigned argc, Value* vp);
extern bool
str_trimEnd(JSContext* cx, unsigned argc, Value* vp);
extern bool
str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp);
/**
* Returns the input string converted to lower case based on the language
* specific case mappings for the input locale.
*
* Usage: lowerCase = intl_toLocaleLowerCase(string, locale)
*/
extern MOZ_MUST_USE bool
intl_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp);
/**
* Returns the input string converted to upper case based on the language
* specific case mappings for the input locale.
*
* Usage: upperCase = intl_toLocaleUpperCase(string, locale)
*/
extern MOZ_MUST_USE bool
intl_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp);
extern bool
str_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp);
extern bool
str_normalize(JSContext* cx, unsigned argc, Value* vp);
@@ -480,6 +493,12 @@ JSString*
str_replaceAll_string_raw(JSContext* cx, HandleString string, HandleString pattern,
HandleString replacement);
extern JSString*
StringToLowerCase(JSContext* cx, HandleLinearString string);
extern JSString*
StringToUpperCase(JSContext* cx, HandleLinearString string);
extern bool
StringConstructor(JSContext* cx, unsigned argc, Value* vp);
+3 -3
View File
@@ -2207,11 +2207,9 @@ static const JSFunctionSpec intrinsic_functions[] = {
JS_FN("std_String_trimStart", str_trimStart, 0,0),
JS_FN("std_String_trimRight", str_trimEnd, 0,0),
JS_FN("std_String_trimEnd", str_trimEnd, 0,0),
JS_FN("std_String_toLocaleLowerCase", str_toLocaleLowerCase, 0,0),
JS_FN("std_String_toLocaleUpperCase", str_toLocaleUpperCase, 0,0),
JS_FN("std_String_normalize", str_normalize, 0,0),
JS_FN("std_String_concat", str_concat, 1,0),
JS_FN("std_TypedArray_buffer", js::TypedArray_bufferGetter, 1,0),
JS_FN("std_WeakMap_has", WeakMap_has, 1,0),
@@ -2485,6 +2483,8 @@ static const JSFunctionSpec intrinsic_functions[] = {
JS_FN("intl_PluralRules_availableLocales", intl_PluralRules_availableLocales, 0,0),
JS_FN("intl_GetPluralCategories", intl_GetPluralCategories, 2, 0),
JS_FN("intl_SelectPluralRule", intl_SelectPluralRule, 2,0),
JS_FN("intl_toLocaleLowerCase", intl_toLocaleLowerCase, 2,0),
JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2,0),
JS_FN("intl_RelativeTimeFormat_availableLocales", intl_RelativeTimeFormat_availableLocales, 0,0),
JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 3,0),
+281
View File
@@ -0,0 +1,281 @@
# SpecialCasing-11.0.0.txt
# Date: 2018-02-22, 06:16:47 GMT
# © 2018 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Special Casing
#
# This file is a supplement to the UnicodeData.txt file. It does not define any
# properties, but rather provides additional information about the casing of
# Unicode characters, for situations when casing incurs a change in string length
# or is dependent on context or locale. For compatibility, the UnicodeData.txt
# file only contains simple case mappings for characters where they are one-to-one
# and independent of context and language. The data in this file, combined with
# the simple case mappings in UnicodeData.txt, defines the full case mappings
# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
#
# Note that the preferred mechanism for defining tailored casing operations is
# the Unicode Common Locale Data Repository (CLDR). For more information, see the
# discussion of case mappings and case algorithms in the Unicode Standard.
#
# All code points not listed in this file that do not have a simple case mappings
# in UnicodeData.txt map to themselves.
# ================================================================================
# Format
# ================================================================================
# The entries in this file are in the following machine-readable format:
#
# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
#
# <code>, <lower>, <title>, and <upper> provide the respective full case mappings
# of <code>, expressed as character values in hex. If there is more than one character,
# they are separated by spaces. Other than as used to separate elements, spaces are
# to be ignored.
#
# The <condition_list> is optional. Where present, it consists of one or more language IDs
# or casing contexts, separated by spaces. In these conditions:
# - A condition list overrides the normal behavior if all of the listed conditions are true.
# - The casing context is always the context of the characters in the original string,
# NOT in the resulting string.
# - Case distinctions in the condition list are not significant.
# - Conditions preceded by "Not_" represent the negation of the condition.
# The condition list is not represented in the UCD as a formal property.
#
# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
#
# A casing context for a character is defined by Section 3.13 Default Case Algorithms
# of The Unicode Standard.
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts
# * Additional fields
# ================================================================================
# ================================================================================
# Unconditional mappings
# ================================================================================
# The German es-zed is special--the normal mapping is to SS.
# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))
00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
# Preserve canonical equivalence for I with dot. Turkic is handled below.
0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
# Ligatures
FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI
FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL
FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI
FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL
FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T
FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST
0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN
FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW
FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH
FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI
FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW
FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
# No corresponding uppercase precomposed character
0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON
1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW
1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS
1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE
1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI
1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
# IMPORTANT-when iota-subscript (0345) is uppercased or titlecased,
# the result will be incorrect unless the iota-subscript is moved to the end
# of any sequence of combining marks. Otherwise, the accents will go on the capital iota.
# This process can be achieved by first transforming the text to NFC before casing.
# E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA>
# The following cases are already in the UnicodeData.txt file, so are only commented here.
# 0345; 0345; 0399; 0399; # COMBINING GREEK YPOGEGRAMMENI
# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
# have special uppercases.
# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
# Some characters with YPOGEGRAMMENI also have no corresponding titlecases
1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
# ================================================================================
# Conditional Mappings
# The remainder of this file provides conditional casing data used to produce
# full case mappings.
# ================================================================================
# Language-Insensitive Mappings
# These are characters whose full case mappings do not depend on language, but do
# depend on context (which characters come before or after). For more information
# see the header of this file and the Unicode Standard.
# ================================================================================
# Special case for final form of sigma
03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
# Note: the following cases for non-final are already in the UnicodeData.txt file.
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
# Note: the following cases are not included, since they would case-fold in lowercasing
# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
# ================================================================================
# Language-Sensitive Mappings
# These are characters whose full case mappings depend on language and perhaps also
# context (which characters come before or after). For more information
# see the header of this file and the Unicode Standard.
# ================================================================================
# Lithuanian
# Lithuanian retains the dot in a lowercase i when followed by accents.
# Remove DOT ABOVE after "i" with upper or titlecase
0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
# Introduce an explicit dot above when lowercasing capital I's and J's
# whenever there are more accents above.
# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
# ================================================================================
# Turkish and Azeri
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.
0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
# This matches the behavior of the canonically equivalent I-dot_above
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
# When uppercasing, i turns into a dotted capital I
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
# Note: the following case is already in the UnicodeData.txt file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# EOF
+1608 -1008
View File
File diff suppressed because it is too large Load Diff
+55 -2
View File
@@ -62,8 +62,16 @@ namespace CharFlag {
const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY;
}
const char16_t NO_BREAK_SPACE = 0x00A0;
const char16_t MICRO_SIGN = 0x00B5;
const char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
const char16_t LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF;
const char16_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = 0x0130;
const char16_t COMBINING_DOT_ABOVE = 0x0307;
const char16_t GREEK_CAPITAL_LETTER_SIGMA = 0x03A3;
const char16_t GREEK_SMALL_LETTER_FINAL_SIGMA = 0x03C2;
const char16_t GREEK_SMALL_LETTER_SIGMA = 0x03C3;
const char16_t BYTE_ORDER_MARK2 = 0xFFFE;
const char16_t NO_BREAK_SPACE = 0x00A0;
const char16_t LeadSurrogateMin = 0xD800;
const char16_t LeadSurrogateMax = 0xDBFF;
@@ -239,6 +247,10 @@ IsSpaceOrBOM2(char16_t ch)
return CharInfo(ch).isSpace();
}
/*
* Returns the simple upper case mapping (see CanUpperCaseSpecialCasing for
* details) of the given UTF-16 code unit.
*/
inline char16_t
ToUpperCase(char16_t ch)
{
@@ -253,6 +265,10 @@ ToUpperCase(char16_t ch)
return uint16_t(ch) + info.upperCase;
}
/*
* Returns the simple lower case mapping (see CanUpperCaseSpecialCasing for
* details) of the given UTF-16 code unit.
*/
inline char16_t
ToLowerCase(char16_t ch)
{
@@ -329,6 +345,43 @@ ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail)
return trail;
}
/*
* Returns true if the given UTF-16 code unit has a language-independent,
* unconditional or conditional special upper case mapping.
*
* Unicode defines two case mapping modes:
* 1. "simple case mappings" for one-to-one mappings which are independent of
* context and language (defined in UnicodeData.txt).
* 2. "special case mappings" for mappings which can increase or decrease the
* string length; or are dependent on context or locale (defined in
* SpecialCasing.txt).
*
* The CanUpperCase() method defined above only supports simple case mappings.
* In order to support the full case mappings of all Unicode characters,
* callers need to check this method in addition to CanUpperCase().
*
* NOTE: All special upper case mappings are unconditional in Unicode 9.
*/
bool
CanUpperCaseSpecialCasing(char16_t ch);
/*
* Returns the length of the upper case mapping of |ch|.
*
* This function asserts if |ch| doesn't have a special upper case mapping.
*/
size_t
LengthUpperCaseSpecialCasing(char16_t ch);
/*
* Appends the upper case mapping of |ch| to the given output buffer,
* starting at the provided index.
*
* This function asserts if |ch| doesn't have a special upper case mapping.
*/
void
AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index);
/*
* For a codepoint C, CodepointsWithSameUpperCaseInfo stores three offsets
* from C to up to three codepoints with same uppercase (no codepoint in
@@ -504,7 +557,7 @@ UTF16Encode(uint32_t codePoint, char16_t* lead, char16_t* trail)
*trail = TrailSurrogate(codePoint);
}
static inline void
inline void
UTF16Encode(uint32_t codePoint, char16_t* elements, unsigned* index)
{
if (!IsSupplementary(codePoint)) {
+24
View File
@@ -19,6 +19,12 @@
// DIFF: the difference between the code point in the range and
// converted code point
// U+10400 DESERET CAPITAL LETTER LONG I .. U+10427 DESERET CAPITAL LETTER EW
// U+104B0 OSAGE CAPITAL LETTER A .. U+104D3 OSAGE CAPITAL LETTER ZHA
// U+10C80 OLD HUNGARIAN CAPITAL LETTER A .. U+10CB2 OLD HUNGARIAN CAPITAL LETTER US
// U+118A0 WARANG CITI CAPITAL LETTER NGAA .. U+118BF WARANG CITI CAPITAL LETTER VIYO
// U+16E40 MEDEFAIDRIN CAPITAL LETTER M .. U+16E5F MEDEFAIDRIN CAPITAL LETTER Y
// U+1E900 ADLAM CAPITAL LETTER ALIF .. U+1E921 ADLAM CAPITAL LETTER SHA
#define FOR_EACH_NON_BMP_LOWERCASE(macro) \
macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
macro(0x104b0, 0x104d3, 0xd801, 0xdcb0, 0xdcd3, 40) \
@@ -27,6 +33,12 @@
macro(0x16e40, 0x16e5f, 0xd81b, 0xde40, 0xde5f, 32) \
macro(0x1e900, 0x1e921, 0xd83a, 0xdd00, 0xdd21, 34)
// U+10428 DESERET SMALL LETTER LONG I .. U+1044F DESERET SMALL LETTER EW
// U+104D8 OSAGE SMALL LETTER A .. U+104FB OSAGE SMALL LETTER ZHA
// U+10CC0 OLD HUNGARIAN SMALL LETTER A .. U+10CF2 OLD HUNGARIAN SMALL LETTER US
// U+118C0 WARANG CITI SMALL LETTER NGAA .. U+118DF WARANG CITI SMALL LETTER VIYO
// U+16E60 MEDEFAIDRIN SMALL LETTER M .. U+16E7F MEDEFAIDRIN SMALL LETTER Y
// U+1E922 ADLAM SMALL LETTER ALIF .. U+1E943 ADLAM SMALL LETTER SHA
#define FOR_EACH_NON_BMP_UPPERCASE(macro) \
macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
macro(0x104d8, 0x104fb, 0xd801, 0xdcd8, 0xdcfb, -40) \
@@ -35,6 +47,12 @@
macro(0x16e60, 0x16e7f, 0xd81b, 0xde60, 0xde7f, -32) \
macro(0x1e922, 0x1e943, 0xd83a, 0xdd22, 0xdd43, -34)
// U+10400 DESERET CAPITAL LETTER LONG I .. U+10427 DESERET CAPITAL LETTER EW
// U+104B0 OSAGE CAPITAL LETTER A .. U+104D3 OSAGE CAPITAL LETTER ZHA
// U+10C80 OLD HUNGARIAN CAPITAL LETTER A .. U+10CB2 OLD HUNGARIAN CAPITAL LETTER US
// U+118A0 WARANG CITI CAPITAL LETTER NGAA .. U+118BF WARANG CITI CAPITAL LETTER VIYO
// U+16E40 MEDEFAIDRIN CAPITAL LETTER M .. U+16E5F MEDEFAIDRIN CAPITAL LETTER Y
// U+1E900 ADLAM CAPITAL LETTER ALIF .. U+1E921 ADLAM CAPITAL LETTER SHA
#define FOR_EACH_NON_BMP_CASE_FOLDING(macro) \
macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
macro(0x104b0, 0x104d3, 0xd801, 0xdcb0, 0xdcd3, 40) \
@@ -43,6 +61,12 @@
macro(0x16e40, 0x16e5f, 0xd81b, 0xde40, 0xde5f, 32) \
macro(0x1e900, 0x1e921, 0xd83a, 0xdd00, 0xdd21, 34)
// U+10428 DESERET SMALL LETTER LONG I .. U+1044F DESERET SMALL LETTER EW
// U+104D8 OSAGE SMALL LETTER A .. U+104FB OSAGE SMALL LETTER ZHA
// U+10CC0 OLD HUNGARIAN SMALL LETTER A .. U+10CF2 OLD HUNGARIAN SMALL LETTER US
// U+118C0 WARANG CITI SMALL LETTER NGAA .. U+118DF WARANG CITI SMALL LETTER VIYO
// U+16E60 MEDEFAIDRIN SMALL LETTER M .. U+16E7F MEDEFAIDRIN SMALL LETTER Y
// U+1E922 ADLAM SMALL LETTER ALIF .. U+1E943 ADLAM SMALL LETTER SHA
#define FOR_EACH_NON_BMP_REV_CASE_FOLDING(macro) \
macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
macro(0x104d8, 0x104fb, 0xd801, 0xdcd8, 0xdcfb, -40) \
+515 -152
View File
@@ -26,6 +26,18 @@ import re
import os
import sys
from contextlib import closing
from functools import partial
from itertools import chain, groupby, ifilter, imap, izip_longest, tee
from operator import is_not, itemgetter
class codepoint_dict(dict):
    """Dictionary keyed by code point whose values are 4-tuples ending in
    (name, alias); provides helpers to format a readable character name."""

    def name(self, code_point):
        """Return the character name, followed by ' (<alias>)' when the
        entry carries a non-empty alias."""
        (_upper, _lower, char_name, alias) = self[code_point]
        suffix = ''
        if alias:
            suffix = ' (' + alias + ')'
        return '{}{}'.format(char_name, suffix)

    def full_name(self, code_point):
        """Return 'U+XXXX <name>' (at least four upper-case hex digits),
        followed by ' (<alias>)' when the entry carries a non-empty alias."""
        (_upper, _lower, char_name, alias) = self[code_point]
        suffix = ''
        if alias:
            suffix = ' (' + alias + ')'
        return 'U+{:04X} {}{}'.format(code_point, char_name, suffix)
# ECMAScript 2016
# §11.2 White Space
@@ -132,10 +144,32 @@ def read_derived_core_properties(derived_core_properties):
for char in range(int(start, 16), int(end, 16) + 1):
yield (char, char_property)
def read_special_casing(special_casing):
    """Parse SpecialCasing.txt lines and yield one tuple per data row.

    Each row has the format:
      <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>

    Yields (code, lower, upper, languages, contexts) where |code| is an int,
    |lower| and |upper| are lists of ints (empty when the field is empty),
    and the condition list is split into |languages| (entries starting with
    a lower-case letter) and |contexts| (all other entries). The <title>
    field is ignored.
    """
    for line in special_casing:
        # Drop the trailing comment; this also turns comment-only,
        # blank and whitespace-only lines (including '\r\n') into ''.
        data = line.split('#')[0].strip()
        if not data:
            continue

        row = [field.strip() for field in data.split(';')]
        code = int(row[0], 16)

        # split() without arguments tolerates repeated spaces and yields an
        # empty list for an empty field.
        lower = [int(c, 16) for c in row[1].split()]
        upper = [int(c, 16) for c in row[3].split()]

        languages = []
        contexts = []
        # The condition list is optional; the field may be empty or absent.
        conditions = row[4].split() if len(row) > 4 else []
        for cond in conditions:
            if cond[0].islower():
                languages.append(cond)
            else:
                contexts.append(cond)

        yield (code, lower, upper, languages, contexts)
def int_ranges(ints):
""" Yields consecutive ranges (inclusive) from integer values. """
from itertools import tee, izip_longest
(a, b) = tee(sorted(ints))
start = next(b)
for (curr, succ) in izip_longest(a, b):
@@ -153,7 +187,7 @@ def utf16_encode(code):
return lead, trail
def make_non_bmp_convert_macro(out_file, name, convert_map):
def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table):
# Find continuous range in convert_map.
convert_list = []
entry = None
@@ -179,6 +213,7 @@ def make_non_bmp_convert_macro(out_file, name, convert_map):
# Generate macro call for each range.
lines = []
comment = []
for entry in convert_list:
from_code = entry['code']
to_code = entry['code'] + entry['length'] - 1
@@ -190,29 +225,15 @@ def make_non_bmp_convert_macro(out_file, name, convert_map):
lines.append(' macro(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})'.format(
from_code, to_code, lead, from_trail, to_trail, diff))
comment.append('// {} .. {}'.format(codepoint_table.full_name(from_code),
codepoint_table.full_name(to_code)))
out_file.write('\n'.join(comment))
out_file.write('\n')
out_file.write('#define FOR_EACH_NON_BMP_{}(macro) \\\n'.format(name))
out_file.write(' \\\n'.join(lines))
out_file.write('\n')
def for_each_non_bmp_group(group_set):
    """Yield inclusive (first, last) pairs, in ascending order, for every
    run of consecutive integer keys in |group_set|."""
    # Collect the runs first: each run is a mutable [first, last] pair whose
    # end is extended while the sorted keys stay consecutive.
    runs = []
    for code in sorted(group_set.keys()):
        if runs and code == runs[-1][1] + 1:
            runs[-1][1] = code
        else:
            runs.append([code, code])

    for (first, last) in runs:
        yield (first, last)
def process_derived_core_properties(derived_core_properties):
id_start = set()
id_continue = set()
@@ -236,7 +257,7 @@ def process_unicode_data(unicode_data, derived_core_properties):
same_upper_cache = {same_upper_dummy: 0}
same_upper_index = [0] * (MAX_BMP + 1)
test_table = {}
codepoint_table = codepoint_dict()
test_space_table = []
non_bmp_lower_map = {}
@@ -254,15 +275,9 @@ def process_unicode_data(unicode_data, derived_core_properties):
alias = row[-5]
uppercase = row[-3]
lowercase = row[-2]
flags = 0
if uppercase:
upper = int(uppercase, 16)
if upper not in same_upper_map:
same_upper_map[upper] = [code]
else:
same_upper_map[upper].append(code)
else:
upper = code
@@ -271,6 +286,8 @@ def process_unicode_data(unicode_data, derived_core_properties):
else:
lower = code
codepoint_table[code] = (upper, lower, name, alias)
if code > MAX_BMP:
if code != lower:
non_bmp_lower_map[code] = lower
@@ -285,6 +302,16 @@ def process_unicode_data(unicode_data, derived_core_properties):
non_bmp_id_cont_set[code] = 1
continue
assert lower <= MAX_BMP and upper <= MAX_BMP
if code != upper:
if upper not in same_upper_map:
same_upper_map[upper] = [code]
else:
same_upper_map[upper].append(code)
flags = 0
# we combine whitespace and line terminators because in practice we don't need them separated
if category == 'Zs' or code in whitespace or code in line_terminator:
flags |= FLAG_SPACE
@@ -298,8 +325,6 @@ def process_unicode_data(unicode_data, derived_core_properties):
elif code in id_continue or code in compatibility_identifier_part:
flags |= FLAG_UNICODE_ID_CONTINUE_ONLY
test_table[code] = (upper, lower, name, alias)
up_d = upper - code
low_d = lower - code
@@ -319,12 +344,12 @@ def process_unicode_data(unicode_data, derived_core_properties):
index[code] = i
for code in range(0, MAX_BMP + 1):
entry = test_table.get(code)
entry = codepoint_table.get(code)
if not entry:
continue
(upper, lower, name, alias) = entry
(upper, _, _, _) = entry
if upper not in same_upper_map:
continue
@@ -354,7 +379,7 @@ def process_unicode_data(unicode_data, derived_core_properties):
non_bmp_lower_map, non_bmp_upper_map,
non_bmp_space_set,
non_bmp_id_start_set, non_bmp_id_cont_set,
test_table, test_space_table,
codepoint_table, test_space_table,
)
def process_case_folding(case_folding):
@@ -438,9 +463,149 @@ def process_case_folding(case_folding):
folding_tests
)
def process_special_casing(special_casing, table, index):
    """
    Collects the special case mappings read from |special_casing| and checks
    that they still match the assumptions hard-coded elsewhere.

    |table| and |index| are the character info table and its index: the entry
    |table[index[code]]| is unpacked as (upper, lower, flags), where |upper|
    and |lower| are deltas that are added to |code| modulo 0x10000.

    Each record yielded by |read_special_casing| is a tuple
    (code, lower, upper, languages, contexts); |lower| and |upper| are
    sequences of code units.

    Returns the pair (unconditional_tolower, unconditional_toupper), two dicts
    mapping a code unit to its multi-character (or otherwise non-default)
    mapping. The conditional and language dependent mappings are only
    validated here, they are not returned.
    """

    # Unconditional special casing.
    unconditional_tolower = {}
    unconditional_toupper = {}

    # Conditional special casing, language independent.
    conditional_tolower = {}
    conditional_toupper = {}

    # Conditional special casing, language dependent.
    lang_conditional_tolower = {}
    lang_conditional_toupper = {}

    def caseInfo(code):
        """ Returns the simple (lower, upper) case mappings of |code|. """
        (upper, lower, flags) = table[index[code]]
        return ((code + lower) & 0xffff, (code + upper) & 0xffff)

    for (code, lower, upper, languages, contexts) in read_special_casing(special_casing):
        assert code <= MAX_BMP, 'Unexpected character outside of BMP: %s' % code
        assert len(languages) <= 1, 'Expected zero or one language ids: %s' % languages
        # Report the offending contexts (not the languages) on failure.
        assert len(contexts) <= 1, 'Expected zero or one casing contexts: %s' % contexts

        (default_lower, default_upper) = caseInfo(code)
        special_lower = len(lower) != 1 or lower[0] != default_lower
        special_upper = len(upper) != 1 or upper[0] != default_upper

        # Invariant: If |code| has casing per UnicodeData.txt, then it also has
        # casing rules in SpecialCasing.txt.
        assert code == default_lower or len(lower) != 1 or code != lower[0]
        assert code == default_upper or len(upper) != 1 or code != upper[0]

        language = languages[0] if languages else None
        context = contexts[0] if contexts else None

        if not language and not context:
            if special_lower:
                unconditional_tolower[code] = lower
            if special_upper:
                unconditional_toupper[code] = upper
        elif not language and context:
            if special_lower:
                conditional_tolower[code] = (lower, context)
            if special_upper:
                conditional_toupper[code] = (upper, context)
        else:
            # Both per-language dicts are created together so that they
            # always share the same set of language keys.
            if language not in lang_conditional_tolower:
                lang_conditional_tolower[language] = {}
                lang_conditional_toupper[language] = {}
            if special_lower:
                lang_conditional_tolower[language][code] = (lower, context)
            if special_upper:
                lang_conditional_toupper[language][code] = (upper, context)

    # Certain special casing rules are inlined in jsstr.cpp, ensure these cases
    # still match the current SpecialCasing.txt file.

    def lowerCase(code):
        (lower, _) = caseInfo(code)
        return lower

    def upperCase(code):
        (_, upper) = caseInfo(code)
        return upper

    def ascii(char_dict):
        return ifilter(lambda ch: ch <= 0x7f, char_dict.iterkeys())

    def latin1(char_dict):
        return ifilter(lambda ch: ch <= 0xff, char_dict.iterkeys())

    def is_empty(iterable):
        return not any(True for _ in iterable)

    def is_equals(iter1, iter2):
        # izip_longest pads the shorter input with None, so inputs of
        # different lengths compare as unequal.
        return all(x == y for (x, y) in izip_longest(iter1, iter2))

    # Ensure no ASCII characters have special case mappings.
    assert is_empty(ascii(unconditional_tolower))
    assert is_empty(ascii(unconditional_toupper))
    assert is_empty(ascii(conditional_tolower))
    assert is_empty(ascii(conditional_toupper))

    # Ensure no Latin1 characters have special lower case mappings.
    assert is_empty(latin1(unconditional_tolower))
    assert is_empty(latin1(conditional_tolower))

    # Ensure no Latin1 characters have conditional special upper case mappings.
    assert is_empty(latin1(conditional_toupper))

    # Ensure U+00DF is the only Latin1 character with a special upper case mapping.
    assert is_equals([0x00DF], latin1(unconditional_toupper))

    # Ensure U+0130 is the only character with a special lower case mapping.
    assert is_equals([0x0130], unconditional_tolower)

    # Ensure no characters have language independent conditional upper case mappings.
    assert is_empty(conditional_toupper)

    # Ensure U+03A3 is the only character with language independent conditional lower case mapping.
    assert is_equals([0x03A3], conditional_tolower)

    # Verify U+0130 and U+03A3 have simple lower case mappings.
    assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])

    # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.iterkeys()))
    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.iterkeys()))

    # Maximum case mapping length is three characters.
    itervals = lambda d: d.itervalues()
    assert max(imap(len, chain(
        itervals(unconditional_tolower),
        itervals(unconditional_toupper),
        imap(itemgetter(0), itervals(conditional_tolower)),
        imap(itemgetter(0), itervals(conditional_toupper)),
        imap(itemgetter(0), chain.from_iterable(imap(itervals, itervals(lang_conditional_tolower)))),
        imap(itemgetter(0), chain.from_iterable(imap(itervals, itervals(lang_conditional_toupper)))),
    ))) <= 3

    # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
    assert set([
        'After_I', 'After_Soft_Dotted', 'Final_Sigma', 'More_Above', 'Not_Before_Dot',
    ]).issuperset(set(ifilter(partial(is_not, None), chain(
        imap(itemgetter(1), itervals(conditional_tolower)),
        imap(itemgetter(1), itervals(conditional_toupper)),
        imap(itemgetter(1), chain.from_iterable(imap(itervals, itervals(lang_conditional_tolower)))),
        imap(itemgetter(1), chain.from_iterable(imap(itervals, itervals(lang_conditional_toupper)))),
    ))))

    # Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
    assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [0x0053, 0x0053]

    # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
    assert unconditional_tolower[0x0130] == [0x0069, 0x0307]

    # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
    assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == ([0x03C2], 'Final_Sigma')

    return (unconditional_tolower, unconditional_toupper)
def make_non_bmp_file(version,
non_bmp_lower_map, non_bmp_upper_map,
non_bmp_folding_map, non_bmp_rev_folding_map):
non_bmp_folding_map, non_bmp_rev_folding_map,
codepoint_table):
file_name = 'UnicodeNonBMP.h';
with io.open(file_name, mode='wb') as non_bmp_file:
non_bmp_file.write(mpl_license)
@@ -463,77 +628,277 @@ def make_non_bmp_file(version,
""")
make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map)
make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map, codepoint_table)
non_bmp_file.write('\n')
make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map)
make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map, codepoint_table)
non_bmp_file.write('\n')
make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map)
make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map, codepoint_table)
non_bmp_file.write('\n')
make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map)
make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map, codepoint_table)
non_bmp_file.write("""
#endif /* vm_UnicodeNonBMP_h */
""")
def make_bmp_mapping_test(version, test_table):
def write_special_casing_methods(unconditional_toupper, codepoint_table, println):
    """
    Emits, through |println|, the C++ definitions of
    js::unicode::CanUpperCaseSpecialCasing, js::unicode::LengthUpperCaseSpecialCasing
    and js::unicode::AppendUpperCaseSpecialCasing.

    |unconditional_toupper| maps a code unit to the sequence of code units of
    its special upper case mapping. |codepoint_table| is only used to look up
    character names for the generated comments. |println| is called with one
    or more string arguments per output line.
    """

    def hexlit(n):
        """ Returns C++ hex-literal for |n|. """
        return '0x{:04X}'.format(n)

    def describe_range(ranges, depth):
        """ Emits one comment line per (start, end) pair in |ranges|, naming its characters. """
        # NOTE(review): the indentation unit reads as a single space here;
        # confirm it matches the indentation wanted in the generated C++.
        indent = depth * ' '
        for (start, end) in ranges:
            if start == end:
                println(indent, '// {}'.format(codepoint_table.full_name(start)))
            else:
                println(indent, '// {} .. {}'.format(codepoint_table.full_name(start),
                                                     codepoint_table.full_name(end)))

    def out_range(start, end):
        """ Tests if the input character isn't a member of the set {x | start <= x <= end}. """
        if (start == end):
            return 'ch != {}'.format(hexlit(start))
        return 'ch < {} || ch > {}'.format(hexlit(start), hexlit(end))

    def in_range(start, end, parenthesize=False):
        """ Tests if the input character is in the set {x | start <= x <= end}. """
        if (start == end):
            return 'ch == {}'.format(hexlit(start))
        (left, right) = ('(', ')') if parenthesize else ('', '')
        return '{}ch >= {} && ch <= {}{}'.format(left, hexlit(start), hexlit(end), right)

    def in_any_range(ranges, spaces):
        """ Tests if the input character is included in any of the given ranges. """
        # |lines| is a list of output lines, each a list of sub-expressions.
        # A new line is started once the joined text of the current line would
        # reach the limit computed below from 100, |spaces| and ' ||'.
        lines = [[]]
        for (start, end) in ranges:
            expr = in_range(start, end, parenthesize=True)
            line = ' || '.join(lines[-1] + [expr])
            if len(line) < (100 - len(spaces) - len(' ||')):
                lines[-1].append(expr)
            else:
                lines.append([expr])
        # Continuation lines are prefixed with |spaces|.
        return ' ||\n{}'.format(spaces).join(imap(lambda t: ' || '.join(t), lines))

    def write_range_accept(parent_list, child_list, depth):
        """ Accepts the input character if it matches any code unit in |child_list|. """
        # Both lists are indexed at [0] and [-1] for their minimum and maximum,
        # so callers must pass them non-empty and in ascending order.
        (min_parent, max_parent) = (parent_list[0], parent_list[-1])
        (min_child, max_child) = (child_list[0], child_list[-1])
        assert min_child >= min_parent
        assert max_child <= max_parent
        indent = depth * ' '

        child_ranges = list(int_ranges(child_list))
        # True when |parent_list| extends beyond |max_child|, i.e. more tests
        # will be emitted after this one.
        has_successor = max_child != max_parent

        # If |child_list| is a contiguous list of code units, emit a simple
        # range check: |min_child <= input <= max_child|.
        if len(child_ranges) == 1:
            describe_range(child_ranges, depth)
            if has_successor:
                println(indent, 'if (ch <= {})'.format(hexlit(max_child)))
                println(indent, ' return ch >= {};'.format(hexlit(min_child)))
            else:
                println(indent, 'return {};'.format(in_range(min_child, max_child)))
            return

        # Otherwise create a disjunction over the subranges in |child_ranges|.
        # |spaces| is the prefix for continuation lines of the disjunction.
        if not has_successor:
            spaces = indent + len('return ') * ' '
        else:
            spaces = indent + len(' return ') * ' '
        range_test_expr = in_any_range(child_ranges, spaces)

        if min_child != min_parent:
            println(indent, 'if (ch < {})'.format(hexlit(min_child)))
            println(indent, ' return false;')

        # If there's no successor block, we can omit the |input <= max_child| check,
        # because it was already checked when we emitted the parent range test.
        if not has_successor:
            describe_range(child_ranges, depth)
            println(indent, 'return {};'.format(range_test_expr))
        else:
            println(indent, 'if (ch <= {}) {{'.format(hexlit(max_child)))
            describe_range(child_ranges, depth + 1)
            println(indent, ' return {};'.format(range_test_expr))
            println(indent, '}')

    def write_CanUpperCaseSpecialCasing():
        """ Checks if the input has a special upper case mapping. """
        println('bool')
        println('js::unicode::CanUpperCaseSpecialCasing(char16_t ch)')
        println('{')

        # The code below indexes |code_list[0]|, so the mapping must be non-empty.
        assert unconditional_toupper, "|unconditional_toupper| is not empty"

        # Sorted list of code units with special upper case mappings.
        code_list = sorted(unconditional_toupper.iterkeys())

        # Fail-fast if the input character isn't a special casing character.
        println(' if ({})'.format(out_range(code_list[0], code_list[-1])))
        println(' return false;')

        # Walk the 16-bit code unit space in 16 blocks of 0x1000 code units.
        for i in range(0, 16):
            # Check if the input characters is in the range:
            # |start_point <= input < end_point|.
            start_point = i << 12
            end_point = (i + 1) << 12
            matches = [cu for cu in code_list if start_point <= cu < end_point]

            # Skip empty ranges.
            if not matches:
                continue

            # If |matches| consists of only a few characters, directly check
            # the input against the characters in |matches|.
            if len(matches) <= 8:
                write_range_accept(code_list, matches, depth=1)
                continue

            # Otherwise split into further subranges.

            # Only enter the if-block if the input is less-or-equals to the
            # largest value in the current range.
            is_last_block = matches[-1] == code_list[-1]
            if not is_last_block:
                println(' if (ch <= {}) {{'.format(hexlit(matches[-1])))
            else:
                println(' if (ch < {})'.format(hexlit(matches[0])))
                println(' return false;')

            # Subdivide the current block into 16 sub-blocks of 0x100 code units.
            for j in range(0, 16):
                inner_start = start_point + (j << 8)
                inner_end = start_point + ((j + 1) << 8)
                inner_matches = [cu for cu in matches if inner_start <= cu < inner_end]

                if inner_matches:
                    # Tests are nested one level deeper when they sit inside
                    # the if-block opened above.
                    d = 1 if is_last_block else 2
                    write_range_accept(matches, inner_matches, depth=d)

            if not is_last_block:
                println(' }')

        println('}')

    def write_LengthUpperCaseSpecialCasing():
        """ Slow case: Special casing character was found, returns its mapping length. """
        println('size_t')
        println('js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)')
        println('{')

        println(' switch(ch) {')
        # One case label per special casing character, in ascending order.
        for (code, converted) in sorted(unconditional_toupper.iteritems(), key=itemgetter(0)):
            println(' case {}: return {}; // {}'.format(hexlit(code), len(converted),
                                                        codepoint_table.name(code)))
        println(' }')
        println('')
        println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')
        println(' return 0;')

        println('}')

    def write_AppendUpperCaseSpecialCasing():
        """ Slow case: Special casing character was found, append its mapping characters. """
        println('void')
        println('js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)')
        println('{')

        println(' switch(ch) {')
        for (code, converted) in sorted(unconditional_toupper.iteritems(), key=itemgetter(0)):
            println(' case {}: // {}'.format(hexlit(code), codepoint_table.name(code)))
            # One store per code unit of the mapping, each advancing |*index|.
            for ch in converted:
                println(' elements[(*index)++] = {}; // {}'.format(hexlit(ch),
                                                                   codepoint_table.name(ch)))
            println(' return;')
        println(' }')
        println('')
        println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')
        println(' return;')

        println('}')

    # Emit the three functions, separated by blank lines.
    write_CanUpperCaseSpecialCasing()
    println('')
    write_LengthUpperCaseSpecialCasing()
    println('')
    write_AppendUpperCaseSpecialCasing()
def make_bmp_mapping_test(version, codepoint_table, unconditional_tolower, unconditional_toupper):
    """
    Writes the generated JavaScript test that checks toUpperCase() and
    toLowerCase() for every BMP code unit.

    For each code unit the test table holds a pair of strings
    [upper, lower]. The simple mapping from |codepoint_table| is used unless
    |unconditional_toupper| / |unconditional_tolower| contains a special
    (possibly multi-character) mapping for that code unit. Code units without
    an entry in |codepoint_table| map to themselves.
    """

    def unicodeEsc(n):
        """ Returns the JavaScript backslash-u escape sequence for code unit |n|. """
        # The backslash is escaped explicitly so the literal does not depend
        # on how the Python version in use treats an incomplete escape.
        return '\\u{:04X}'.format(n)

    file_name = '../tests/ecma_5/String/string-upper-lower-mapping.js'
    with io.open(file_name, mode='wb') as output:
        write = partial(print, file=output, sep='', end='')
        println = partial(print, file=output, sep='', end='\n')

        write(warning_message)
        write(unicode_version_message.format(version))
        write(public_domain)
        println('var mapping = [')
        for code in range(0, MAX_BMP + 1):
            entry = codepoint_table.get(code)

            if entry:
                (upper, lower, _, _) = entry
                # Special mappings are already sequences; wrap the simple
                # mapping in a list so both cases are handled uniformly.
                upper = unconditional_toupper[code] if code in unconditional_toupper else [upper]
                lower = unconditional_tolower[code] if code in unconditional_tolower else [lower]
                println(' ["{}", "{}"], /* {} */'.format("".join(imap(unicodeEsc, upper)),
                                                         "".join(imap(unicodeEsc, lower)),
                                                         codepoint_table.name(code)))
            else:
                println(' ["{0}", "{0}"],'.format(unicodeEsc(code)))
        println('];')
        # The table entries are strings, so the emitted asserts compare whole
        # strings rather than a single char code.
        write("""
assertEq(mapping.length, 0x10000);
for (var i = 0; i <= 0xffff; i++) {
var char = String.fromCharCode(i);
var info = mapping[i];
assertEq(char.toUpperCase(), info[0]);
assertEq(char.toLowerCase(), info[1]);
}
if (typeof reportCompare === "function")
reportCompare(true, true);
""")
def make_non_bmp_mapping_test(version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table):
    """
    Writes the generated JavaScript test that checks toUpperCase() and
    toLowerCase() for the non-BMP code points that have a case mapping.

    |non_bmp_upper_map| and |non_bmp_lower_map| map a code point to its
    upper / lower case code point. |codepoint_table| supplies the character
    names used in the trailing comment of every emitted assertion.
    """
    file_name = '../tests/ecma_6/String/string-code-point-upper-lower-mapping.js'
    with io.open(file_name, mode='wb') as test_non_bmp_mapping:
        test_non_bmp_mapping.write(warning_message)
        test_non_bmp_mapping.write(unicode_version_message.format(version))
        test_non_bmp_mapping.write(public_domain)
        # One assertion per mapped code point, in ascending code point order.
        for code in sorted(non_bmp_upper_map.keys()):
            test_non_bmp_mapping.write("""\
assertEq(String.fromCodePoint(0x{:04X}).toUpperCase().codePointAt(0), 0x{:04X}); // {}, {}
""".format(code, non_bmp_upper_map[code],
           codepoint_table.name(code), codepoint_table.name(non_bmp_upper_map[code])))
        for code in sorted(non_bmp_lower_map.keys()):
            test_non_bmp_mapping.write("""\
assertEq(String.fromCodePoint(0x{:04X}).toLowerCase().codePointAt(0), 0x{:04X}); // {}, {}
""".format(code, non_bmp_lower_map[code],
           codepoint_table.name(code), codepoint_table.name(non_bmp_lower_map[code])))

        test_non_bmp_mapping.write("""
if (typeof reportCompare === "function")
reportCompare(true, true);
""")
def make_space_test(version, test_space_table):
def make_space_test(version, test_space_table, codepoint_table):
def hex_and_name(c):
return ' 0x{:04X} /* {} */'.format(c, codepoint_table.name(c))
file_name = '../tests/ecma_5/String/string-space-trim.js'
with io.open(file_name, mode='wb') as test_space:
test_space.write(warning_message)
test_space.write(unicode_version_message.format(version))
test_space.write(public_domain)
test_space.write('var onlySpace = String.fromCharCode(' +
', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')
test_space.write('var onlySpace = String.fromCharCode(\n')
test_space.write(',\n'.join(map(hex_and_name, test_space_table)))
test_space.write('\n);\n')
test_space.write("""
assertEq(onlySpace.trim(), "");
assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
@@ -544,7 +909,10 @@ if (typeof reportCompare === "function")
reportCompare(true, true);
""")
def make_icase_test(version, folding_tests):
def make_icase_test(version, folding_tests, codepoint_table):
def char_hex(c):
return '0x{:04X}'.format(c)
file_name = '../tests/ecma_6/RegExp/unicode-ignoreCase.js'
with io.open(file_name, mode='wb') as test_icase:
test_icase.write(warning_message)
@@ -565,7 +933,8 @@ function test(code, ...equivs) {
}
""")
for args in folding_tests:
test_icase.write('test(' + ','.join([hex(c) for c in args]) + ');\n')
test_icase.write('test({}); // {}\n'.format(', '.join(map(char_hex, args)),
', '.join(map(codepoint_table.name, args))))
test_icase.write("""
if (typeof reportCompare === "function")
reportCompare(true, true);
@@ -576,7 +945,9 @@ def make_unicode_file(version,
same_upper_table, same_upper_index,
folding_table, folding_index,
non_bmp_space_set,
non_bmp_id_start_set, non_bmp_id_cont_set):
non_bmp_id_start_set, non_bmp_id_cont_set,
unconditional_toupper,
codepoint_table):
index1, index2, shift = splitbins(index)
# Don't forget to update CharInfo in Unicode.h if you need to change this
@@ -665,8 +1036,8 @@ def make_unicode_file(version,
* stop if you found the best shift
*/
"""
def dump(data, name, file):
file.write('const uint8_t unicode::' + name + '[] = {\n')
def dump(data, name, println):
println('const uint8_t unicode::{}[] = {{'.format(name))
line = pad = ' ' * 4
lines = []
@@ -682,93 +1053,79 @@ def make_unicode_file(version,
line = line + s + ', '
lines.append(line.rstrip())
file.write('\n'.join(lines))
file.write('\n};\n')
println('\n'.join(lines))
println('};')
def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println):
    """
    Emits a C++ array named unicode::|name| with element type |data_type|,
    one braced initializer per row of |tbl|, followed by the two index
    arrays |idx1| and |idx2| (written by |dump| under the given names).
    Every section is terminated by an empty line.
    """
    header = 'const {} unicode::{}[] = {{'.format(data_type, name)
    println(header)
    for row in tbl:
        cells = ', '.join(map(str, row))
        println(' {{ {} }},'.format(cells))
    println('};')
    println('')

    # Both index arrays are written the same way, first-level index first.
    for (idx_data, idx_name) in ((idx1, idx1_name), (idx2, idx2_name)):
        dump(idx_data, idx_name, println)
        println('')
def write_supplemental_identifier_method(name, group_set, println):
    """
    Emits the C++ predicate js::unicode::|name|(uint32_t codePoint), which
    returns true when the code point lies in one of the inclusive ranges that
    |int_ranges| derives from the keys of |group_set|, and false otherwise.
    Each range test carries a comment naming its first and last character.
    """
    range_test = ' if (codePoint >= 0x{:X} && codePoint <= 0x{:X}) // {} .. {}'

    println('bool')
    println('js::unicode::{}(uint32_t codePoint)'.format(name))
    println('{')
    for (first, last) in int_ranges(group_set.keys()):
        first_name = codepoint_table.name(first)
        last_name = codepoint_table.name(last)
        println(range_test.format(first, last, first_name, last_name))
        println(' return true;')
    println(' return false;')
    println('}')
    println('')
file_name = 'Unicode.cpp'
with io.open(file_name, 'wb') as data_file:
data_file.write(warning_message)
data_file.write(unicode_version_message.format(version))
data_file.write(public_domain)
data_file.write('#include "vm/Unicode.h"\n\n')
data_file.write('using namespace js;\n')
data_file.write('using namespace js::unicode;\n')
data_file.write(comment)
data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')
for d in table:
data_file.write(' {')
data_file.write(', '.join((str(e) for e in d)))
data_file.write('},\n')
data_file.write('};\n')
data_file.write('\n')
write = partial(print, file=data_file, sep='', end='')
println = partial(print, file=data_file, sep='', end='\n')
dump(index1, 'index1', data_file)
data_file.write('\n')
dump(index2, 'index2', data_file)
data_file.write('\n')
write(warning_message)
write(unicode_version_message.format(version))
write(public_domain)
println('#include "vm/Unicode.h"')
println('')
println('using namespace js;')
println('using namespace js::unicode;')
write(comment)
data_file.write('const CodepointsWithSameUpperCaseInfo unicode::js_codepoints_with_same_upper_info[] = {\n')
for d in same_upper_table:
data_file.write(' {')
data_file.write(', '.join((str(e) for e in d)))
data_file.write('},\n')
data_file.write('};\n')
data_file.write('\n')
write_table('CharacterInfo',
'js_charinfo', table,
'index1', index1,
'index2', index2,
println)
dump(same_upper_index1, 'codepoints_with_same_upper_index1', data_file)
data_file.write('\n')
dump(same_upper_index2, 'codepoints_with_same_upper_index2', data_file)
data_file.write('\n')
write_table('CodepointsWithSameUpperCaseInfo',
'js_codepoints_with_same_upper_info', same_upper_table,
'codepoints_with_same_upper_index1', same_upper_index1,
'codepoints_with_same_upper_index2', same_upper_index2,
println)
data_file.write('const FoldingInfo unicode::js_foldinfo[] = {\n')
for d in folding_table:
data_file.write(' {')
data_file.write(', '.join((str(e) for e in d)))
data_file.write('},\n')
data_file.write('};\n')
data_file.write('\n')
dump(folding_index1, 'folding_index1', data_file)
data_file.write('\n')
dump(folding_index2, 'folding_index2', data_file)
data_file.write('\n')
write_table('FoldingInfo',
'js_foldinfo', folding_table,
'folding_index1', folding_index1,
'folding_index2', folding_index2,
println)
# If the following assert fails, it means space character is added to
# non-BMP area. In that case the following code should be uncommented
# and the corresponding code should be added to frontend.
assert len(non_bmp_space_set.keys()) == 0
data_file.write("""\
bool
js::unicode::IsIdentifierStartNonBMP(uint32_t codePoint)
{
""")
write_supplemental_identifier_method('IsIdentifierStartNonBMP', non_bmp_id_start_set,
println)
for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_start_set):
data_file.write("""\
if (codePoint >= 0x{:x} && codePoint <= 0x{:x})
return true;
""".format(from_code, to_code))
write_supplemental_identifier_method('IsIdentifierPartNonBMP', non_bmp_id_cont_set,
println)
data_file.write("""\
return false;
}
bool
js::unicode::IsIdentifierPartNonBMP(uint32_t codePoint)
{
""")
for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_cont_set):
data_file.write("""\
if (codePoint >= 0x{:x} && codePoint <= 0x{:x})
return true;
""".format(from_code, to_code))
data_file.write("""\
return false;
}
""")
write_special_casing_methods(unconditional_toupper, codepoint_table, println)
def getsize(data):
""" return smallest possible integer size for the given array """
@@ -842,10 +1199,8 @@ def splitbins(t):
def make_irregexp_tables(version,
table, index,
folding_table, folding_index,
test_table):
codepoint_table):
import string
from functools import partial
from itertools import chain, ifilter, imap
MAX_ASCII = 0x7F
MAX_LATIN1 = 0xFF
@@ -894,13 +1249,13 @@ def make_irregexp_tables(version,
def char_name(code):
assert 0 <= code and code <= MAX_BMP
if code not in test_table:
if code not in codepoint_table:
return '<Unused>'
if code == LEAD_SURROGATE_MIN:
return '<Lead Surrogate Min>'
if code == TRAIL_SURROGATE_MAX:
return '<Trail Surrogate Max>'
(_, _, name, alias) = test_table[code]
(_, _, name, alias) = codepoint_table[code]
return name if not name.startswith('<') else alias
def write_character_range(println, name, characters):
@@ -1080,7 +1435,8 @@ def update_unicode(args):
with download_or_open('UnicodeData.txt') as unicode_data, \
download_or_open('CaseFolding.txt') as case_folding, \
download_or_open('DerivedCoreProperties.txt') as derived_core_properties:
download_or_open('DerivedCoreProperties.txt') as derived_core_properties, \
download_or_open('SpecialCasing.txt') as special_casing:
unicode_version = version_from_file(derived_core_properties, 'DerivedCoreProperties')
print('Processing...')
@@ -1090,13 +1446,16 @@ def update_unicode(args):
non_bmp_lower_map, non_bmp_upper_map,
non_bmp_space_set,
non_bmp_id_start_set, non_bmp_id_cont_set,
test_table, test_space_table
codepoint_table, test_space_table
) = process_unicode_data(unicode_data, derived_core_properties)
(
folding_table, folding_index,
non_bmp_folding_map, non_bmp_rev_folding_map,
folding_tests
) = process_case_folding(case_folding)
(
unconditional_tolower, unconditional_toupper
) = process_special_casing(special_casing, table, index)
print('Generating...')
make_unicode_file(unicode_version,
@@ -1104,19 +1463,23 @@ def update_unicode(args):
same_upper_table, same_upper_index,
folding_table, folding_index,
non_bmp_space_set,
non_bmp_id_start_set, non_bmp_id_cont_set)
non_bmp_id_start_set, non_bmp_id_cont_set,
unconditional_toupper,
codepoint_table)
make_non_bmp_file(unicode_version,
non_bmp_lower_map, non_bmp_upper_map,
non_bmp_folding_map, non_bmp_rev_folding_map)
non_bmp_folding_map, non_bmp_rev_folding_map,
codepoint_table)
make_irregexp_tables(unicode_version,
table, index,
folding_table, folding_index,
test_table)
codepoint_table)
make_bmp_mapping_test(unicode_version, test_table)
make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map)
make_space_test(unicode_version, test_space_table)
make_icase_test(unicode_version, folding_tests)
make_bmp_mapping_test(unicode_version,
codepoint_table, unconditional_tolower, unconditional_toupper)
make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table)
make_space_test(unicode_version, test_space_table, codepoint_table)
make_icase_test(unicode_version, folding_tests, codepoint_table)
if __name__ == '__main__':
import argparse