mirror of
https://github.com/roytam1/UXP.git
synced 2026-05-26 13:58:49 +00:00
Issue #2259 - Reimplement String.prototype.toLocale{Lower,Upper}Case per ECMAScript Intl specification
- Update make_unicode to output SpecialCasing - Handle special casing - Use realloc instead of malloc when resizing a newly created string buffer Based-on: m-c 1318403, 1431957
This commit is contained in:
@@ -82,6 +82,7 @@ included_inclnames_to_ignore = set([
|
||||
'unicode/plurrule.h', # ICU
|
||||
'unicode/timezone.h', # ICU
|
||||
'unicode/ucal.h', # ICU
|
||||
'unicode/uchar.h', # ICU
|
||||
'unicode/uclean.h', # ICU
|
||||
'unicode/ucol.h', # ICU
|
||||
'unicode/udat.h', # ICU
|
||||
|
||||
@@ -731,6 +731,88 @@ function String_localeCompare(that) {
|
||||
return intl_CompareStrings(collator, S, That);
|
||||
}
|
||||
|
||||
/**
|
||||
* 13.1.2 String.prototype.toLocaleLowerCase ( [ locales ] )
|
||||
*
|
||||
* ES2017 Intl draft rev 94045d234762ad107a3d09bb6f7381a65f1a2f9b
|
||||
*/
|
||||
function String_toLocaleLowerCase() {
    // Step 1: reject null/undefined receivers.
    RequireObjectCoercible(this);

    // Step 2: coerce the receiver to a string.
    var input = ToString(this);

    // Steps 3-6: derive the requested locale from the optional first
    // argument. A plain string tag is validated directly, which avoids
    // building a locale list for the most common call shapes.
    var localesArg;
    if (arguments.length > 0)
        localesArg = arguments[0];

    var localeTag;
    if (typeof localesArg === "string") {
        // Steps 3, 5.
        localeTag = ValidateAndCanonicalizeLanguageTag(localesArg);
    } else if (localesArg !== undefined) {
        // Step 3.
        var canonicalList = CanonicalizeLocaleList(localesArg);

        // Steps 4-6: only the first entry of the list is relevant.
        if (canonicalList.length > 0)
            localeTag = canonicalList[0];
    }

    // Nothing to convert for the empty string. This check comes after the
    // locale handling so that an invalid locales argument still throws.
    if (input.length === 0)
        return "";

    // No locale requested: fall back to the default locale.
    if (localeTag === undefined)
        localeTag = DefaultLocale();

    // Steps 7-16.
    return intl_toLocaleLowerCase(input, localeTag);
}
|
||||
|
||||
/**
|
||||
* 13.1.3 String.prototype.toLocaleUpperCase ( [ locales ] )
|
||||
*
|
||||
* ES2017 Intl draft rev 94045d234762ad107a3d09bb6f7381a65f1a2f9b
|
||||
*/
|
||||
function String_toLocaleUpperCase() {
    // Step 1: reject null/undefined receivers.
    RequireObjectCoercible(this);

    // Step 2: coerce the receiver to a string.
    var input = ToString(this);

    // Steps 3-6: derive the requested locale from the optional first
    // argument. A plain string tag is validated directly, which avoids
    // building a locale list for the most common call shapes.
    var localesArg;
    if (arguments.length > 0)
        localesArg = arguments[0];

    var localeTag;
    if (typeof localesArg === "string") {
        // Steps 3, 5.
        localeTag = ValidateAndCanonicalizeLanguageTag(localesArg);
    } else if (localesArg !== undefined) {
        // Step 3.
        var canonicalList = CanonicalizeLocaleList(localesArg);

        // Steps 4-6: only the first entry of the list is relevant.
        if (canonicalList.length > 0)
            localeTag = canonicalList[0];
    }

    // Nothing to convert for the empty string. This check comes after the
    // locale handling so that an invalid locales argument still throws.
    if (input.length === 0)
        return "";

    // No locale requested: fall back to the default locale.
    if (localeTag === undefined)
        localeTag = DefaultLocale();

    // Steps 7-16.
    return intl_toLocaleUpperCase(input, localeTag);
}
|
||||
|
||||
/* ES6 Draft May 22, 2014 21.1.2.4 */
|
||||
function String_static_raw(callSite, ...substitutions) {
|
||||
// Step 1 (implicit).
|
||||
@@ -1014,13 +1096,15 @@ _SetCanonicalName(String_static_trimEnd, "trimEnd");
|
||||
/**
 * Static (generic) form of String.prototype.toLocaleLowerCase.
 *
 * The first argument is the value to convert and is used as the |this|
 * value of String_toLocaleLowerCase; the optional second argument is
 * forwarded as the |locales| argument. Throws a TypeError when called
 * without any argument.
 */
function String_static_toLocaleLowerCase(string) {
    if (arguments.length < 1)
        ThrowTypeError(JSMSG_MISSING_FUN_ARG, 0, 'String.toLocaleLowerCase');

    // Forward the locales argument; it must not be dropped, otherwise the
    // static form would always behave as if no locale had been requested.
    var locales = arguments.length > 1 ? arguments[1] : undefined;
    return callFunction(String_toLocaleLowerCase, string, locales);
}
|
||||
|
||||
/**
 * Static (generic) form of String.prototype.toLocaleUpperCase.
 *
 * The first argument is the value to convert and is used as the |this|
 * value of String_toLocaleUpperCase; the optional second argument is
 * forwarded as the |locales| argument. Throws a TypeError when called
 * without any argument.
 */
function String_static_toLocaleUpperCase(string) {
    if (arguments.length < 1)
        ThrowTypeError(JSMSG_MISSING_FUN_ARG, 0, 'String.toLocaleUpperCase');

    // Forward the locales argument; it must not be dropped, otherwise the
    // static form would always behave as if no locale had been requested.
    var locales = arguments.length > 1 ? arguments[1] : undefined;
    return callFunction(String_toLocaleUpperCase, string, locales);
}
|
||||
|
||||
function String_static_normalize(string) {
|
||||
|
||||
@@ -446,6 +446,64 @@ function CanonicalizeLanguageTag(locale) {
|
||||
return canonical;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns true if the input contains only ASCII alphabetical characters.
|
||||
*/
|
||||
function IsASCIIAlphaString(s) {
    assert(typeof s === "string", "IsASCIIAlphaString");

    for (var i = 0; i < s.length; i++) {
        var c = callFunction(std_String_charCodeAt, s, i);

        // Accept only 'A'-'Z' (0x41-0x5A) and 'a'-'z' (0x61-0x7A).
        if (!((0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A)))
            return false;
    }

    // Every code unit passed the check (vacuously true for the empty string).
    return true;
}
|
||||
|
||||
|
||||
/**
|
||||
* Validates and canonicalizes the given language tag.
|
||||
*/
|
||||
function ValidateAndCanonicalizeLanguageTag(locale) {
    assert(typeof locale === "string", "ValidateAndCanonicalizeLanguageTag");

    // Fast-path candidates are standalone language subtags, i.e. this
    // BCP47 subset:
    //   Language-Tag = langtag
    //   langtag      = language
    //   language     = 2*3ALPHA            ; shortest ISO 639 code
    // A three character string with "-" in the middle (for example "x-x")
    // is a private use only tag and must take the general path.
    var tagLength = locale.length;
    var isStandaloneLanguage = tagLength === 2 || (tagLength === 3 && locale[1] !== "-");

    if (!isStandaloneLanguage) {
        // General case: full structural validation and canonicalization.
        if (!IsStructurallyValidLanguageTag(locale))
            ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);

        return CanonicalizeLanguageTag(locale);
    }

    if (!IsASCIIAlphaString(locale))
        ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale);
    assert(IsStructurallyValidLanguageTag(locale), "2*3ALPHA is a valid language tag");

    // The language subtag is canonicalized to lower case.
    var lowered = callFunction(std_String_toLowerCase, locale);

    // langTagMappings doesn't contain any 2*3ALPHA keys, so there is no need
    // to look for whole-tag replacements in that map.
    assert(!callFunction(std_Object_hasOwnProperty, langTagMappings, lowered),
           "langTagMappings contains no 2*3ALPHA mappings");

    // Replace deprecated subtags with their preferred values.
    var canonical = lowered;
    if (callFunction(std_Object_hasOwnProperty, langSubtagMappings, lowered))
        canonical = langSubtagMappings[lowered];
    assert(canonical === CanonicalizeLanguageTag(canonical), "expected same canonicalization");

    return canonical;
}
|
||||
|
||||
|
||||
function localeContainsNoUnicodeExtensions(locale) {
|
||||
// No "-u-", no possible Unicode extension.
|
||||
if (callFunction(std_String_indexOf, locale, "-u-") === -1)
|
||||
|
||||
@@ -151,6 +151,10 @@ def readRegistry(registry):
|
||||
# Special case for heploc.
|
||||
langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"
|
||||
|
||||
# ValidateAndCanonicalizeLanguageTag in Intl.js expects langTagMappings
|
||||
# contains no 2*3ALPHA.
|
||||
assert all(len(lang) > 3 for lang in langTagMappings.iterkeys())
|
||||
|
||||
return {"fileDate": fileDate,
|
||||
"langTagMappings": langTagMappings,
|
||||
"langSubtagMappings": langSubtagMappings,
|
||||
|
||||
+2
-2
@@ -5327,8 +5327,8 @@ JS_ResetDefaultLocale(JSContext* cx);
|
||||
* Locale specific string conversion and error message callbacks.
|
||||
*/
|
||||
struct JSLocaleCallbacks {
|
||||
JSLocaleToUpperCase localeToUpperCase;
|
||||
JSLocaleToLowerCase localeToLowerCase;
|
||||
JSLocaleToUpperCase localeToUpperCase; // not used
|
||||
JSLocaleToLowerCase localeToLowerCase; // not used
|
||||
JSLocaleCompare localeCompare; // not used
|
||||
JSLocaleToUnicode localeToUnicode;
|
||||
};
|
||||
|
||||
@@ -365,6 +365,7 @@ struct JSContext : public js::ExclusiveContext,
|
||||
using ExclusiveContext::permanentAtoms;
|
||||
using ExclusiveContext::pod_calloc;
|
||||
using ExclusiveContext::pod_malloc;
|
||||
using ExclusiveContext::pod_realloc;
|
||||
using ExclusiveContext::staticStrings;
|
||||
using ExclusiveContext::updateMallocCounter;
|
||||
using ExclusiveContext::wellKnownSymbols;
|
||||
|
||||
+537
-126
@@ -31,10 +31,12 @@
|
||||
#include "jsutil.h"
|
||||
|
||||
#include "builtin/intl/ICUHeader.h"
|
||||
#include "builtin/intl/CommonFunctions.h"
|
||||
#include "builtin/RegExp.h"
|
||||
#include "jit/InlinableNatives.h"
|
||||
#include "js/Conversions.h"
|
||||
#include "js/UniquePtr.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/unorm2.h"
|
||||
#include "vm/GlobalObject.h"
|
||||
#include "vm/Interpreter.h"
|
||||
@@ -598,19 +600,210 @@ js::SubstringKernel(JSContext* cx, HandleString str, int32_t beginInt, int32_t l
|
||||
return NewDependentString(cx, str, begin, len);
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static auto
|
||||
ReallocChars(JSContext* cx, UniquePtr<CharT[], JS::FreePolicy> chars, size_t oldLength,
|
||||
size_t newLength)
|
||||
-> decltype(chars)
|
||||
{
|
||||
using AnyCharPtr = decltype(chars);
|
||||
|
||||
CharT* oldChars = chars.release();
|
||||
CharT* newChars = cx->pod_realloc<CharT>(oldChars, oldLength, newLength);
|
||||
if (!newChars) {
|
||||
js_free(oldChars);
|
||||
return AnyCharPtr();
|
||||
}
|
||||
|
||||
return AnyCharPtr(newChars);
|
||||
}
|
||||
|
||||
/**
|
||||
* U+03A3 GREEK CAPITAL LETTER SIGMA has two different lower case mappings
|
||||
* depending on its context:
|
||||
* When it's preceded by a cased character and not followed by another cased
|
||||
* character, its lower case form is U+03C2 GREEK SMALL LETTER FINAL SIGMA.
|
||||
* Otherwise its lower case mapping is U+03C3 GREEK SMALL LETTER SIGMA.
|
||||
*
|
||||
* Unicode 9.0, §3.13 Default Case Algorithms
|
||||
*/
|
||||
static char16_t
|
||||
Final_Sigma(const char16_t* chars, size_t length, size_t index)
|
||||
{
|
||||
MOZ_ASSERT(index < length);
|
||||
MOZ_ASSERT(chars[index] == unicode::GREEK_CAPITAL_LETTER_SIGMA);
|
||||
MOZ_ASSERT(unicode::ToLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA) ==
|
||||
unicode::GREEK_SMALL_LETTER_SIGMA);
|
||||
|
||||
// Tell the analysis the BinaryProperty.contains function pointer called by
|
||||
// u_hasBinaryProperty cannot GC.
|
||||
JS::AutoSuppressGCAnalysis nogc;
|
||||
|
||||
bool precededByCased = false;
|
||||
for (size_t i = index; i > 0; ) {
|
||||
char16_t c = chars[--i];
|
||||
uint32_t codePoint = c;
|
||||
if (unicode::IsTrailSurrogate(c) && i > 0) {
|
||||
char16_t lead = chars[i - 1];
|
||||
if (unicode::IsLeadSurrogate(lead)) {
|
||||
codePoint = unicode::UTF16Decode(lead, c);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
// Ignore any characters with the property Case_Ignorable.
|
||||
// NB: We need to skip over all Case_Ignorable characters, even when
|
||||
// they also have the Cased binary property.
|
||||
if (u_hasBinaryProperty(codePoint, UCHAR_CASE_IGNORABLE))
|
||||
continue;
|
||||
|
||||
precededByCased = u_hasBinaryProperty(codePoint, UCHAR_CASED);
|
||||
break;
|
||||
}
|
||||
if (!precededByCased)
|
||||
return unicode::GREEK_SMALL_LETTER_SIGMA;
|
||||
|
||||
bool followedByCased = false;
|
||||
for (size_t i = index + 1; i < length; ) {
|
||||
char16_t c = chars[i++];
|
||||
uint32_t codePoint = c;
|
||||
if (unicode::IsLeadSurrogate(c) && i < length) {
|
||||
char16_t trail = chars[i];
|
||||
if (unicode::IsTrailSurrogate(trail)) {
|
||||
codePoint = unicode::UTF16Decode(c, trail);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// Ignore any characters with the property Case_Ignorable.
|
||||
// NB: We need to skip over all Case_Ignorable characters, even when
|
||||
// they also have the Cased binary property.
|
||||
if (u_hasBinaryProperty(codePoint, UCHAR_CASE_IGNORABLE))
|
||||
continue;
|
||||
|
||||
followedByCased = u_hasBinaryProperty(codePoint, UCHAR_CASED);
|
||||
break;
|
||||
}
|
||||
if (!followedByCased)
|
||||
return unicode::GREEK_SMALL_LETTER_FINAL_SIGMA;
|
||||
|
||||
return unicode::GREEK_SMALL_LETTER_SIGMA;
|
||||
}
|
||||
|
||||
// Latin-1 counterpart of Final_Sigma. It exists only so the name resolves
// for Latin-1 character pointers; it asserts if it is ever called.
static Latin1Char
Final_Sigma(const Latin1Char* chars, size_t length, size_t index)
{
    MOZ_ASSERT_UNREACHABLE("U+03A3 is not a Latin-1 character");
    return 0;
}
|
||||
|
||||
// If |srcLength == destLength| is true, the destination buffer was allocated
|
||||
// with the same size as the source buffer. When we append characters which
|
||||
// have special casing mappings, we test |srcLength == destLength| to decide
|
||||
// if we need to back out and reallocate a sufficiently large destination
|
||||
// buffer. Otherwise the destination buffer was allocated with the correct
|
||||
// size to hold all lower case mapped characters, i.e.
|
||||
// |destLength == ToLowerCaseLength(srcChars, 0, srcLength)| is true.
|
||||
template <typename CharT>
|
||||
static size_t
|
||||
ToLowerCaseImpl(CharT* destChars, const CharT* srcChars, size_t startIndex, size_t srcLength,
|
||||
size_t destLength)
|
||||
{
|
||||
MOZ_ASSERT(startIndex < srcLength);
|
||||
MOZ_ASSERT(srcLength <= destLength);
|
||||
MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), srcLength == destLength);
|
||||
|
||||
size_t j = startIndex;
|
||||
for (size_t i = startIndex; i < srcLength; i++) {
|
||||
char16_t c = srcChars[i];
|
||||
if (!IsSame<CharT, Latin1Char>::value) {
|
||||
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
|
||||
char16_t trail = srcChars[i + 1];
|
||||
if (unicode::IsTrailSurrogate(trail)) {
|
||||
trail = unicode::ToLowerCaseNonBMPTrail(c, trail);
|
||||
destChars[j++] = c;
|
||||
destChars[j++] = trail;
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Special case: U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
// lowercases to <U+0069 U+0307>.
|
||||
if (c == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
|
||||
// Return if the output buffer is too small.
|
||||
if (srcLength == destLength)
|
||||
return i;
|
||||
|
||||
destChars[j++] = CharT('i');
|
||||
destChars[j++] = CharT(unicode::COMBINING_DOT_ABOVE);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Special case: U+03A3 GREEK CAPITAL LETTER SIGMA lowercases to
|
||||
// one of two codepoints depending on context.
|
||||
if (c == unicode::GREEK_CAPITAL_LETTER_SIGMA) {
|
||||
destChars[j++] = Final_Sigma(srcChars, srcLength, i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
c = unicode::ToLowerCase(c);
|
||||
MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
|
||||
destChars[j++] = c;
|
||||
}
|
||||
|
||||
MOZ_ASSERT(j == destLength);
|
||||
destChars[destLength] = '\0';
|
||||
|
||||
return srcLength;
|
||||
}
|
||||
|
||||
// Compute the length of the lower case form of |chars[0..length)|, counting
// one-to-many mappings only in |chars[startIndex..length)|.
static size_t
ToLowerCaseLength(const char16_t* chars, size_t startIndex, size_t length)
{
    // U+0130 is lowercased to the two-element sequence <U+0069 U+0307>, so
    // every occurrence contributes one additional character.
    size_t extraChars = 0;
    size_t pos = startIndex;
    while (pos < length) {
        if (chars[pos] == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
            extraChars++;
        pos++;
    }
    return length + extraChars;
}

// Latin-1 counterpart; it exists only so the name resolves for Latin-1
// character pointers and asserts if it is ever called.
static size_t
ToLowerCaseLength(const Latin1Char* chars, size_t startIndex, size_t length)
{
    MOZ_ASSERT_UNREACHABLE("never called for Latin-1 strings");
    return 0;
}
|
||||
|
||||
template <typename CharT>
|
||||
static JSString*
|
||||
ToLowerCase(JSContext* cx, JSLinearString* str)
|
||||
{
|
||||
// Unlike toUpperCase, toLowerCase has the nice invariant that if the input
|
||||
// is a Latin1 string, the output is also a Latin1 string.
|
||||
UniquePtr<CharT[], JS::FreePolicy> newChars;
|
||||
size_t length = str->length();
|
||||
// Unlike toUpperCase, toLowerCase has the nice invariant that if the
|
||||
// input is a Latin-1 string, the output is also a Latin-1 string.
|
||||
using AnyCharPtr = UniquePtr<CharT[], JS::FreePolicy>;
|
||||
|
||||
AnyCharPtr newChars;
|
||||
const size_t length = str->length();
|
||||
size_t resultLength;
|
||||
{
|
||||
AutoCheckCannotGC nogc;
|
||||
const CharT* chars = str->chars<CharT>(nogc);
|
||||
|
||||
// Look for the first upper case character.
|
||||
// We don't need extra special casing checks in the loop below,
|
||||
// because U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+03A3
|
||||
// GREEK CAPITAL LETTER SIGMA already have simple lower case mappings.
|
||||
MOZ_ASSERT(unicode::CanLowerCase(unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE),
|
||||
"U+0130 has a simple lower case mapping");
|
||||
MOZ_ASSERT(unicode::CanLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA),
|
||||
"U+03A3 has a simple lower case mapping");
|
||||
|
||||
// Look for the first character that changes when lowercased.
|
||||
size_t i = 0;
|
||||
for (; i < length; i++) {
|
||||
char16_t c = chars[i];
|
||||
@@ -630,40 +823,35 @@ ToLowerCase(JSContext* cx, JSLinearString* str)
|
||||
break;
|
||||
}
|
||||
|
||||
// If all characters are lower case, return the input string.
|
||||
// If no character needs to change, return the input string.
|
||||
if (i == length)
|
||||
return str;
|
||||
|
||||
newChars = cx->make_pod_array<CharT>(length + 1);
|
||||
resultLength = length;
|
||||
newChars = cx->make_pod_array<CharT>(resultLength + 1);
|
||||
if (!newChars)
|
||||
return nullptr;
|
||||
|
||||
PodCopy(newChars.get(), chars, i);
|
||||
|
||||
for (; i < length; i++) {
|
||||
char16_t c = chars[i];
|
||||
if (!IsSame<CharT, Latin1Char>::value) {
|
||||
if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
|
||||
char16_t trail = chars[i + 1];
|
||||
if (unicode::IsTrailSurrogate(trail)) {
|
||||
trail = unicode::ToLowerCaseNonBMPTrail(c, trail);
|
||||
newChars[i] = c;
|
||||
newChars[i + 1] = trail;
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
size_t readChars = ToLowerCaseImpl(newChars.get(), chars, i, length, resultLength);
|
||||
if (readChars < length) {
|
||||
MOZ_ASSERT((!IsSame<CharT, Latin1Char>::value),
|
||||
"Latin-1 strings don't have special lower case mappings");
|
||||
resultLength = ToLowerCaseLength(chars, readChars, length);
|
||||
|
||||
c = unicode::ToLowerCase(c);
|
||||
MOZ_ASSERT_IF((IsSame<CharT, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
|
||||
newChars[i] = c;
|
||||
AnyCharPtr buf = ReallocChars(cx, Move(newChars), length + 1, resultLength + 1);
|
||||
if (!buf)
|
||||
return nullptr;
|
||||
|
||||
newChars = Move(buf);
|
||||
|
||||
MOZ_ALWAYS_TRUE(length ==
|
||||
ToLowerCaseImpl(newChars.get(), chars, readChars, length, resultLength));
|
||||
}
|
||||
|
||||
newChars[length] = 0;
|
||||
}
|
||||
|
||||
JSString* res = NewStringDontDeflate<CanGC>(cx, newChars.get(), length);
|
||||
JSString* res = NewStringDontDeflate<CanGC>(cx, newChars.get(), resultLength);
|
||||
if (!res)
|
||||
return nullptr;
|
||||
|
||||
@@ -671,21 +859,102 @@ ToLowerCase(JSContext* cx, JSLinearString* str)
|
||||
return res;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
ToLowerCaseHelper(JSContext* cx, const CallArgs& args)
|
||||
// Lower case |string| with the language independent mapping, dispatching on
// the string's character encoding. Returns whatever the selected ToLowerCase
// instantiation returns.
JSString*
js::StringToLowerCase(JSContext* cx, HandleLinearString string)
{
    return string->hasLatin1Chars()
           ? ToLowerCase<Latin1Char>(cx, string)
           : ToLowerCase<char16_t>(cx, string);
}
|
||||
|
||||
bool
|
||||
js::str_toLowerCase(JSContext* cx, unsigned argc, Value* vp)
|
||||
{
|
||||
CallArgs args = CallArgsFromVp(argc, vp);
|
||||
|
||||
RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
JSLinearString* linear = str->ensureLinear(cx);
|
||||
RootedLinearString linear(cx, str->ensureLinear(cx));
|
||||
if (!linear)
|
||||
return false;
|
||||
|
||||
if (linear->hasLatin1Chars())
|
||||
str = ToLowerCase<Latin1Char>(cx, linear);
|
||||
else
|
||||
str = ToLowerCase<char16_t>(cx, linear);
|
||||
JSString* result = StringToLowerCase(cx, linear);
|
||||
if (!result)
|
||||
return false;
|
||||
|
||||
args.rval().setString(result);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char*
|
||||
CaseMappingLocale(JSContext* cx, JSString* str)
|
||||
{
|
||||
JSLinearString* locale = str->ensureLinear(cx);
|
||||
if (!locale)
|
||||
return nullptr;
|
||||
|
||||
MOZ_ASSERT(locale->length() >= 2, "locale is a valid language tag");
|
||||
|
||||
// Lithuanian, Turkish, and Azeri have language dependent case mappings.
|
||||
static const char languagesWithSpecialCasing[][3] = { "lt", "tr", "az" };
|
||||
|
||||
// All strings in |languagesWithSpecialCasing| are of length two, so we
|
||||
// only need to compare the first two characters to find a matching locale.
|
||||
// ES2017 Intl, §9.2.2 BestAvailableLocale
|
||||
if (locale->length() == 2 || locale->latin1OrTwoByteChar(2) == '-') {
|
||||
for (const auto& language : languagesWithSpecialCasing) {
|
||||
if (locale->latin1OrTwoByteChar(0) == language[0] &&
|
||||
locale->latin1OrTwoByteChar(1) == language[1])
|
||||
{
|
||||
return language;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ""; // ICU root locale
|
||||
}
|
||||
|
||||
bool
|
||||
js::intl_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp)
|
||||
{
|
||||
CallArgs args = CallArgsFromVp(argc, vp);
|
||||
MOZ_ASSERT(args.length() == 2);
|
||||
MOZ_ASSERT(args[0].isString());
|
||||
MOZ_ASSERT(args[1].isString());
|
||||
|
||||
RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx));
|
||||
if (!linear)
|
||||
return false;
|
||||
|
||||
const char* locale = CaseMappingLocale(cx, args[1].toString());
|
||||
if (!locale)
|
||||
return false;
|
||||
|
||||
// Call String.prototype.toLowerCase() for language independent casing.
|
||||
if (intl::StringsAreEqual(locale, "")) {
|
||||
JSString* str = StringToLowerCase(cx, linear);
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
args.rval().setString(str);
|
||||
return true;
|
||||
}
|
||||
|
||||
AutoStableStringChars inputChars(cx);
|
||||
if (!inputChars.initTwoByte(cx, linear))
|
||||
return false;
|
||||
mozilla::Range<const char16_t> input = inputChars.twoByteRange();
|
||||
|
||||
// Maximum case mapping length is three characters.
|
||||
static_assert(JSString::MAX_LENGTH < INT32_MAX / 3,
|
||||
"Case conversion doesn't overflow int32_t indices");
|
||||
|
||||
JSString* str = intl::CallICU(cx, [&input, locale](UChar* chars, int32_t size, UErrorCode* status) {
|
||||
return u_strToLower(chars, size, Char16ToUChar(input.begin().get()), input.length(),
|
||||
locale, status);
|
||||
});
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
@@ -693,82 +962,192 @@ ToLowerCaseHelper(JSContext* cx, const CallArgs& args)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
js::str_toLowerCase(JSContext* cx, unsigned argc, Value* vp)
|
||||
static inline bool
|
||||
CanUpperCaseSpecialCasing(Latin1Char charCode)
|
||||
{
|
||||
return ToLowerCaseHelper(cx, CallArgsFromVp(argc, vp));
|
||||
// Handle U+00DF LATIN SMALL LETTER SHARP S inline, all other Latin-1
|
||||
// characters don't have special casing rules.
|
||||
MOZ_ASSERT_IF(charCode != unicode::LATIN_SMALL_LETTER_SHARP_S,
|
||||
!unicode::CanUpperCaseSpecialCasing(charCode));
|
||||
|
||||
return charCode == unicode::LATIN_SMALL_LETTER_SHARP_S;
|
||||
}
|
||||
|
||||
bool
|
||||
js::str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp)
|
||||
static inline bool
|
||||
CanUpperCaseSpecialCasing(char16_t charCode)
|
||||
{
|
||||
CallArgs args = CallArgsFromVp(argc, vp);
|
||||
|
||||
/*
|
||||
* Forcefully ignore the first (or any) argument and return toLowerCase(),
|
||||
* ECMA has reserved that argument, presumably for defining the locale.
|
||||
*/
|
||||
if (cx->runtime()->localeCallbacks && cx->runtime()->localeCallbacks->localeToLowerCase) {
|
||||
RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
RootedValue result(cx);
|
||||
if (!cx->runtime()->localeCallbacks->localeToLowerCase(cx, str, &result))
|
||||
return false;
|
||||
|
||||
args.rval().set(result);
|
||||
return true;
|
||||
}
|
||||
|
||||
return ToLowerCaseHelper(cx, args);
|
||||
return unicode::CanUpperCaseSpecialCasing(charCode);
|
||||
}
|
||||
|
||||
// Number of characters in the special upper case mapping of the Latin-1
// character |charCode|.
static inline size_t
LengthUpperCaseSpecialCasing(Latin1Char charCode)
{
    // Only U+00DF LATIN SMALL LETTER SHARP S is expected here, and it is
    // uppercased to two 'S'.
    MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);

    const size_t sharpSUpperLength = 2;
    return sharpSUpperLength;
}

// Number of characters in the special upper case mapping of the two-byte
// character |charCode|; defers to the unicode:: helper of the same name.
static inline size_t
LengthUpperCaseSpecialCasing(char16_t charCode)
{
    MOZ_ASSERT(CanUpperCaseSpecialCasing(charCode));

    return unicode::LengthUpperCaseSpecialCasing(charCode);
}
|
||||
|
||||
static inline void
|
||||
AppendUpperCaseSpecialCasing(char16_t charCode, Latin1Char* elements, size_t* index)
|
||||
{
|
||||
// U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'.
|
||||
MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);
|
||||
static_assert('S' <= JSString::MAX_LATIN1_CHAR, "'S' is a Latin-1 character");
|
||||
|
||||
elements[(*index)++] = 'S';
|
||||
elements[(*index)++] = 'S';
|
||||
}
|
||||
|
||||
static inline void
|
||||
AppendUpperCaseSpecialCasing(char16_t charCode, char16_t* elements, size_t* index)
|
||||
{
|
||||
unicode::AppendUpperCaseSpecialCasing(charCode, elements, index);
|
||||
}
|
||||
|
||||
// See ToLowerCaseImpl for an explanation of the parameters.
|
||||
template <typename DestChar, typename SrcChar>
|
||||
static void
|
||||
ToUpperCaseImpl(DestChar* destChars, const SrcChar* srcChars, size_t firstLowerCase, size_t length)
|
||||
static size_t
|
||||
ToUpperCaseImpl(DestChar* destChars, const SrcChar* srcChars, size_t startIndex, size_t srcLength,
|
||||
size_t destLength)
|
||||
{
|
||||
MOZ_ASSERT(firstLowerCase < length);
|
||||
static_assert(IsSame<SrcChar, Latin1Char>::value || !IsSame<DestChar, Latin1Char>::value,
|
||||
"cannot write non-Latin-1 characters into Latin-1 string");
|
||||
MOZ_ASSERT(startIndex < srcLength);
|
||||
MOZ_ASSERT(srcLength <= destLength);
|
||||
|
||||
for (size_t i = 0; i < firstLowerCase; i++)
|
||||
destChars[i] = srcChars[i];
|
||||
|
||||
for (size_t i = firstLowerCase; i < length; i++) {
|
||||
size_t j = startIndex;
|
||||
for (size_t i = startIndex; i < srcLength; i++) {
|
||||
char16_t c = srcChars[i];
|
||||
if (!IsSame<DestChar, Latin1Char>::value) {
|
||||
if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
|
||||
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
|
||||
char16_t trail = srcChars[i + 1];
|
||||
if (unicode::IsTrailSurrogate(trail)) {
|
||||
trail = unicode::ToUpperCaseNonBMPTrail(c, trail);
|
||||
destChars[i] = c;
|
||||
destChars[i + 1] = trail;
|
||||
destChars[j++] = c;
|
||||
destChars[j++] = trail;
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (MOZ_UNLIKELY(c > 0x7f && CanUpperCaseSpecialCasing(static_cast<SrcChar>(c)))) {
|
||||
// Return if the output buffer is too small.
|
||||
if (srcLength == destLength)
|
||||
return i;
|
||||
|
||||
AppendUpperCaseSpecialCasing(c, destChars, &j);
|
||||
continue;
|
||||
}
|
||||
|
||||
c = unicode::ToUpperCase(c);
|
||||
MOZ_ASSERT_IF((IsSame<DestChar, Latin1Char>::value), c <= JSString::MAX_LATIN1_CHAR);
|
||||
destChars[i] = c;
|
||||
destChars[j++] = c;
|
||||
}
|
||||
|
||||
destChars[length] = '\0';
|
||||
MOZ_ASSERT(j == destLength);
|
||||
destChars[destLength] = '\0';
|
||||
|
||||
return srcLength;
|
||||
}
|
||||
|
||||
// Explicit instantiation so we don't hit the static_assert from above.
|
||||
static bool
|
||||
ToUpperCaseImpl(Latin1Char* destChars, const char16_t* srcChars, size_t startIndex,
|
||||
size_t srcLength, size_t destLength)
|
||||
{
|
||||
MOZ_ASSERT_UNREACHABLE("cannot write non-Latin-1 characters into Latin-1 string");
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static size_t
|
||||
ToUpperCaseLength(const CharT* chars, size_t startIndex, size_t length)
|
||||
{
|
||||
size_t upperLength = length;
|
||||
for (size_t i = startIndex; i < length; i++) {
|
||||
char16_t c = chars[i];
|
||||
|
||||
if (c > 0x7f && CanUpperCaseSpecialCasing(static_cast<CharT>(c)))
|
||||
upperLength += LengthUpperCaseSpecialCasing(static_cast<CharT>(c)) - 1;
|
||||
}
|
||||
return upperLength;
|
||||
}
|
||||
|
||||
template <typename DestChar, typename SrcChar>
|
||||
static inline void
|
||||
CopyChars(DestChar* destChars, const SrcChar* srcChars, size_t length)
|
||||
{
|
||||
static_assert(!IsSame<DestChar, SrcChar>::value, "PodCopy is used for the same type case");
|
||||
for (size_t i = 0; i < length; i++)
|
||||
destChars[i] = srcChars[i];
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static inline void
|
||||
CopyChars(CharT* destChars, const CharT* srcChars, size_t length)
|
||||
{
|
||||
PodCopy(destChars, srcChars, length);
|
||||
}
|
||||
|
||||
template <typename DestChar, typename SrcChar>
|
||||
static inline UniquePtr<DestChar[], JS::FreePolicy>
|
||||
ToUpperCase(JSContext* cx, const SrcChar* chars, size_t startIndex, size_t length,
|
||||
size_t* resultLength)
|
||||
{
|
||||
MOZ_ASSERT(startIndex < length);
|
||||
|
||||
using DestCharPtr = UniquePtr<DestChar[], JS::FreePolicy>;
|
||||
|
||||
*resultLength = length;
|
||||
DestCharPtr buf = cx->make_pod_array<DestChar>(length + 1);
|
||||
if (!buf)
|
||||
return buf;
|
||||
|
||||
CopyChars(buf.get(), chars, startIndex);
|
||||
|
||||
size_t readChars = ToUpperCaseImpl(buf.get(), chars, startIndex, length, length);
|
||||
if (readChars < length) {
|
||||
size_t actualLength = ToUpperCaseLength(chars, readChars, length);
|
||||
|
||||
*resultLength = actualLength;
|
||||
DestCharPtr buf2 = ReallocChars(cx, Move(buf), length + 1, actualLength + 1);
|
||||
if (!buf2)
|
||||
return buf2;
|
||||
|
||||
buf = Move(buf2);
|
||||
|
||||
MOZ_ALWAYS_TRUE(length ==
|
||||
ToUpperCaseImpl(buf.get(), chars, readChars, length, actualLength));
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static JSString*
|
||||
ToUpperCase(JSContext* cx, JSLinearString* str)
|
||||
{
|
||||
typedef UniquePtr<Latin1Char[], JS::FreePolicy> Latin1CharPtr;
|
||||
typedef UniquePtr<char16_t[], JS::FreePolicy> TwoByteCharPtr;
|
||||
using Latin1CharPtr = UniquePtr<Latin1Char[], JS::FreePolicy>;
|
||||
using TwoByteCharPtr = UniquePtr<char16_t[], JS::FreePolicy>;
|
||||
|
||||
mozilla::MaybeOneOf<Latin1CharPtr, TwoByteCharPtr> newChars;
|
||||
size_t length = str->length();
|
||||
const size_t length = str->length();
|
||||
size_t resultLength;
|
||||
{
|
||||
AutoCheckCannotGC nogc;
|
||||
const CharT* chars = str->chars<CharT>(nogc);
|
||||
|
||||
// Look for the first lower case character.
|
||||
// Look for the first character that changes when uppercased.
|
||||
size_t i = 0;
|
||||
for (; i < length; i++) {
|
||||
char16_t c = chars[i];
|
||||
@@ -786,21 +1165,33 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
|
||||
}
|
||||
if (unicode::CanUpperCase(c))
|
||||
break;
|
||||
if (MOZ_UNLIKELY(c > 0x7f && CanUpperCaseSpecialCasing(static_cast<CharT>(c))))
|
||||
break;
|
||||
}
|
||||
|
||||
// If all characters are upper case, return the input string.
|
||||
// If no character needs to change, return the input string.
|
||||
if (i == length)
|
||||
return str;
|
||||
|
||||
// If the string is Latin1, check if it contains the MICRO SIGN (0xb5)
|
||||
// or SMALL LETTER Y WITH DIAERESIS (0xff) character. The corresponding
|
||||
// upper case characters are not in the Latin1 range.
|
||||
// The string changes when uppercased, so we must create a new string.
|
||||
// Can it be Latin-1?
|
||||
//
|
||||
// If the original string is Latin-1, it can -- unless the string
|
||||
// contains U+00B5 MICRO SIGN or U+00FF SMALL LETTER Y WITH DIAERESIS,
|
||||
// the only Latin-1 codepoints that don't uppercase within Latin-1.
|
||||
// Search for those codepoints to decide whether the new string can be
|
||||
// Latin-1.
|
||||
// If the original string is a two-byte string, its uppercase form is
|
||||
// so rarely Latin-1 that we don't even consider creating a new
|
||||
// Latin-1 string.
|
||||
bool resultIsLatin1;
|
||||
if (IsSame<CharT, Latin1Char>::value) {
|
||||
resultIsLatin1 = true;
|
||||
for (size_t j = i; j < length; j++) {
|
||||
Latin1Char c = chars[j];
|
||||
if (c == 0xb5 || c == 0xff) {
|
||||
if (c == unicode::MICRO_SIGN ||
|
||||
c == unicode::LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)
|
||||
{
|
||||
MOZ_ASSERT(unicode::ToUpperCase(c) > JSString::MAX_LATIN1_CHAR);
|
||||
resultIsLatin1 = false;
|
||||
break;
|
||||
@@ -813,31 +1204,29 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
|
||||
}
|
||||
|
||||
if (resultIsLatin1) {
|
||||
Latin1CharPtr buf = cx->make_pod_array<Latin1Char>(length + 1);
|
||||
Latin1CharPtr buf = ToUpperCase<Latin1Char>(cx, chars, i, length, &resultLength);
|
||||
if (!buf)
|
||||
return nullptr;
|
||||
|
||||
ToUpperCaseImpl(buf.get(), chars, i, length);
|
||||
newChars.construct<Latin1CharPtr>(Move(buf));
|
||||
} else {
|
||||
TwoByteCharPtr buf = cx->make_pod_array<char16_t>(length + 1);
|
||||
TwoByteCharPtr buf = ToUpperCase<char16_t>(cx, chars, i, length, &resultLength);
|
||||
if (!buf)
|
||||
return nullptr;
|
||||
|
||||
ToUpperCaseImpl(buf.get(), chars, i, length);
|
||||
newChars.construct<TwoByteCharPtr>(Move(buf));
|
||||
}
|
||||
}
|
||||
|
||||
JSString* res;
|
||||
if (newChars.constructed<Latin1CharPtr>()) {
|
||||
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<Latin1CharPtr>().get(), length);
|
||||
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<Latin1CharPtr>().get(), resultLength);
|
||||
if (!res)
|
||||
return nullptr;
|
||||
|
||||
mozilla::Unused << newChars.ref<Latin1CharPtr>().release();
|
||||
} else {
|
||||
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<TwoByteCharPtr>().get(), length);
|
||||
res = NewStringDontDeflate<CanGC>(cx, newChars.ref<TwoByteCharPtr>().get(), resultLength);
|
||||
if (!res)
|
||||
return nullptr;
|
||||
|
||||
@@ -847,57 +1236,79 @@ ToUpperCase(JSContext* cx, JSLinearString* str)
|
||||
return res;
|
||||
}
|
||||
|
||||
static bool
|
||||
ToUpperCaseHelper(JSContext* cx, const CallArgs& args)
|
||||
JSString*
|
||||
js::StringToUpperCase(JSContext* cx, HandleLinearString string)
|
||||
{
|
||||
RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
JSLinearString* linear = str->ensureLinear(cx);
|
||||
if (!linear)
|
||||
return false;
|
||||
|
||||
if (linear->hasLatin1Chars())
|
||||
str = ToUpperCase<Latin1Char>(cx, linear);
|
||||
else
|
||||
str = ToUpperCase<char16_t>(cx, linear);
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
args.rval().setString(str);
|
||||
return true;
|
||||
if (string->hasLatin1Chars())
|
||||
return ToUpperCase<Latin1Char>(cx, string);
|
||||
return ToUpperCase<char16_t>(cx, string);
|
||||
}
|
||||
|
||||
bool
|
||||
js::str_toUpperCase(JSContext* cx, unsigned argc, Value* vp)
|
||||
{
|
||||
return ToUpperCaseHelper(cx, CallArgsFromVp(argc, vp));
|
||||
CallArgs args = CallArgsFromVp(argc, vp);
|
||||
|
||||
RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
RootedLinearString linear(cx, str->ensureLinear(cx));
|
||||
if (!linear)
|
||||
return false;
|
||||
|
||||
JSString* result = StringToUpperCase(cx, linear);
|
||||
if (!result)
|
||||
return false;
|
||||
|
||||
args.rval().setString(result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
js::str_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp)
|
||||
js::intl_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp)
|
||||
{
|
||||
CallArgs args = CallArgsFromVp(argc, vp);
|
||||
MOZ_ASSERT(args.length() == 2);
|
||||
MOZ_ASSERT(args[0].isString());
|
||||
MOZ_ASSERT(args[1].isString());
|
||||
|
||||
/*
|
||||
* Forcefully ignore the first (or any) argument and return toUpperCase(),
|
||||
* ECMA has reserved that argument, presumably for defining the locale.
|
||||
*/
|
||||
if (cx->runtime()->localeCallbacks && cx->runtime()->localeCallbacks->localeToUpperCase) {
|
||||
RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
|
||||
RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx));
|
||||
if (!linear)
|
||||
return false;
|
||||
|
||||
const char* locale = CaseMappingLocale(cx, args[1].toString());
|
||||
if (!locale)
|
||||
return false;
|
||||
|
||||
// Call String.prototype.toUpperCase() for language independent casing.
|
||||
if (intl::StringsAreEqual(locale, "")) {
|
||||
JSString* str = StringToUpperCase(cx, linear);
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
RootedValue result(cx);
|
||||
if (!cx->runtime()->localeCallbacks->localeToUpperCase(cx, str, &result))
|
||||
return false;
|
||||
|
||||
args.rval().set(result);
|
||||
args.rval().setString(str);
|
||||
return true;
|
||||
}
|
||||
|
||||
return ToUpperCaseHelper(cx, args);
|
||||
AutoStableStringChars inputChars(cx);
|
||||
if (!inputChars.initTwoByte(cx, linear))
|
||||
return false;
|
||||
mozilla::Range<const char16_t> input = inputChars.twoByteRange();
|
||||
|
||||
// Maximum case mapping length is three characters.
|
||||
static_assert(JSString::MAX_LENGTH < INT32_MAX / 3,
|
||||
"Case conversion doesn't overflow int32_t indices");
|
||||
|
||||
JSString* str = intl::CallICU(cx, [&input, locale](UChar* chars, int32_t size, UErrorCode* status) {
|
||||
return u_strToUpper(chars, size, Char16ToUChar(input.begin().get()), input.length(),
|
||||
locale, status);
|
||||
});
|
||||
if (!str)
|
||||
return false;
|
||||
|
||||
args.rval().setString(str);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* ES2017 21.1.3.12. */
|
||||
@@ -944,7 +1355,7 @@ js::str_normalize(JSContext* cx, unsigned argc, Value* vp)
|
||||
if (!linear)
|
||||
return false;
|
||||
|
||||
// Latin1 strings are already in Normalization Form C.
|
||||
// Latin-1 strings are already in Normalization Form C.
|
||||
if (form == NFC && linear->hasLatin1Chars()) {
|
||||
// Step 7.
|
||||
args.rval().setString(str);
|
||||
@@ -1359,7 +1770,7 @@ StringMatch(const TextChar* text, uint32_t textLen, const PatChar* pat, uint32_t
|
||||
/*
|
||||
* For big patterns with large potential overlap we want the SIMD-optimized
|
||||
* speed of memcmp. For small patterns, a simple loop is faster. We also can't
|
||||
* use memcmp if one of the strings is TwoByte and the other is Latin1.
|
||||
* use memcmp if one of the strings is TwoByte and the other is Latin-1.
|
||||
*
|
||||
* FIXME: Linux memcmp performance is sad and the manual loop is faster.
|
||||
*/
|
||||
@@ -1555,7 +1966,7 @@ RopeMatch(JSContext* cx, JSRope* text, JSLinearString* pat, int* match)
|
||||
* need to build the list of leaf nodes. Do both here: iterate over the
|
||||
* nodes so long as there are not too many.
|
||||
*
|
||||
* We also don't use rope matching if the rope contains both Latin1 and
|
||||
* We also don't use rope matching if the rope contains both Latin-1 and
|
||||
* TwoByte nodes, to simplify the match algorithm.
|
||||
*/
|
||||
{
|
||||
@@ -2890,8 +3301,8 @@ static const JSFunctionSpec string_methods[] = {
|
||||
JS_FN("trimStart", str_trimStart, 0,0),
|
||||
JS_FN("trimRight", str_trimEnd, 0,0),
|
||||
JS_FN("trimEnd", str_trimEnd, 0,0),
|
||||
JS_FN("toLocaleLowerCase", str_toLocaleLowerCase, 0,0),
|
||||
JS_FN("toLocaleUpperCase", str_toLocaleUpperCase, 0,0),
|
||||
JS_SELF_HOSTED_FN("toLocaleLowerCase", "String_toLocaleLowerCase", 0,0),
|
||||
JS_SELF_HOSTED_FN("toLocaleUpperCase", "String_toLocaleUpperCase", 0,0),
|
||||
JS_SELF_HOSTED_FN("localeCompare", "String_localeCompare", 1,0),
|
||||
JS_SELF_HOSTED_FN("repeat", "String_repeat", 1,0),
|
||||
JS_FN("normalize", str_normalize, 0,0),
|
||||
@@ -3000,7 +3411,7 @@ js::str_fromCharCode(JSContext* cx, unsigned argc, Value* vp)
|
||||
// string (thin or fat) and so we don't need to malloc the chars. (We could
|
||||
// cover some cases where args.length() goes up to
|
||||
// JSFatInlineString::MAX_LENGTH_LATIN1 if we also checked if the chars are
|
||||
// all Latin1, but it doesn't seem worth the effort.)
|
||||
// all Latin-1, but it doesn't seem worth the effort.)
|
||||
if (args.length() <= JSFatInlineString::MAX_LENGTH_TWO_BYTE)
|
||||
return str_fromCharCode_few_args(cx, args);
|
||||
|
||||
@@ -3143,7 +3554,7 @@ js::str_fromCodePoint(JSContext* cx, unsigned argc, Value* vp)
|
||||
// string (thin or fat) and so we don't need to malloc the chars. (We could
|
||||
// cover some cases where |args.length()| goes up to
|
||||
// JSFatInlineString::MAX_LENGTH_LATIN1 / 2 if we also checked if the chars
|
||||
// are all Latin1, but it doesn't seem worth the effort.)
|
||||
// are all Latin-1, but it doesn't seem worth the effort.)
|
||||
if (args.length() <= JSFatInlineString::MAX_LENGTH_TWO_BYTE / 2)
|
||||
return str_fromCodePoint_few_args(cx, args);
|
||||
|
||||
|
||||
+23
-4
@@ -371,11 +371,24 @@ str_trimStart(JSContext* cx, unsigned argc, Value* vp);
|
||||
extern bool
|
||||
str_trimEnd(JSContext* cx, unsigned argc, Value* vp);
|
||||
|
||||
extern bool
|
||||
str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp);
|
||||
/**
|
||||
* Returns the input string converted to lower case based on the language
|
||||
* specific case mappings for the input locale.
|
||||
*
|
||||
* Usage: lowerCase = intl_toLocaleLowerCase(string, locale)
|
||||
*/
|
||||
extern MOZ_MUST_USE bool
|
||||
intl_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp);
|
||||
|
||||
/**
|
||||
* Returns the input string converted to upper case based on the language
|
||||
* specific case mappings for the input locale.
|
||||
*
|
||||
* Usage: upperCase = intl_toLocaleUpperCase(string, locale)
|
||||
*/
|
||||
extern MOZ_MUST_USE bool
|
||||
intl_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp);
|
||||
|
||||
extern bool
|
||||
str_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp);
|
||||
|
||||
extern bool
|
||||
str_normalize(JSContext* cx, unsigned argc, Value* vp);
|
||||
@@ -480,6 +493,12 @@ JSString*
|
||||
str_replaceAll_string_raw(JSContext* cx, HandleString string, HandleString pattern,
|
||||
HandleString replacement);
|
||||
|
||||
extern JSString*
|
||||
StringToLowerCase(JSContext* cx, HandleLinearString string);
|
||||
|
||||
extern JSString*
|
||||
StringToUpperCase(JSContext* cx, HandleLinearString string);
|
||||
|
||||
extern bool
|
||||
StringConstructor(JSContext* cx, unsigned argc, Value* vp);
|
||||
|
||||
|
||||
@@ -2207,11 +2207,9 @@ static const JSFunctionSpec intrinsic_functions[] = {
|
||||
JS_FN("std_String_trimStart", str_trimStart, 0,0),
|
||||
JS_FN("std_String_trimRight", str_trimEnd, 0,0),
|
||||
JS_FN("std_String_trimEnd", str_trimEnd, 0,0),
|
||||
JS_FN("std_String_toLocaleLowerCase", str_toLocaleLowerCase, 0,0),
|
||||
JS_FN("std_String_toLocaleUpperCase", str_toLocaleUpperCase, 0,0),
|
||||
JS_FN("std_String_normalize", str_normalize, 0,0),
|
||||
JS_FN("std_String_concat", str_concat, 1,0),
|
||||
|
||||
|
||||
JS_FN("std_TypedArray_buffer", js::TypedArray_bufferGetter, 1,0),
|
||||
|
||||
JS_FN("std_WeakMap_has", WeakMap_has, 1,0),
|
||||
@@ -2485,6 +2483,8 @@ static const JSFunctionSpec intrinsic_functions[] = {
|
||||
JS_FN("intl_PluralRules_availableLocales", intl_PluralRules_availableLocales, 0,0),
|
||||
JS_FN("intl_GetPluralCategories", intl_GetPluralCategories, 2, 0),
|
||||
JS_FN("intl_SelectPluralRule", intl_SelectPluralRule, 2,0),
|
||||
JS_FN("intl_toLocaleLowerCase", intl_toLocaleLowerCase, 2,0),
|
||||
JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2,0),
|
||||
JS_FN("intl_RelativeTimeFormat_availableLocales", intl_RelativeTimeFormat_availableLocales, 0,0),
|
||||
JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 3,0),
|
||||
|
||||
|
||||
@@ -0,0 +1,281 @@
|
||||
# SpecialCasing-11.0.0.txt
|
||||
# Date: 2018-02-22, 06:16:47 GMT
|
||||
# © 2018 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Special Casing
|
||||
#
|
||||
# This file is a supplement to the UnicodeData.txt file. It does not define any
|
||||
# properties, but rather provides additional information about the casing of
|
||||
# Unicode characters, for situations when casing incurs a change in string length
|
||||
# or is dependent on context or locale. For compatibility, the UnicodeData.txt
|
||||
# file only contains simple case mappings for characters where they are one-to-one
|
||||
# and independent of context and language. The data in this file, combined with
|
||||
# the simple case mappings in UnicodeData.txt, defines the full case mappings
|
||||
# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
|
||||
#
|
||||
# Note that the preferred mechanism for defining tailored casing operations is
|
||||
# the Unicode Common Locale Data Repository (CLDR). For more information, see the
|
||||
# discussion of case mappings and case algorithms in the Unicode Standard.
|
||||
#
|
||||
# All code points not listed in this file that do not have a simple case mapping
|
||||
# in UnicodeData.txt map to themselves.
|
||||
# ================================================================================
|
||||
# Format
|
||||
# ================================================================================
|
||||
# The entries in this file are in the following machine-readable format:
|
||||
#
|
||||
# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
|
||||
#
|
||||
# <code>, <lower>, <title>, and <upper> provide the respective full case mappings
|
||||
# of <code>, expressed as character values in hex. If there is more than one character,
|
||||
# they are separated by spaces. Other than as used to separate elements, spaces are
|
||||
# to be ignored.
|
||||
#
|
||||
# The <condition_list> is optional. Where present, it consists of one or more language IDs
|
||||
# or casing contexts, separated by spaces. In these conditions:
|
||||
# - A condition list overrides the normal behavior if all of the listed conditions are true.
|
||||
# - The casing context is always the context of the characters in the original string,
|
||||
# NOT in the resulting string.
|
||||
# - Case distinctions in the condition list are not significant.
|
||||
# - Conditions preceded by "Not_" represent the negation of the condition.
|
||||
# The condition list is not represented in the UCD as a formal property.
|
||||
#
|
||||
# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
|
||||
#
|
||||
# A casing context for a character is defined by Section 3.13 Default Case Algorithms
|
||||
# of The Unicode Standard.
|
||||
#
|
||||
# Parsers of this file must be prepared to deal with future additions to this format:
|
||||
# * Additional contexts
|
||||
# * Additional fields
|
||||
# ================================================================================
|
||||
|
||||
# ================================================================================
|
||||
# Unconditional mappings
|
||||
# ================================================================================
|
||||
|
||||
# The German es-zed is special--the normal mapping is to SS.
|
||||
# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))
|
||||
|
||||
00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
|
||||
|
||||
# Preserve canonical equivalence for I with dot. Turkic is handled below.
|
||||
|
||||
0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
||||
# Ligatures
|
||||
|
||||
FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
|
||||
FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI
|
||||
FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL
|
||||
FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI
|
||||
FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL
|
||||
FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T
|
||||
FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST
|
||||
|
||||
0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN
|
||||
FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW
|
||||
FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH
|
||||
FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI
|
||||
FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW
|
||||
FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
|
||||
|
||||
# No corresponding uppercase precomposed character
|
||||
|
||||
0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
|
||||
0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
|
||||
03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
|
||||
01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON
|
||||
1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW
|
||||
1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS
|
||||
1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE
|
||||
1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
|
||||
1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
|
||||
1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
|
||||
1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
|
||||
1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
|
||||
1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
|
||||
1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
|
||||
1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
|
||||
1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
|
||||
1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
|
||||
1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
|
||||
1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
|
||||
1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
|
||||
1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
|
||||
1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI
|
||||
1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
|
||||
1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
|
||||
1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
|
||||
|
||||
# IMPORTANT-when iota-subscript (0345) is uppercased or titlecased,
|
||||
# the result will be incorrect unless the iota-subscript is moved to the end
|
||||
# of any sequence of combining marks. Otherwise, the accents will go on the capital iota.
|
||||
# This process can be achieved by first transforming the text to NFC before casing.
|
||||
# E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA>
|
||||
|
||||
# The following cases are already in the UnicodeData.txt file, so are only commented here.
|
||||
|
||||
# 0345; 0345; 0399; 0399; # COMBINING GREEK YPOGEGRAMMENI
|
||||
|
||||
# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
|
||||
# have special uppercases.
|
||||
# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
|
||||
|
||||
1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
|
||||
1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
|
||||
1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
|
||||
1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
|
||||
1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
|
||||
1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
|
||||
1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
|
||||
1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
|
||||
1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
|
||||
1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
|
||||
1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
|
||||
1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
|
||||
1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
|
||||
1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
|
||||
1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
|
||||
1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
|
||||
1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
|
||||
1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
|
||||
1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
|
||||
1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
|
||||
1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
|
||||
1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
|
||||
1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
|
||||
1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
|
||||
1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
|
||||
1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
|
||||
1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
|
||||
1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
|
||||
1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
|
||||
1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
|
||||
1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
|
||||
1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
|
||||
1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
|
||||
1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
|
||||
1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
|
||||
1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
|
||||
1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
|
||||
1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
|
||||
1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
|
||||
1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
|
||||
1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
|
||||
1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
|
||||
1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
|
||||
1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
|
||||
1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
|
||||
1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
|
||||
1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
|
||||
1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
|
||||
1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
|
||||
1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
|
||||
1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
|
||||
1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
|
||||
1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
|
||||
1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
|
||||
|
||||
# Some characters with YPOGEGRAMMENI also have no corresponding titlecases
|
||||
|
||||
1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
|
||||
1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
|
||||
1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
|
||||
1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
|
||||
1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
|
||||
1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
|
||||
|
||||
1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
|
||||
1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
|
||||
1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
|
||||
|
||||
# ================================================================================
|
||||
# Conditional Mappings
|
||||
# The remainder of this file provides conditional casing data used to produce
|
||||
# full case mappings.
|
||||
# ================================================================================
|
||||
# Language-Insensitive Mappings
|
||||
# These are characters whose full case mappings do not depend on language, but do
|
||||
# depend on context (which characters come before or after). For more information
|
||||
# see the header of this file and the Unicode Standard.
|
||||
# ================================================================================
|
||||
|
||||
# Special case for final form of sigma
|
||||
|
||||
03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
||||
|
||||
# Note: the following cases for non-final are already in the UnicodeData.txt file.
|
||||
|
||||
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
|
||||
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
|
||||
# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
|
||||
|
||||
# Note: the following cases are not included, since they would case-fold in lowercasing
|
||||
|
||||
# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
|
||||
# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
|
||||
|
||||
# ================================================================================
|
||||
# Language-Sensitive Mappings
|
||||
# These are characters whose full case mappings depend on language and perhaps also
|
||||
# context (which characters come before or after). For more information
|
||||
# see the header of this file and the Unicode Standard.
|
||||
# ================================================================================
|
||||
|
||||
# Lithuanian
|
||||
|
||||
# Lithuanian retains the dot in a lowercase i when followed by accents.
|
||||
|
||||
# Remove DOT ABOVE after "i" with upper or titlecase
|
||||
|
||||
0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
|
||||
# Introduce an explicit dot above when lowercasing capital I's and J's
|
||||
# whenever there are more accents above.
|
||||
# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
|
||||
|
||||
0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
|
||||
004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
|
||||
012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
|
||||
00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
|
||||
|
||||
# ================================================================================
|
||||
|
||||
# Turkish and Azeri
|
||||
|
||||
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
# The following rules handle those cases.
|
||||
|
||||
0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
||||
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
# This matches the behavior of the canonically equivalent I-dot_above
|
||||
|
||||
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
|
||||
|
||||
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
|
||||
0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
|
||||
0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
|
||||
|
||||
# When uppercasing, i turns into a dotted capital I
|
||||
|
||||
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
|
||||
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
|
||||
|
||||
# Note: the following case is already in the UnicodeData.txt file.
|
||||
|
||||
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
|
||||
|
||||
# EOF
|
||||
|
||||
+1608
-1008
File diff suppressed because it is too large
Load Diff
+55
-2
@@ -62,8 +62,16 @@ namespace CharFlag {
|
||||
const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY;
|
||||
}
|
||||
|
||||
const char16_t NO_BREAK_SPACE = 0x00A0;
|
||||
const char16_t MICRO_SIGN = 0x00B5;
|
||||
const char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
|
||||
const char16_t LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF;
|
||||
const char16_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = 0x0130;
|
||||
const char16_t COMBINING_DOT_ABOVE = 0x0307;
|
||||
const char16_t GREEK_CAPITAL_LETTER_SIGMA = 0x03A3;
|
||||
const char16_t GREEK_SMALL_LETTER_FINAL_SIGMA = 0x03C2;
|
||||
const char16_t GREEK_SMALL_LETTER_SIGMA = 0x03C3;
|
||||
const char16_t BYTE_ORDER_MARK2 = 0xFFFE;
|
||||
const char16_t NO_BREAK_SPACE = 0x00A0;
|
||||
|
||||
const char16_t LeadSurrogateMin = 0xD800;
|
||||
const char16_t LeadSurrogateMax = 0xDBFF;
|
||||
@@ -239,6 +247,10 @@ IsSpaceOrBOM2(char16_t ch)
|
||||
return CharInfo(ch).isSpace();
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the simple upper case mapping (see CanUpperCaseSpecialCasing for
|
||||
* details) of the given UTF-16 code unit.
|
||||
*/
|
||||
inline char16_t
|
||||
ToUpperCase(char16_t ch)
|
||||
{
|
||||
@@ -253,6 +265,10 @@ ToUpperCase(char16_t ch)
|
||||
return uint16_t(ch) + info.upperCase;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the simple lower case mapping (see CanUpperCaseSpecialCasing for
|
||||
* details) of the given UTF-16 code unit.
|
||||
*/
|
||||
inline char16_t
|
||||
ToLowerCase(char16_t ch)
|
||||
{
|
||||
@@ -329,6 +345,43 @@ ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail)
|
||||
return trail;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the given UTF-16 code unit has a language-independent,
|
||||
* unconditional or conditional special upper case mapping.
|
||||
*
|
||||
* Unicode defines two case mapping modes:
|
||||
* 1. "simple case mappings" for one-to-one mappings which are independent of
|
||||
* context and language (defined in UnicodeData.txt).
|
||||
* 2. "special case mappings" for mappings which can increase or decrease the
|
||||
* string length; or are dependent on context or locale (defined in
|
||||
* SpecialCasing.txt).
|
||||
*
|
||||
* The CanUpperCase() method defined above only supports simple case mappings.
|
||||
* In order to support the full case mappings of all Unicode characters,
|
||||
* callers need to check this method in addition to CanUpperCase().
|
||||
*
|
||||
* NOTE: All special upper case mappings are unconditional in Unicode 9.
|
||||
*/
|
||||
bool
|
||||
CanUpperCaseSpecialCasing(char16_t ch);
|
||||
|
||||
/*
|
||||
* Returns the length of the upper case mapping of |ch|.
|
||||
*
|
||||
* This function asserts if |ch| doesn't have a special upper case mapping.
|
||||
*/
|
||||
size_t
|
||||
LengthUpperCaseSpecialCasing(char16_t ch);
|
||||
|
||||
/*
|
||||
* Appends the upper case mapping of |ch| to the given output buffer,
|
||||
* starting at the provided index.
|
||||
*
|
||||
* This function asserts if |ch| doesn't have a special upper case mapping.
|
||||
*/
|
||||
void
|
||||
AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index);
|
||||
|
||||
/*
|
||||
* For a codepoint C, CodepointsWithSameUpperCaseInfo stores three offsets
|
||||
* from C to up to three codepoints with same uppercase (no codepoint in
|
||||
@@ -504,7 +557,7 @@ UTF16Encode(uint32_t codePoint, char16_t* lead, char16_t* trail)
|
||||
*trail = TrailSurrogate(codePoint);
|
||||
}
|
||||
|
||||
static inline void
|
||||
inline void
|
||||
UTF16Encode(uint32_t codePoint, char16_t* elements, unsigned* index)
|
||||
{
|
||||
if (!IsSupplementary(codePoint)) {
|
||||
|
||||
@@ -19,6 +19,12 @@
|
||||
// DIFF: the difference between the code point in the range and
|
||||
// converted code point
|
||||
|
||||
// U+10400 DESERET CAPITAL LETTER LONG I .. U+10427 DESERET CAPITAL LETTER EW
|
||||
// U+104B0 OSAGE CAPITAL LETTER A .. U+104D3 OSAGE CAPITAL LETTER ZHA
|
||||
// U+10C80 OLD HUNGARIAN CAPITAL LETTER A .. U+10CB2 OLD HUNGARIAN CAPITAL LETTER US
|
||||
// U+118A0 WARANG CITI CAPITAL LETTER NGAA .. U+118BF WARANG CITI CAPITAL LETTER VIYO
|
||||
// U+16E40 MEDEFAIDRIN CAPITAL LETTER M .. U+16E5F MEDEFAIDRIN CAPITAL LETTER Y
|
||||
// U+1E900 ADLAM CAPITAL LETTER ALIF .. U+1E921 ADLAM CAPITAL LETTER SHA
|
||||
#define FOR_EACH_NON_BMP_LOWERCASE(macro) \
|
||||
macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
|
||||
macro(0x104b0, 0x104d3, 0xd801, 0xdcb0, 0xdcd3, 40) \
|
||||
@@ -27,6 +33,12 @@
|
||||
macro(0x16e40, 0x16e5f, 0xd81b, 0xde40, 0xde5f, 32) \
|
||||
macro(0x1e900, 0x1e921, 0xd83a, 0xdd00, 0xdd21, 34)
|
||||
|
||||
// U+10428 DESERET SMALL LETTER LONG I .. U+1044F DESERET SMALL LETTER EW
|
||||
// U+104D8 OSAGE SMALL LETTER A .. U+104FB OSAGE SMALL LETTER ZHA
|
||||
// U+10CC0 OLD HUNGARIAN SMALL LETTER A .. U+10CF2 OLD HUNGARIAN SMALL LETTER US
|
||||
// U+118C0 WARANG CITI SMALL LETTER NGAA .. U+118DF WARANG CITI SMALL LETTER VIYO
|
||||
// U+16E60 MEDEFAIDRIN SMALL LETTER M .. U+16E7F MEDEFAIDRIN SMALL LETTER Y
|
||||
// U+1E922 ADLAM SMALL LETTER ALIF .. U+1E943 ADLAM SMALL LETTER SHA
|
||||
#define FOR_EACH_NON_BMP_UPPERCASE(macro) \
|
||||
macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
|
||||
macro(0x104d8, 0x104fb, 0xd801, 0xdcd8, 0xdcfb, -40) \
|
||||
@@ -35,6 +47,12 @@
|
||||
macro(0x16e60, 0x16e7f, 0xd81b, 0xde60, 0xde7f, -32) \
|
||||
macro(0x1e922, 0x1e943, 0xd83a, 0xdd22, 0xdd43, -34)
|
||||
|
||||
// U+10400 DESERET CAPITAL LETTER LONG I .. U+10427 DESERET CAPITAL LETTER EW
|
||||
// U+104B0 OSAGE CAPITAL LETTER A .. U+104D3 OSAGE CAPITAL LETTER ZHA
|
||||
// U+10C80 OLD HUNGARIAN CAPITAL LETTER A .. U+10CB2 OLD HUNGARIAN CAPITAL LETTER US
|
||||
// U+118A0 WARANG CITI CAPITAL LETTER NGAA .. U+118BF WARANG CITI CAPITAL LETTER VIYO
|
||||
// U+16E40 MEDEFAIDRIN CAPITAL LETTER M .. U+16E5F MEDEFAIDRIN CAPITAL LETTER Y
|
||||
// U+1E900 ADLAM CAPITAL LETTER ALIF .. U+1E921 ADLAM CAPITAL LETTER SHA
|
||||
#define FOR_EACH_NON_BMP_CASE_FOLDING(macro) \
|
||||
macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
|
||||
macro(0x104b0, 0x104d3, 0xd801, 0xdcb0, 0xdcd3, 40) \
|
||||
@@ -43,6 +61,12 @@
|
||||
macro(0x16e40, 0x16e5f, 0xd81b, 0xde40, 0xde5f, 32) \
|
||||
macro(0x1e900, 0x1e921, 0xd83a, 0xdd00, 0xdd21, 34)
|
||||
|
||||
// U+10428 DESERET SMALL LETTER LONG I .. U+1044F DESERET SMALL LETTER EW
|
||||
// U+104D8 OSAGE SMALL LETTER A .. U+104FB OSAGE SMALL LETTER ZHA
|
||||
// U+10CC0 OLD HUNGARIAN SMALL LETTER A .. U+10CF2 OLD HUNGARIAN SMALL LETTER US
|
||||
// U+118C0 WARANG CITI SMALL LETTER NGAA .. U+118DF WARANG CITI SMALL LETTER VIYO
|
||||
// U+16E60 MEDEFAIDRIN SMALL LETTER M .. U+16E7F MEDEFAIDRIN SMALL LETTER Y
|
||||
// U+1E922 ADLAM SMALL LETTER ALIF .. U+1E943 ADLAM SMALL LETTER SHA
|
||||
#define FOR_EACH_NON_BMP_REV_CASE_FOLDING(macro) \
|
||||
macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
|
||||
macro(0x104d8, 0x104fb, 0xd801, 0xdcd8, 0xdcfb, -40) \
|
||||
|
||||
+515
-152
@@ -26,6 +26,18 @@ import re
|
||||
import os
|
||||
import sys
|
||||
from contextlib import closing
|
||||
from functools import partial
|
||||
from itertools import chain, groupby, ifilter, imap, izip_longest, tee
|
||||
from operator import is_not, itemgetter
|
||||
|
||||
class codepoint_dict(dict):
|
||||
def name(self, code_point):
|
||||
(_, _, name, alias) = self[code_point]
|
||||
return '{}{}'.format(name, (' (' + alias + ')' if alias else ''))
|
||||
|
||||
def full_name(self, code_point):
|
||||
(_, _, name, alias) = self[code_point]
|
||||
return 'U+{:04X} {}{}'.format(code_point, name, (' (' + alias + ')' if alias else ''))
|
||||
|
||||
# ECMAScript 2016
|
||||
# §11.2 White Space
|
||||
@@ -132,10 +144,32 @@ def read_derived_core_properties(derived_core_properties):
|
||||
for char in range(int(start, 16), int(end, 16) + 1):
|
||||
yield (char, char_property)
|
||||
|
||||
def read_special_casing(special_casing):
|
||||
# Format:
|
||||
# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
|
||||
for line in special_casing:
|
||||
if line == '\n' or line.startswith('#'):
|
||||
continue
|
||||
row = line.split('#')[0].split(';')
|
||||
code = int(row[0].strip(), 16)
|
||||
lower = row[1].strip()
|
||||
lower = [int(c, 16) for c in lower.split(' ')] if lower else []
|
||||
upper = row[3].strip()
|
||||
upper = [int(c, 16) for c in upper.split(' ')] if upper else []
|
||||
languages = []
|
||||
contexts = []
|
||||
condition = row[4].strip()
|
||||
if condition:
|
||||
for cond in condition.split(' '):
|
||||
if cond[0].islower():
|
||||
languages.append(cond)
|
||||
else:
|
||||
contexts.append(cond)
|
||||
pass
|
||||
yield (code, lower, upper, languages, contexts)
|
||||
|
||||
def int_ranges(ints):
|
||||
""" Yields consecutive ranges (inclusive) from integer values. """
|
||||
from itertools import tee, izip_longest
|
||||
|
||||
(a, b) = tee(sorted(ints))
|
||||
start = next(b)
|
||||
for (curr, succ) in izip_longest(a, b):
|
||||
@@ -153,7 +187,7 @@ def utf16_encode(code):
|
||||
|
||||
return lead, trail
|
||||
|
||||
def make_non_bmp_convert_macro(out_file, name, convert_map):
|
||||
def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table):
|
||||
# Find continuous range in convert_map.
|
||||
convert_list = []
|
||||
entry = None
|
||||
@@ -179,6 +213,7 @@ def make_non_bmp_convert_macro(out_file, name, convert_map):
|
||||
|
||||
# Generate macro call for each range.
|
||||
lines = []
|
||||
comment = []
|
||||
for entry in convert_list:
|
||||
from_code = entry['code']
|
||||
to_code = entry['code'] + entry['length'] - 1
|
||||
@@ -190,29 +225,15 @@ def make_non_bmp_convert_macro(out_file, name, convert_map):
|
||||
|
||||
lines.append(' macro(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})'.format(
|
||||
from_code, to_code, lead, from_trail, to_trail, diff))
|
||||
comment.append('// {} .. {}'.format(codepoint_table.full_name(from_code),
|
||||
codepoint_table.full_name(to_code)))
|
||||
|
||||
out_file.write('\n'.join(comment))
|
||||
out_file.write('\n')
|
||||
out_file.write('#define FOR_EACH_NON_BMP_{}(macro) \\\n'.format(name))
|
||||
out_file.write(' \\\n'.join(lines))
|
||||
out_file.write('\n')
|
||||
|
||||
def for_each_non_bmp_group(group_set):
|
||||
# Find continuous range in group_set.
|
||||
group_list = []
|
||||
entry = None
|
||||
for code in sorted(group_set.keys()):
|
||||
if entry and code == entry['code'] + entry['length']:
|
||||
entry['length'] += 1
|
||||
continue
|
||||
|
||||
entry = {
|
||||
'code': code,
|
||||
'length': 1
|
||||
}
|
||||
group_list.append(entry)
|
||||
|
||||
for entry in group_list:
|
||||
yield (entry['code'], entry['code'] + entry['length'] - 1)
|
||||
|
||||
def process_derived_core_properties(derived_core_properties):
|
||||
id_start = set()
|
||||
id_continue = set()
|
||||
@@ -236,7 +257,7 @@ def process_unicode_data(unicode_data, derived_core_properties):
|
||||
same_upper_cache = {same_upper_dummy: 0}
|
||||
same_upper_index = [0] * (MAX_BMP + 1)
|
||||
|
||||
test_table = {}
|
||||
codepoint_table = codepoint_dict()
|
||||
test_space_table = []
|
||||
|
||||
non_bmp_lower_map = {}
|
||||
@@ -254,15 +275,9 @@ def process_unicode_data(unicode_data, derived_core_properties):
|
||||
alias = row[-5]
|
||||
uppercase = row[-3]
|
||||
lowercase = row[-2]
|
||||
flags = 0
|
||||
|
||||
if uppercase:
|
||||
upper = int(uppercase, 16)
|
||||
|
||||
if upper not in same_upper_map:
|
||||
same_upper_map[upper] = [code]
|
||||
else:
|
||||
same_upper_map[upper].append(code)
|
||||
else:
|
||||
upper = code
|
||||
|
||||
@@ -271,6 +286,8 @@ def process_unicode_data(unicode_data, derived_core_properties):
|
||||
else:
|
||||
lower = code
|
||||
|
||||
codepoint_table[code] = (upper, lower, name, alias)
|
||||
|
||||
if code > MAX_BMP:
|
||||
if code != lower:
|
||||
non_bmp_lower_map[code] = lower
|
||||
@@ -285,6 +302,16 @@ def process_unicode_data(unicode_data, derived_core_properties):
|
||||
non_bmp_id_cont_set[code] = 1
|
||||
continue
|
||||
|
||||
assert lower <= MAX_BMP and upper <= MAX_BMP
|
||||
|
||||
if code != upper:
|
||||
if upper not in same_upper_map:
|
||||
same_upper_map[upper] = [code]
|
||||
else:
|
||||
same_upper_map[upper].append(code)
|
||||
|
||||
flags = 0
|
||||
|
||||
# we combine whitespace and line terminators because in practice we don't need them separated
|
||||
if category == 'Zs' or code in whitespace or code in line_terminator:
|
||||
flags |= FLAG_SPACE
|
||||
@@ -298,8 +325,6 @@ def process_unicode_data(unicode_data, derived_core_properties):
|
||||
elif code in id_continue or code in compatibility_identifier_part:
|
||||
flags |= FLAG_UNICODE_ID_CONTINUE_ONLY
|
||||
|
||||
test_table[code] = (upper, lower, name, alias)
|
||||
|
||||
up_d = upper - code
|
||||
low_d = lower - code
|
||||
|
||||
@@ -319,12 +344,12 @@ def process_unicode_data(unicode_data, derived_core_properties):
|
||||
index[code] = i
|
||||
|
||||
for code in range(0, MAX_BMP + 1):
|
||||
entry = test_table.get(code)
|
||||
entry = codepoint_table.get(code)
|
||||
|
||||
if not entry:
|
||||
continue
|
||||
|
||||
(upper, lower, name, alias) = entry
|
||||
(upper, _, _, _) = entry
|
||||
|
||||
if upper not in same_upper_map:
|
||||
continue
|
||||
@@ -354,7 +379,7 @@ def process_unicode_data(unicode_data, derived_core_properties):
|
||||
non_bmp_lower_map, non_bmp_upper_map,
|
||||
non_bmp_space_set,
|
||||
non_bmp_id_start_set, non_bmp_id_cont_set,
|
||||
test_table, test_space_table,
|
||||
codepoint_table, test_space_table,
|
||||
)
|
||||
|
||||
def process_case_folding(case_folding):
|
||||
@@ -438,9 +463,149 @@ def process_case_folding(case_folding):
|
||||
folding_tests
|
||||
)
|
||||
|
||||
def process_special_casing(special_casing, table, index):
|
||||
# Unconditional special casing.
|
||||
unconditional_tolower = {}
|
||||
unconditional_toupper = {}
|
||||
|
||||
# Conditional special casing, language independent.
|
||||
conditional_tolower = {}
|
||||
conditional_toupper = {}
|
||||
|
||||
# Conditional special casing, language dependent.
|
||||
lang_conditional_tolower = {}
|
||||
lang_conditional_toupper = {}
|
||||
|
||||
def caseInfo(code):
|
||||
(upper, lower, flags) = table[index[code]]
|
||||
return ((code + lower) & 0xffff, (code + upper) & 0xffff)
|
||||
|
||||
for (code, lower, upper, languages, contexts) in read_special_casing(special_casing):
|
||||
assert code <= MAX_BMP, 'Unexpected character outside of BMP: %s' % code
|
||||
assert len(languages) <= 1, 'Expected zero or one language ids: %s' % languages
|
||||
assert len(contexts) <= 1, 'Expected zero or one casing contexts: %s' % languages
|
||||
|
||||
(default_lower, default_upper) = caseInfo(code)
|
||||
special_lower = len(lower) != 1 or lower[0] != default_lower
|
||||
special_upper = len(upper) != 1 or upper[0] != default_upper
|
||||
|
||||
# Invariant: If |code| has casing per UnicodeData.txt, then it also has
|
||||
# casing rules in SpecialCasing.txt.
|
||||
assert code == default_lower or len(lower) != 1 or code != lower[0]
|
||||
assert code == default_upper or len(upper) != 1 or code != upper[0]
|
||||
|
||||
language = languages[0] if languages else None
|
||||
context = contexts[0] if contexts else None
|
||||
|
||||
if not language and not context:
|
||||
if special_lower:
|
||||
unconditional_tolower[code] = lower
|
||||
if special_upper:
|
||||
unconditional_toupper[code] = upper
|
||||
elif not language and context:
|
||||
if special_lower:
|
||||
conditional_tolower[code] = (lower, context)
|
||||
if special_upper:
|
||||
conditional_toupper[code] = (upper, context)
|
||||
else:
|
||||
if language not in lang_conditional_tolower:
|
||||
lang_conditional_tolower[language] = {}
|
||||
lang_conditional_toupper[language] = {}
|
||||
if special_lower:
|
||||
lang_conditional_tolower[language][code] = (lower, context)
|
||||
if special_upper:
|
||||
lang_conditional_toupper[language][code] = (upper, context)
|
||||
|
||||
# Certain special casing rules are inlined in jsstr.cpp, ensure these cases
|
||||
# still match the current SpecialCasing.txt file.
|
||||
def lowerCase(code):
|
||||
(lower, _) = caseInfo(code)
|
||||
return lower
|
||||
|
||||
def upperCase(code):
|
||||
(_, upper) = caseInfo(code)
|
||||
return upper
|
||||
|
||||
def ascii(char_dict):
|
||||
return ifilter(lambda ch: ch <= 0x7f, char_dict.iterkeys())
|
||||
|
||||
def latin1(char_dict):
|
||||
return ifilter(lambda ch: ch <= 0xff, char_dict.iterkeys())
|
||||
|
||||
def is_empty(iterable):
|
||||
return not any(True for _ in iterable)
|
||||
|
||||
def is_equals(iter1, iter2):
|
||||
return all(x == y for (x, y) in izip_longest(iter1, iter2))
|
||||
|
||||
# Ensure no ASCII characters have special case mappings.
|
||||
assert is_empty(ascii(unconditional_tolower))
|
||||
assert is_empty(ascii(unconditional_toupper))
|
||||
assert is_empty(ascii(conditional_tolower))
|
||||
assert is_empty(ascii(conditional_toupper))
|
||||
|
||||
# Ensure no Latin1 characters have special lower case mappings.
|
||||
assert is_empty(latin1(unconditional_tolower))
|
||||
assert is_empty(latin1(conditional_tolower))
|
||||
|
||||
# Ensure no Latin1 characters have conditional special upper case mappings.
|
||||
assert is_empty(latin1(conditional_toupper))
|
||||
|
||||
# Ensure U+00DF is the only Latin1 character with a special upper case mapping.
|
||||
assert is_equals([0x00DF], latin1(unconditional_toupper))
|
||||
|
||||
# Ensure U+0130 is the only character with an unconditional special lower case mapping.
|
||||
assert is_equals([0x0130], unconditional_tolower)
|
||||
|
||||
# Ensure no characters have language independent conditional upper case mappings.
|
||||
assert is_empty(conditional_toupper)
|
||||
|
||||
# Ensure U+03A3 is the only character with language independent conditional lower case mapping.
|
||||
assert is_equals([0x03A3], conditional_tolower)
|
||||
|
||||
# Verify U+0130 and U+03A3 have simple lower case mappings.
|
||||
assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])
|
||||
|
||||
# Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
|
||||
assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.iterkeys()))
|
||||
assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.iterkeys()))
|
||||
|
||||
# Maximum case mapping length is three characters.
|
||||
itervals = lambda d: d.itervalues()
|
||||
assert max(imap(len, chain(
|
||||
itervals(unconditional_tolower),
|
||||
itervals(unconditional_toupper),
|
||||
imap(itemgetter(0), itervals(conditional_tolower)),
|
||||
imap(itemgetter(0), itervals(conditional_toupper)),
|
||||
imap(itemgetter(0), chain.from_iterable(imap(itervals, itervals(lang_conditional_tolower)))),
|
||||
imap(itemgetter(0), chain.from_iterable(imap(itervals, itervals(lang_conditional_toupper)))),
|
||||
))) <= 3
|
||||
|
||||
# Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
|
||||
assert set([
|
||||
'After_I', 'After_Soft_Dotted', 'Final_Sigma', 'More_Above', 'Not_Before_Dot',
|
||||
]).issuperset(set(ifilter(partial(is_not, None), chain(
|
||||
imap(itemgetter(1), itervals(conditional_tolower)),
|
||||
imap(itemgetter(1), itervals(conditional_toupper)),
|
||||
imap(itemgetter(1), chain.from_iterable(imap(itervals, itervals(lang_conditional_tolower)))),
|
||||
imap(itemgetter(1), chain.from_iterable(imap(itervals, itervals(lang_conditional_toupper)))),
|
||||
))))
|
||||
|
||||
# Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
|
||||
assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [0x0053, 0x0053];
|
||||
|
||||
# Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
|
||||
assert unconditional_tolower[0x0130] == [0x0069, 0x0307]
|
||||
|
||||
# Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
|
||||
assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == ([0x03C2], 'Final_Sigma');
|
||||
|
||||
return (unconditional_tolower, unconditional_toupper)
|
||||
|
||||
def make_non_bmp_file(version,
|
||||
non_bmp_lower_map, non_bmp_upper_map,
|
||||
non_bmp_folding_map, non_bmp_rev_folding_map):
|
||||
non_bmp_folding_map, non_bmp_rev_folding_map,
|
||||
codepoint_table):
|
||||
file_name = 'UnicodeNonBMP.h';
|
||||
with io.open(file_name, mode='wb') as non_bmp_file:
|
||||
non_bmp_file.write(mpl_license)
|
||||
@@ -463,77 +628,277 @@ def make_non_bmp_file(version,
|
||||
|
||||
""")
|
||||
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map)
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map, codepoint_table)
|
||||
non_bmp_file.write('\n')
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map)
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map, codepoint_table)
|
||||
non_bmp_file.write('\n')
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map)
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map, codepoint_table)
|
||||
non_bmp_file.write('\n')
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map)
|
||||
make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map, codepoint_table)
|
||||
|
||||
non_bmp_file.write("""
|
||||
#endif /* vm_UnicodeNonBMP_h */
|
||||
""")
|
||||
|
||||
def make_bmp_mapping_test(version, test_table):
|
||||
def write_special_casing_methods(unconditional_toupper, codepoint_table, println):
|
||||
def hexlit(n):
|
||||
""" Returns C++ hex-literal for |n|. """
|
||||
return '0x{:04X}'.format(n)
|
||||
|
||||
def describe_range(ranges, depth):
|
||||
indent = depth * ' '
|
||||
for (start, end) in ranges:
|
||||
if start == end:
|
||||
println(indent, '// {}'.format(codepoint_table.full_name(start)))
|
||||
else:
|
||||
println(indent, '// {} .. {}'.format(codepoint_table.full_name(start),
|
||||
codepoint_table.full_name(end)))
|
||||
|
||||
def out_range(start, end):
|
||||
""" Tests if the input character isn't a member of the set {x | start <= x <= end}. """
|
||||
if (start == end):
|
||||
return 'ch != {}'.format(hexlit(start))
|
||||
return 'ch < {} || ch > {}'.format(hexlit(start), hexlit(end))
|
||||
|
||||
def in_range(start, end, parenthesize=False):
|
||||
""" Tests if the input character is in the set {x | start <= x <= end}. """
|
||||
if (start == end):
|
||||
return 'ch == {}'.format(hexlit(start))
|
||||
(left, right) = ('(', ')') if parenthesize else ('', '')
|
||||
return '{}ch >= {} && ch <= {}{}'.format(left, hexlit(start), hexlit(end), right)
|
||||
|
||||
def in_any_range(ranges, spaces):
|
||||
""" Tests if the input character is included in any of the given ranges. """
|
||||
lines = [[]]
|
||||
for (start, end) in ranges:
|
||||
expr = in_range(start, end, parenthesize=True)
|
||||
line = ' || '.join(lines[-1] + [expr])
|
||||
if len(line) < (100 - len(spaces) - len(' ||')):
|
||||
lines[-1].append(expr)
|
||||
else:
|
||||
lines.append([expr])
|
||||
return ' ||\n{}'.format(spaces).join(imap(lambda t: ' || '.join(t), lines))
|
||||
|
||||
def write_range_accept(parent_list, child_list, depth):
|
||||
""" Accepts the input character if it matches any code unit in |child_list|. """
|
||||
(min_parent, max_parent) = (parent_list[0], parent_list[-1])
|
||||
(min_child, max_child) = (child_list[0], child_list[-1])
|
||||
assert min_child >= min_parent
|
||||
assert max_child <= max_parent
|
||||
indent = depth * ' '
|
||||
|
||||
child_ranges = list(int_ranges(child_list))
|
||||
has_successor = max_child != max_parent
|
||||
|
||||
# If |child_list| is a contiguous list of code units, emit a simple
|
||||
# range check: |min_child <= input <= max_child|.
|
||||
if len(child_ranges) == 1:
|
||||
describe_range(child_ranges, depth)
|
||||
if has_successor:
|
||||
println(indent, 'if (ch <= {})'.format(hexlit(max_child)))
|
||||
println(indent, ' return ch >= {};'.format(hexlit(min_child)))
|
||||
else:
|
||||
println(indent, 'return {};'.format(in_range(min_child, max_child)))
|
||||
return
|
||||
|
||||
# Otherwise create a disjunction over the subranges in |child_ranges|.
|
||||
if not has_successor:
|
||||
spaces = indent + len('return ') * ' '
|
||||
else:
|
||||
spaces = indent + len(' return ') * ' '
|
||||
range_test_expr = in_any_range(child_ranges, spaces)
|
||||
|
||||
if min_child != min_parent:
|
||||
println(indent, 'if (ch < {})'.format(hexlit(min_child)))
|
||||
println(indent, ' return false;')
|
||||
|
||||
# If there's no successor block, we can omit the |input <= max_child| check,
|
||||
# because it was already checked when we emitted the parent range test.
|
||||
if not has_successor:
|
||||
describe_range(child_ranges, depth)
|
||||
println(indent, 'return {};'.format(range_test_expr))
|
||||
else:
|
||||
println(indent, 'if (ch <= {}) {{'.format(hexlit(max_child)))
|
||||
describe_range(child_ranges, depth + 1)
|
||||
println(indent, ' return {};'.format(range_test_expr))
|
||||
println(indent, '}')
|
||||
|
||||
def write_CanUpperCaseSpecialCasing():
|
||||
""" Checks if the input has a special upper case mapping. """
|
||||
println('bool')
|
||||
println('js::unicode::CanUpperCaseSpecialCasing(char16_t ch)')
|
||||
println('{')
|
||||
|
||||
assert unconditional_toupper, "|unconditional_toupper| is not empty"
|
||||
|
||||
# Sorted list of code units with special upper case mappings.
|
||||
code_list = sorted(unconditional_toupper.iterkeys())
|
||||
|
||||
# Fail-fast if the input character isn't a special casing character.
|
||||
println(' if ({})'.format(out_range(code_list[0], code_list[-1])))
|
||||
println(' return false;')
|
||||
|
||||
for i in range(0, 16):
|
||||
# Check if the input character is in the range:
|
||||
# |start_point <= input < end_point|.
|
||||
start_point = i << 12
|
||||
end_point = (i + 1) << 12
|
||||
matches = [cu for cu in code_list if start_point <= cu < end_point]
|
||||
|
||||
# Skip empty ranges.
|
||||
if not matches:
|
||||
continue
|
||||
|
||||
# If |matches| consists of only a few characters, directly check
|
||||
# the input against the characters in |matches|.
|
||||
if len(matches) <= 8:
|
||||
write_range_accept(code_list, matches, depth=1)
|
||||
continue
|
||||
|
||||
# Otherwise split into further subranges.
|
||||
|
||||
# Only enter the if-block if the input is less than or equal to the
|
||||
# largest value in the current range.
|
||||
is_last_block = matches[-1] == code_list[-1]
|
||||
if not is_last_block:
|
||||
println(' if (ch <= {}) {{'.format(hexlit(matches[-1])))
|
||||
else:
|
||||
println(' if (ch < {})'.format(hexlit(matches[0])))
|
||||
println(' return false;')
|
||||
|
||||
for j in range(0, 16):
|
||||
inner_start = start_point + (j << 8)
|
||||
inner_end = start_point + ((j + 1) << 8)
|
||||
inner_matches = [cu for cu in matches if inner_start <= cu < inner_end]
|
||||
|
||||
if inner_matches:
|
||||
d = 1 if is_last_block else 2
|
||||
write_range_accept(matches, inner_matches, depth=d)
|
||||
|
||||
if not is_last_block:
|
||||
println(' }')
|
||||
|
||||
println('}')
|
||||
|
||||
def write_LengthUpperCaseSpecialCasing():
|
||||
""" Slow case: Special casing character was found, returns its mapping length. """
|
||||
println('size_t')
|
||||
println('js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)')
|
||||
println('{')
|
||||
|
||||
println(' switch(ch) {')
|
||||
for (code, converted) in sorted(unconditional_toupper.iteritems(), key=itemgetter(0)):
|
||||
println(' case {}: return {}; // {}'.format(hexlit(code), len(converted),
|
||||
codepoint_table.name(code)))
|
||||
println(' }')
|
||||
println('')
|
||||
println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')
|
||||
println(' return 0;')
|
||||
|
||||
println('}')
|
||||
|
||||
def write_AppendUpperCaseSpecialCasing():
|
||||
""" Slow case: Special casing character was found, append its mapping characters. """
|
||||
println('void')
|
||||
println('js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)')
|
||||
println('{')
|
||||
|
||||
println(' switch(ch) {')
|
||||
for (code, converted) in sorted(unconditional_toupper.iteritems(), key=itemgetter(0)):
|
||||
println(' case {}: // {}'.format(hexlit(code), codepoint_table.name(code)))
|
||||
for ch in converted:
|
||||
println(' elements[(*index)++] = {}; // {}'.format(hexlit(ch),
|
||||
codepoint_table.name(ch)))
|
||||
println(' return;')
|
||||
println(' }')
|
||||
println('')
|
||||
println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')
|
||||
println(' return;')
|
||||
|
||||
println('}')
|
||||
|
||||
write_CanUpperCaseSpecialCasing()
|
||||
println('')
|
||||
write_LengthUpperCaseSpecialCasing()
|
||||
println('')
|
||||
write_AppendUpperCaseSpecialCasing()
|
||||
|
||||
def make_bmp_mapping_test(version, codepoint_table, unconditional_tolower, unconditional_toupper):
|
||||
def unicodeEsc(n):
|
||||
return '\u{:04X}'.format(n)
|
||||
|
||||
file_name = '../tests/ecma_5/String/string-upper-lower-mapping.js'
|
||||
with io.open(file_name, mode='wb') as test_mapping:
|
||||
test_mapping.write(warning_message)
|
||||
test_mapping.write(unicode_version_message.format(version))
|
||||
test_mapping.write(public_domain)
|
||||
test_mapping.write('var mapping = [\n')
|
||||
with io.open(file_name, mode='wb') as output:
|
||||
write = partial(print, file=output, sep='', end='')
|
||||
println = partial(print, file=output, sep='', end='\n')
|
||||
|
||||
write(warning_message)
|
||||
write(unicode_version_message.format(version))
|
||||
write(public_domain)
|
||||
println('var mapping = [')
|
||||
for code in range(0, MAX_BMP + 1):
|
||||
entry = test_table.get(code)
|
||||
entry = codepoint_table.get(code)
|
||||
|
||||
if entry:
|
||||
(upper, lower, name, alias) = entry
|
||||
test_mapping.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' +
|
||||
name + (' (' + alias + ')' if alias else '') + ' */\n')
|
||||
(upper, lower, _, _) = entry
|
||||
upper = unconditional_toupper[code] if code in unconditional_toupper else [upper]
|
||||
lower = unconditional_tolower[code] if code in unconditional_tolower else [lower]
|
||||
println(' ["{}", "{}"], /* {} */'.format("".join(imap(unicodeEsc, upper)),
|
||||
"".join(imap(unicodeEsc, lower)),
|
||||
codepoint_table.name(code)))
|
||||
else:
|
||||
test_mapping.write(' [' + hex(code) + ', ' + hex(code) + '],\n')
|
||||
test_mapping.write('];')
|
||||
test_mapping.write("""
|
||||
println(' ["{0}", "{0}"],'.format(unicodeEsc(code)))
|
||||
println('];')
|
||||
write("""
|
||||
assertEq(mapping.length, 0x10000);
|
||||
for (var i = 0; i <= 0xffff; i++) {
|
||||
var char = String.fromCharCode(i);
|
||||
var info = mapping[i];
|
||||
|
||||
assertEq(char.toUpperCase().charCodeAt(0), info[0]);
|
||||
assertEq(char.toLowerCase().charCodeAt(0), info[1]);
|
||||
assertEq(char.toUpperCase(), info[0]);
|
||||
assertEq(char.toLowerCase(), info[1]);
|
||||
}
|
||||
|
||||
if (typeof reportCompare === "function")
|
||||
reportCompare(true, true);
|
||||
""")
|
||||
|
||||
def make_non_bmp_mapping_test(version, non_bmp_upper_map, non_bmp_lower_map):
|
||||
def make_non_bmp_mapping_test(version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table):
|
||||
file_name = '../tests/ecma_6/String/string-code-point-upper-lower-mapping.js'
|
||||
with io.open(file_name, mode='wb') as test_non_bmp_mapping:
|
||||
test_non_bmp_mapping.write(warning_message)
|
||||
test_non_bmp_mapping.write(unicode_version_message.format(version))
|
||||
test_non_bmp_mapping.write(public_domain)
|
||||
|
||||
for code in sorted(non_bmp_upper_map.keys()):
|
||||
test_non_bmp_mapping.write("""\
|
||||
assertEq(String.fromCodePoint(0x{:x}).toUpperCase().codePointAt(0), 0x{:x});
|
||||
""".format(code, non_bmp_upper_map[code]))
|
||||
assertEq(String.fromCodePoint(0x{:04X}).toUpperCase().codePointAt(0), 0x{:04X}); // {}, {}
|
||||
""".format(code, non_bmp_upper_map[code],
|
||||
codepoint_table.name(code), codepoint_table.name(non_bmp_upper_map[code])))
|
||||
|
||||
for code in sorted(non_bmp_lower_map.keys()):
|
||||
test_non_bmp_mapping.write("""\
|
||||
assertEq(String.fromCodePoint(0x{:x}).toLowerCase().codePointAt(0), 0x{:x});
|
||||
""".format(code, non_bmp_lower_map[code]))
|
||||
assertEq(String.fromCodePoint(0x{:04X}).toLowerCase().codePointAt(0), 0x{:04X}); // {}, {}
|
||||
""".format(code, non_bmp_lower_map[code],
|
||||
codepoint_table.name(code), codepoint_table.name(non_bmp_lower_map[code])))
|
||||
|
||||
test_non_bmp_mapping.write("""
|
||||
if (typeof reportCompare === "function")
|
||||
reportCompare(true, true);
|
||||
""")
|
||||
|
||||
def make_space_test(version, test_space_table):
|
||||
def make_space_test(version, test_space_table, codepoint_table):
|
||||
def hex_and_name(c):
|
||||
return ' 0x{:04X} /* {} */'.format(c, codepoint_table.name(c))
|
||||
|
||||
file_name = '../tests/ecma_5/String/string-space-trim.js'
|
||||
with io.open(file_name, mode='wb') as test_space:
|
||||
test_space.write(warning_message)
|
||||
test_space.write(unicode_version_message.format(version))
|
||||
test_space.write(public_domain)
|
||||
test_space.write('var onlySpace = String.fromCharCode(' +
|
||||
', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')
|
||||
test_space.write('var onlySpace = String.fromCharCode(\n')
|
||||
test_space.write(',\n'.join(map(hex_and_name, test_space_table)))
|
||||
test_space.write('\n);\n')
|
||||
test_space.write("""
|
||||
assertEq(onlySpace.trim(), "");
|
||||
assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
|
||||
@@ -544,7 +909,10 @@ if (typeof reportCompare === "function")
|
||||
reportCompare(true, true);
|
||||
""")
|
||||
|
||||
def make_icase_test(version, folding_tests):
|
||||
def make_icase_test(version, folding_tests, codepoint_table):
|
||||
def char_hex(c):
|
||||
return '0x{:04X}'.format(c)
|
||||
|
||||
file_name = '../tests/ecma_6/RegExp/unicode-ignoreCase.js'
|
||||
with io.open(file_name, mode='wb') as test_icase:
|
||||
test_icase.write(warning_message)
|
||||
@@ -565,7 +933,8 @@ function test(code, ...equivs) {
|
||||
}
|
||||
""")
|
||||
for args in folding_tests:
|
||||
test_icase.write('test(' + ','.join([hex(c) for c in args]) + ');\n')
|
||||
test_icase.write('test({}); // {}\n'.format(', '.join(map(char_hex, args)),
|
||||
', '.join(map(codepoint_table.name, args))))
|
||||
test_icase.write("""
|
||||
if (typeof reportCompare === "function")
|
||||
reportCompare(true, true);
|
||||
@@ -576,7 +945,9 @@ def make_unicode_file(version,
|
||||
same_upper_table, same_upper_index,
|
||||
folding_table, folding_index,
|
||||
non_bmp_space_set,
|
||||
non_bmp_id_start_set, non_bmp_id_cont_set):
|
||||
non_bmp_id_start_set, non_bmp_id_cont_set,
|
||||
unconditional_toupper,
|
||||
codepoint_table):
|
||||
index1, index2, shift = splitbins(index)
|
||||
|
||||
# Don't forget to update CharInfo in Unicode.h if you need to change this
|
||||
@@ -665,8 +1036,8 @@ def make_unicode_file(version,
|
||||
* stop if you found the best shift
|
||||
*/
|
||||
"""
|
||||
def dump(data, name, file):
|
||||
file.write('const uint8_t unicode::' + name + '[] = {\n')
|
||||
def dump(data, name, println):
|
||||
println('const uint8_t unicode::{}[] = {{'.format(name))
|
||||
|
||||
line = pad = ' ' * 4
|
||||
lines = []
|
||||
@@ -682,93 +1053,79 @@ def make_unicode_file(version,
|
||||
line = line + s + ', '
|
||||
lines.append(line.rstrip())
|
||||
|
||||
file.write('\n'.join(lines))
|
||||
file.write('\n};\n')
|
||||
println('\n'.join(lines))
|
||||
println('};')
|
||||
|
||||
def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println):
|
||||
println('const {} unicode::{}[] = {{'.format(data_type, name))
|
||||
for d in tbl:
|
||||
println(' {{ {} }},'.format(', '.join(str(e) for e in d)))
|
||||
println('};')
|
||||
println('')
|
||||
|
||||
dump(idx1, idx1_name, println)
|
||||
println('')
|
||||
dump(idx2, idx2_name, println)
|
||||
println('')
|
||||
|
||||
def write_supplemental_identifier_method(name, group_set, println):
|
||||
println('bool')
|
||||
println('js::unicode::{}(uint32_t codePoint)'.format(name))
|
||||
println('{')
|
||||
for (from_code, to_code) in int_ranges(group_set.keys()):
|
||||
println(' if (codePoint >= 0x{:X} && codePoint <= 0x{:X}) // {} .. {}'.format(from_code,
|
||||
to_code,
|
||||
codepoint_table.name(from_code),
|
||||
codepoint_table.name(to_code)))
|
||||
println(' return true;')
|
||||
println(' return false;')
|
||||
println('}')
|
||||
println('')
|
||||
|
||||
file_name = 'Unicode.cpp'
|
||||
with io.open(file_name, 'wb') as data_file:
|
||||
data_file.write(warning_message)
|
||||
data_file.write(unicode_version_message.format(version))
|
||||
data_file.write(public_domain)
|
||||
data_file.write('#include "vm/Unicode.h"\n\n')
|
||||
data_file.write('using namespace js;\n')
|
||||
data_file.write('using namespace js::unicode;\n')
|
||||
data_file.write(comment)
|
||||
data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')
|
||||
for d in table:
|
||||
data_file.write(' {')
|
||||
data_file.write(', '.join((str(e) for e in d)))
|
||||
data_file.write('},\n')
|
||||
data_file.write('};\n')
|
||||
data_file.write('\n')
|
||||
write = partial(print, file=data_file, sep='', end='')
|
||||
println = partial(print, file=data_file, sep='', end='\n')
|
||||
|
||||
dump(index1, 'index1', data_file)
|
||||
data_file.write('\n')
|
||||
dump(index2, 'index2', data_file)
|
||||
data_file.write('\n')
|
||||
write(warning_message)
|
||||
write(unicode_version_message.format(version))
|
||||
write(public_domain)
|
||||
println('#include "vm/Unicode.h"')
|
||||
println('')
|
||||
println('using namespace js;')
|
||||
println('using namespace js::unicode;')
|
||||
write(comment)
|
||||
|
||||
data_file.write('const CodepointsWithSameUpperCaseInfo unicode::js_codepoints_with_same_upper_info[] = {\n')
|
||||
for d in same_upper_table:
|
||||
data_file.write(' {')
|
||||
data_file.write(', '.join((str(e) for e in d)))
|
||||
data_file.write('},\n')
|
||||
data_file.write('};\n')
|
||||
data_file.write('\n')
|
||||
write_table('CharacterInfo',
|
||||
'js_charinfo', table,
|
||||
'index1', index1,
|
||||
'index2', index2,
|
||||
println)
|
||||
|
||||
dump(same_upper_index1, 'codepoints_with_same_upper_index1', data_file)
|
||||
data_file.write('\n')
|
||||
dump(same_upper_index2, 'codepoints_with_same_upper_index2', data_file)
|
||||
data_file.write('\n')
|
||||
write_table('CodepointsWithSameUpperCaseInfo',
|
||||
'js_codepoints_with_same_upper_info', same_upper_table,
|
||||
'codepoints_with_same_upper_index1', same_upper_index1,
|
||||
'codepoints_with_same_upper_index2', same_upper_index2,
|
||||
println)
|
||||
|
||||
data_file.write('const FoldingInfo unicode::js_foldinfo[] = {\n')
|
||||
for d in folding_table:
|
||||
data_file.write(' {')
|
||||
data_file.write(', '.join((str(e) for e in d)))
|
||||
data_file.write('},\n')
|
||||
data_file.write('};\n')
|
||||
data_file.write('\n')
|
||||
|
||||
dump(folding_index1, 'folding_index1', data_file)
|
||||
data_file.write('\n')
|
||||
dump(folding_index2, 'folding_index2', data_file)
|
||||
data_file.write('\n')
|
||||
write_table('FoldingInfo',
|
||||
'js_foldinfo', folding_table,
|
||||
'folding_index1', folding_index1,
|
||||
'folding_index2', folding_index2,
|
||||
println)
|
||||
|
||||
# If the following assert fails, it means space character is added to
|
||||
# non-BMP area. In that case the following code should be uncommented
|
||||
# and the corresponding code should be added to frontend.
|
||||
assert len(non_bmp_space_set.keys()) == 0
|
||||
|
||||
data_file.write("""\
|
||||
bool
|
||||
js::unicode::IsIdentifierStartNonBMP(uint32_t codePoint)
|
||||
{
|
||||
""")
|
||||
write_supplemental_identifier_method('IsIdentifierStartNonBMP', non_bmp_id_start_set,
|
||||
println)
|
||||
|
||||
for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_start_set):
|
||||
data_file.write("""\
|
||||
if (codePoint >= 0x{:x} && codePoint <= 0x{:x})
|
||||
return true;
|
||||
""".format(from_code, to_code))
|
||||
write_supplemental_identifier_method('IsIdentifierPartNonBMP', non_bmp_id_cont_set,
|
||||
println)
|
||||
|
||||
data_file.write("""\
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
js::unicode::IsIdentifierPartNonBMP(uint32_t codePoint)
|
||||
{
|
||||
""")
|
||||
|
||||
for (from_code, to_code) in for_each_non_bmp_group(non_bmp_id_cont_set):
|
||||
data_file.write("""\
|
||||
if (codePoint >= 0x{:x} && codePoint <= 0x{:x})
|
||||
return true;
|
||||
""".format(from_code, to_code))
|
||||
|
||||
data_file.write("""\
|
||||
return false;
|
||||
}
|
||||
""")
|
||||
write_special_casing_methods(unconditional_toupper, codepoint_table, println)
|
||||
|
||||
def getsize(data):
|
||||
""" return smallest possible integer size for the given array """
|
||||
@@ -842,10 +1199,8 @@ def splitbins(t):
|
||||
def make_irregexp_tables(version,
|
||||
table, index,
|
||||
folding_table, folding_index,
|
||||
test_table):
|
||||
codepoint_table):
|
||||
import string
|
||||
from functools import partial
|
||||
from itertools import chain, ifilter, imap
|
||||
|
||||
MAX_ASCII = 0x7F
|
||||
MAX_LATIN1 = 0xFF
|
||||
@@ -894,13 +1249,13 @@ def make_irregexp_tables(version,
|
||||
|
||||
def char_name(code):
|
||||
assert 0 <= code and code <= MAX_BMP
|
||||
if code not in test_table:
|
||||
if code not in codepoint_table:
|
||||
return '<Unused>'
|
||||
if code == LEAD_SURROGATE_MIN:
|
||||
return '<Lead Surrogate Min>'
|
||||
if code == TRAIL_SURROGATE_MAX:
|
||||
return '<Trail Surrogate Max>'
|
||||
(_, _, name, alias) = test_table[code]
|
||||
(_, _, name, alias) = codepoint_table[code]
|
||||
return name if not name.startswith('<') else alias
|
||||
|
||||
def write_character_range(println, name, characters):
|
||||
@@ -1080,7 +1435,8 @@ def update_unicode(args):
|
||||
|
||||
with download_or_open('UnicodeData.txt') as unicode_data, \
|
||||
download_or_open('CaseFolding.txt') as case_folding, \
|
||||
download_or_open('DerivedCoreProperties.txt') as derived_core_properties:
|
||||
download_or_open('DerivedCoreProperties.txt') as derived_core_properties, \
|
||||
download_or_open('SpecialCasing.txt') as special_casing:
|
||||
unicode_version = version_from_file(derived_core_properties, 'DerivedCoreProperties')
|
||||
|
||||
print('Processing...')
|
||||
@@ -1090,13 +1446,16 @@ def update_unicode(args):
|
||||
non_bmp_lower_map, non_bmp_upper_map,
|
||||
non_bmp_space_set,
|
||||
non_bmp_id_start_set, non_bmp_id_cont_set,
|
||||
test_table, test_space_table
|
||||
codepoint_table, test_space_table
|
||||
) = process_unicode_data(unicode_data, derived_core_properties)
|
||||
(
|
||||
folding_table, folding_index,
|
||||
non_bmp_folding_map, non_bmp_rev_folding_map,
|
||||
folding_tests
|
||||
) = process_case_folding(case_folding)
|
||||
(
|
||||
unconditional_tolower, unconditional_toupper
|
||||
) = process_special_casing(special_casing, table, index)
|
||||
|
||||
print('Generating...')
|
||||
make_unicode_file(unicode_version,
|
||||
@@ -1104,19 +1463,23 @@ def update_unicode(args):
|
||||
same_upper_table, same_upper_index,
|
||||
folding_table, folding_index,
|
||||
non_bmp_space_set,
|
||||
non_bmp_id_start_set, non_bmp_id_cont_set)
|
||||
non_bmp_id_start_set, non_bmp_id_cont_set,
|
||||
unconditional_toupper,
|
||||
codepoint_table)
|
||||
make_non_bmp_file(unicode_version,
|
||||
non_bmp_lower_map, non_bmp_upper_map,
|
||||
non_bmp_folding_map, non_bmp_rev_folding_map)
|
||||
non_bmp_folding_map, non_bmp_rev_folding_map,
|
||||
codepoint_table)
|
||||
make_irregexp_tables(unicode_version,
|
||||
table, index,
|
||||
folding_table, folding_index,
|
||||
test_table)
|
||||
codepoint_table)
|
||||
|
||||
make_bmp_mapping_test(unicode_version, test_table)
|
||||
make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map)
|
||||
make_space_test(unicode_version, test_space_table)
|
||||
make_icase_test(unicode_version, folding_tests)
|
||||
make_bmp_mapping_test(unicode_version,
|
||||
codepoint_table, unconditional_tolower, unconditional_toupper)
|
||||
make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table)
|
||||
make_space_test(unicode_version, test_space_table, codepoint_table)
|
||||
make_icase_test(unicode_version, folding_tests, codepoint_table)
|
||||
|
||||
if __name__ == '__main__':
|
||||
import argparse
|
||||
|
||||
Reference in New Issue
Block a user