1
0
mirror of https://github.com/roytam1/UXP.git synced 2026-05-26 13:58:49 +00:00
Files
UXP/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
T
Moonchild e701dad7ef Issue #3011 - Part 2: Switch spellchecker root to Shadow DOM.
Set the root for spelling checker to shadow root if the contenteditable
nodes are in the shadow DOM. Bail if the position can't be set properly.
2026-03-25 07:19:01 +08:00

1104 lines
35 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozInlineSpellWordUtil.h"
#include "nsDebug.h"
#include "nsIAtom.h"
#include "nsComponentManagerUtils.h"
#include "nsIDOMCSSStyleDeclaration.h"
#include "nsIDOMElement.h"
#include "nsIDOMRange.h"
#include "nsIEditor.h"
#include "nsIDOMNode.h"
#include "nsUnicharUtilCIID.h"
#include "nsUnicodeProperties.h"
#include "nsServiceManagerUtils.h"
#include "nsIContent.h"
#include "nsTextFragment.h"
#include "mozilla/dom/Element.h"
#include "nsRange.h"
#include "nsContentUtils.h"
#include "nsIFrame.h"
#include <algorithm>
#include "mozilla/BinarySearch.h"
#include "mozilla/HTMLEditor.h"
#include "mozilla/dom/ShadowRoot.h"
using namespace mozilla;
// IsIgnorableCharacter
//
//    Reports whether |ch| is one of the soft-hyphen code points that should
//    be ignored in input.
inline bool IsIgnorableCharacter(char16_t ch)
{
  switch (ch) {
    case 0x00AD: // SOFT HYPHEN
    case 0x1806: // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}
// IsConditionalPunctuation
//
//    Some characters (like apostrophes) only count as part of a word when
//    they have word characters on each side; otherwise they are punctuation.
//    This reports whether |ch| is one of those characters.
inline bool IsConditionalPunctuation(char16_t ch)
{
  switch (ch) {
    case '\'':   // APOSTROPHE
    case 0x2019: // RIGHT SINGLE QUOTATION MARK
    case 0x00B7: // MIDDLE DOT
      return true;
    default:
      return false;
  }
}
// mozInlineSpellWordUtil::Init
//
//    Resolves the weak editor reference and caches the editor's DOM document,
//    its root node, and whether the editor is an HTML editor.
//
//    Returns the failure code from resolving the weak reference or from the
//    editor getters, NS_ERROR_NULL_POINTER when the editor has no document,
//    and NS_OK otherwise.
nsresult
mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
{
  // Resolving the weak reference commonly fails because the editor was
  // detached, so report that quietly instead of asserting.
  nsresult rv;
  nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
  if (NS_FAILED(rv)) {
    return rv;
  }

  nsCOMPtr<nsIDOMDocument> document;
  rv = editor->GetDocument(getter_AddRefs(document));
  NS_ENSURE_SUCCESS(rv, rv);
  NS_ENSURE_TRUE(document, NS_ERROR_NULL_POINTER);

  mDOMDocument = document;
  mDocument = do_QueryInterface(document);
  // True exactly when the editor can be viewed as an HTMLEditor.
  mIsContentEditableOrDesignMode = !!editor->AsHTMLEditor();

  // Find the root node for the editor. For contenteditable the mRootNode could
  // later change to a shadow root if the begin and end are inside the
  // shadow DOM.
  nsCOMPtr<nsIDOMElement> rootElement;
  rv = editor->GetRootElement(getter_AddRefs(rootElement));
  NS_ENSURE_SUCCESS(rv, rv);

  nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElement);
  mRootNode = rootNode;
  NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
  return NS_OK;
}
// Returns whether aNode reports itself as a text node (nsINode::eTEXT).
// aNode is dereferenced unconditionally, so it must not be null.
static inline bool
IsTextNode(nsINode* aNode)
{
return aNode->IsNodeOfType(nsINode::eTEXT);
}
typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
// Find the next node in the DOM tree in preorder, never stepping outside
// the subtree rooted at aRoot.
//
// aOnLeaveNode (may be null) is called with aClosure for each node the
// traversal climbs out of, i.e. a node that has neither a child nor a next
// sibling to move to, and then each ancestor we ascend through on the way to
// the next sibling. That callback is why we can't just use GetNextNode here,
// sadly.
//
// Returns nullptr when there is no further node inside aRoot.
static nsINode*
FindNextNode(nsINode* aNode, nsINode* aRoot,
             OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)
{
  NS_PRECONDITION(aNode, "Null starting node?");

  // Preorder: children come first.
  if (nsINode* firstChild = aNode->GetFirstChild()) {
    return firstChild;
  }

  // Don't look at siblings or otherwise outside of aRoot
  if (aNode == aRoot) {
    return nullptr;
  }

  if (nsINode* sibling = aNode->GetNextSibling()) {
    return sibling;
  }

  // Go up until some ancestor has a next sibling, reporting every node we
  // leave along the way.
  nsINode* leaving = aNode;
  while (true) {
    if (aOnLeaveNode) {
      aOnLeaveNode(leaving, aClosure);
    }

    nsINode* parent = leaving->GetParent();
    if (!parent || parent == aRoot) {
      return nullptr;
    }

    if (nsINode* parentSibling = parent->GetNextSibling()) {
      return parentSibling;
    }
    leaving = parent;
  }
}
// aNode is not a text node. Find the first text node at or after the
// position aNode/aOffset in a preorder DOM traversal bounded by aRoot.
// Returns nullptr if the traversal runs out of nodes first.
static nsINode*
FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot)
{
  NS_PRECONDITION(aNode, "Null starting node?");
  NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");

  // Start at the aOffset'th child when there is one.
  nsINode* candidate = aNode->GetChildAt(aOffset);
  if (!candidate) {
    // aOffset was beyond the end of the child list, so continue with the
    // node that follows the last descendant of aNode in preorder.
    candidate = aNode->GetNextNonChildNode(aRoot);
  }

  for (; candidate && !IsTextNode(candidate);
       candidate = candidate->GetNextNode(aRoot)) {
    // keep walking until we hit a text node or run out of nodes
  }
  return candidate;
}
// mozInlineSpellWordUtil::SetPositionAndEnd
//
//    We have two ranges "hard" and "soft". The hard boundary is simply
//    the scope of the root node. The soft boundary is that which is set
//    by the caller of this class by calling this function. If this function is
//    not called, the soft boundary is the same as the hard boundary.
//
//    When we reach the soft boundary (mSoftEnd), we keep going until we reach
//    the end of a word. This allows the caller to set the end of the range to
//    anything, and we will always check whole multiples of words. When we
//    reach the hard boundary we stop no matter what.
//
//    There is no beginning soft boundary. This is because we only look at
//    previous nodes once, when BuildSoftText() searches backwards for the
//    preceding word boundary. You might think of the soft boundary as being
//    this initial position.
//
//    Non-text begin/end nodes are replaced by the start of the first text
//    node found after them. On success mNextWordIndex is updated when the
//    begin position maps into the soft text.
//
//    Returns NS_ERROR_FAILURE when, for an HTML editor, the two nodes do not
//    share a subtree root; otherwise the result of EnsureWords().
nsresult
mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
                                          int32_t aPositionOffset,
                                          nsINode* aEndNode,
                                          int32_t aEndOffset)
{
  MOZ_ASSERT(aPositionNode, "Null begin node?");
  NS_PRECONDITION(aEndNode, "Null end node?");
  NS_ASSERTION(mRootNode, "Not initialized");

  // Find an appropriate root if we are dealing with contenteditable nodes
  // which are in the shadow DOM. See UXP Issue #3011
  if (mIsContentEditableOrDesignMode) {
    nsINode* subtreeRoot = aPositionNode->SubtreeRoot();
    if (aEndNode->SubtreeRoot() != subtreeRoot) {
      // Begin and end live in different trees; bail.
      return NS_ERROR_FAILURE;
    }
    // NOTE(review): mRootNode is only ever switched *to* a shadow root here;
    // nothing in this function restores the editor root afterwards. Confirm
    // that is intended.
    if (mozilla::dom::ShadowRoot::FromNode(subtreeRoot)) {
      mRootNode = subtreeRoot;
    }
  }

  InvalidateWords();

  if (!IsTextNode(aPositionNode)) {
    // Start at the start of the first text node after aNode/aOffset.
    aPositionNode = FindNextTextNode(aPositionNode, aPositionOffset, mRootNode);
    aPositionOffset = 0;
  }
  mSoftBegin = NodeOffset(aPositionNode, aPositionOffset);

  if (!IsTextNode(aEndNode)) {
    // End at the start of the first text node after aEndNode/aEndOffset.
    aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
    aEndOffset = 0;
  }
  mSoftEnd = NodeOffset(aEndNode, aEndOffset);

  nsresult rv = EnsureWords();
  if (NS_FAILED(rv)) {
    return rv;
  }

  // Only move the word cursor when the begin position is inside the soft
  // text; otherwise leave mNextWordIndex as it was.
  const int32_t beginTextOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
  if (beginTextOffset >= 0) {
    mNextWordIndex = FindRealWordContaining(beginTextOffset, HINT_END, true);
  }
  return NS_OK;
}
// Makes sure the soft text and the real-word list are built. Does nothing
// when mSoftTextValid is already set. If building the real words fails, the
// partial word list is cleared, the soft text stays marked invalid, and the
// failure code is returned.
nsresult
mozInlineSpellWordUtil::EnsureWords()
{
  if (!mSoftTextValid) {
    BuildSoftText();
    const nsresult rv = BuildRealWords();
    if (NS_FAILED(rv)) {
      mRealWords.Clear();
      return rv;
    }
    mSoftTextValid = true;
  }
  return NS_OK;
}
// Builds a DOM range covering aWord by mapping the word's soft-text start
// (with HINT_BEGIN) and end (with HINT_END) back to DOM positions.
// Returns whatever MakeRange returns.
nsresult
mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)
{
  const NodeOffset wordBegin =
    MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
  const NodeOffset wordEnd =
    MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
  return MakeRange(wordBegin, wordEnd, aRange);
}
// mozInlineSpellWordUtil::GetRangeForWord
//
//    Produces in aRange the range of the word found at aWordNode/aWordOffset.
//    When the position does not map into the soft text, or no word is found
//    there, a collapsed range at that position is produced instead.
nsresult
mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
                                        int32_t aWordOffset,
                                        nsRange** aRange)
{
  nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode);
  NodeOffset point(wordNode, aWordOffset);

  // Rebuild the soft text around the point unless it is already valid and
  // both the soft begin and soft end sit exactly on this point.
  if (!mSoftTextValid || point != mSoftBegin || point != mSoftEnd) {
    InvalidateWords();
    mSoftBegin = mSoftEnd = point;
    nsresult rv = EnsureWords();
    if (NS_FAILED(rv)) {
      return rv;
    }
  }

  const int32_t softOffset = MapDOMPositionToSoftTextOffset(point);
  if (softOffset >= 0) {
    const int32_t wordIndex =
      FindRealWordContaining(softOffset, HINT_BEGIN, false);
    if (wordIndex >= 0) {
      return MakeRangeForWord(mRealWords[wordIndex], aRange);
    }
  }

  // No word here: hand back a collapsed range at the requested point.
  return MakeRange(point, point, aRange);
}
// Copies aLen characters of aInput, starting at aPos, into aOutput while
// fixing up characters that the spellchecker may not like: ignorable
// characters are dropped and the curly apostrophe is replaced by a straight
// one. aOutput is emptied first.
static void
NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput)
{
  aOutput.Truncate();
  for (int32_t i = 0; i < aLen; ++i) {
    const char16_t ch = aInput.CharAt(i + aPos);
    if (IsIgnorableCharacter(ch)) {
      // remove ignorable characters from the word
      continue;
    }
    // the spellchecker doesn't handle curly apostrophes in all languages,
    // so map RIGHT SINGLE QUOTATION MARK to a plain apostrophe
    aOutput.Append(ch == 0x2019 ? char16_t('\'') : ch);
  }
}
// mozInlineSpellWordUtil::GetNextWord
//
//    Hands out the word at mNextWordIndex: its normalized text in aText, its
//    DOM range in aRange, and in aSkipChecking whether the word is flagged
//    as not checkable. The index is then advanced.
//
//    When there are no more words, *aRange is set to null, *aSkipChecking to
//    true, mNextWordIndex to -1, and NS_OK is returned; aText is not touched
//    in that case.
//
//    FIXME-optimization: we shouldn't have to generate a range every single
//    time. It would be better if the inline spellchecker didn't require a
//    range unless the word was misspelled. This may or may not be possible.
nsresult
mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,
                                    bool* aSkipChecking)
{
#ifdef DEBUG_SPELLCHECK
  printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
#endif

  const bool outOfWords =
    mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length());
  if (outOfWords) {
    mNextWordIndex = -1;
    *aRange = nullptr;
    *aSkipChecking = true;
    return NS_OK;
  }

  const RealWord& current = mRealWords[mNextWordIndex];
  nsresult rv = MakeRangeForWord(current, aRange);
  NS_ENSURE_SUCCESS(rv, rv);

  // Only move on once the range has been produced successfully.
  ++mNextWordIndex;
  *aSkipChecking = !current.mCheckableWord;
  ::NormalizeWord(mSoftText, current.mSoftTextOffset, current.mLength, aText);

#ifdef DEBUG_SPELLCHECK
  printf("GetNextWord returning: %s (skip=%d)\n",
         NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
#endif

  return NS_OK;
}
// mozInlineSpellWordUtil::MakeRange
//
//    Convenience function for creating a range from aBegin to aEnd.
//    Fails with an invalid-argument error when aBegin has no node, with
//    NS_ERROR_NOT_INITIALIZED when no document has been cached, and with the
//    error from nsRange::SetStartAndEnd when the boundaries are rejected.
//    On success the new range is handed to *aRange.
nsresult
mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
                                  nsRange** aRange)
{
  NS_ENSURE_ARG_POINTER(aBegin.mNode);
  if (!mDOMDocument) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  RefPtr<nsRange> newRange = new nsRange(aBegin.mNode);
  const nsresult rv = newRange->SetStartAndEnd(aBegin.mNode, aBegin.mOffset,
                                               aEnd.mNode, aEnd.mOffset);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return rv;
  }

  newRange.forget(aRange);
  return NS_OK;
}
/*********** DOM text extraction ************/
// IsDOMWordSeparator
//
//    Determines if the given character should be considered as a DOM Word
//    separator. Basically, this is whitespace, although it could also have
//    certain punctuation that we know ALWAYS breaks words. This is important.
//    For example, we can't have any punctuation that could appear in a URL
//    or email address in this, because those need to always fit into a single
//    DOM word.
static bool
IsDOMWordSeparator(char16_t ch)
{
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex spaces
    case 0x00A0: // NO-BREAK SPACE
    case 0x2002: // EN SPACE
    case 0x2003: // EM SPACE
    case 0x2009: // THIN SPACE
    case 0x3000: // IDEOGRAPHIC SPACE
      return true;
    default:
      // otherwise not a space
      return false;
  }
}
// Returns whether aNode is an HTML <br> element.
// aNode is dereferenced unconditionally, so it must not be null.
static inline bool
IsBRElement(nsINode* aNode)
{
return aNode->IsHTMLElement(nsGkAtoms::br);
}
/**
 * Given a TextNode, checks to see if there's a DOM word separator before
 * aBeforeOffset within it. This function does not modify aSeparatorOffset when
 * it returns false.
 *
 * @param aNode the TextNode to check.
 * @param aBeforeOffset the offset in the TextNode before which we will search
 *        for the DOM separator. You can pass INT32_MAX to search the entire
 *        length of the string.
 * @param aSeparatorOffset receives the offset of the first character of the
 *        last run of consecutive separators that lies before aBeforeOffset.
 *        Will not be written to if no separator is found.
 * @returns True if it found a separator.
 */
static bool
TextNodeContainsDOMWordSeparator(nsINode* aNode,
                                 int32_t aBeforeOffset,
                                 int32_t* aSeparatorOffset)
{
  // aNode is actually an nsIContent, since it's eTEXT
  nsIContent* content = static_cast<nsIContent*>(aNode);
  const nsTextFragment* textFragment = content->GetText();
  NS_ASSERTION(textFragment, "Where is our text?");

  // Scan backwards for the nearest separator before the (clamped) limit.
  int32_t pos =
    std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1;
  while (pos >= 0 && !IsDOMWordSeparator(textFragment->CharAt(pos))) {
    --pos;
  }
  if (pos < 0) {
    return false;
  }

  // Be greedy: back up over every separator directly preceding this one.
  while (pos > 0 && IsDOMWordSeparator(textFragment->CharAt(pos - 1))) {
    --pos;
  }

  *aSeparatorOffset = pos;
  return true;
}
/**
 * Check if there's a DOM word separator before aBeforeOffset in this node.
 * Always returns true if it's a BR element; always returns false for any
 * other node that is not a text node.
 * aSeparatorOffset is set to the index of the first character in the last
 * separator if any is found (0 for BR elements).
 *
 * This function does not modify aSeparatorOffset when it returns false.
 */
static bool
ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
                         int32_t* aSeparatorOffset)
{
  if (IsBRElement(aNode)) {
    *aSeparatorOffset = 0;
    return true;
  }

  return IsTextNode(aNode) &&
         TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,
                                          aSeparatorOffset);
}
// Returns whether aNode is an element that breaks words: an HTML <br>, or
// any element that has a primary frame whose display style is not inline.
// Non-elements and frameless elements are never break elements.
static bool
IsBreakElement(nsINode* aNode)
{
  if (!aNode->IsElement()) {
    return false;
  }

  dom::Element* element = aNode->AsElement();
  if (element->IsHTMLElement(nsGkAtoms::br)) {
    return true;
  }

  // If we don't have a frame, we don't consider ourselves a break
  // element.  In particular, words can span us.
  nsIFrame* frame = element->GetPrimaryFrame();
  if (!frame) {
    return false;
  }

  // Anything that's not an inline element is a break element.
  // XXXbz should replaced inlines be break elements, though?
  return frame->StyleDisplay()->mDisplay != StyleDisplay::Inline;
}
// Closure handed (as void*) to CheckLeavingBreakElement. It accumulates
// whether any node reported to that callback was a break element.
struct CheckLeavingBreakElementClosure {
// Set to true by CheckLeavingBreakElement once a break element has been
// left; the callback never resets it.
bool mLeftBreakElement;
};
// OnLeaveNodeFunPtr callback: aClosure must point at a
// CheckLeavingBreakElementClosure. Sets its mLeftBreakElement flag when
// aNode is a break element; once the flag is set, later nodes are not
// examined at all.
static void
CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
{
  CheckLeavingBreakElementClosure* state =
    static_cast<CheckLeavingBreakElementClosure*>(aClosure);
  if (state->mLeftBreakElement) {
    return;
  }
  if (IsBreakElement(aNode)) {
    state->mLeftBreakElement = true;
  }
}
// Normalizes aWord in place by running the whole string through the
// file-local ::NormalizeWord helper and assigning the result back.
void
mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
{
  nsAutoString normalized;
  ::NormalizeWord(aWord, 0, aWord.Length(), normalized);
  aWord = normalized;
}
// Rebuilds mSoftText and mSoftTextDOMMapping around the soft range
// [mSoftBegin, mSoftEnd].
//
// Phase 1 walks backwards from mSoftBegin to decide where the text should
// start. Phase 2 walks forwards from there, appending the text of every text
// node (and recording a DOMTextMapping for each contributed piece) until the
// soft end has been seen and a stopping point is reached. A single ' ' is
// appended to mSoftText whenever a break element boundary is crossed before
// the soft end has been seen.
void
mozInlineSpellWordUtil::BuildSoftText()
{
// First we have to work backwards from mSoftBegin to find a text node
// containing a DOM word separator, a non-inline-element
// boundary, or the hard start node. That's where we'll start building the
// soft string from.
nsINode* node = mSoftBegin.mNode;
// Offset within |node| at which the forward pass starts copying text.
int32_t firstOffsetInNode = 0;
// Only the starting node is searched up to the soft begin offset; every
// earlier node is searched in full (INT32_MAX, set below).
int32_t checkBeforeOffset = mSoftBegin.mOffset;
while (node) {
if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
if (node == mSoftBegin.mNode) {
// If we find a word separator on the first node, look at the preceding
// word on the text node as well.
int32_t newOffset = 0;
if (firstOffsetInNode > 0) {
// Try to find the previous word boundary in the current node. If
// we can't find one, start checking previous sibling nodes (if any
// adjacent ones exist) to see if we can find any text nodes with
// DOM word separators. We bail out as soon as we see a node that is
// not a text node, or we run out of previous sibling nodes. In the
// event that we simply cannot find any preceding word separator, the
// offset is set to 0, and the soft text beginning node is set to the
// "most previous" text node before the original starting node, or
// kept at the original starting node if no previous text nodes exist.
if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
&newOffset)) {
nsINode* prevNode = node->GetPreviousSibling();
while (prevNode && IsTextNode(prevNode)) {
// NOTE(review): mSoftBegin.mNode is moved to the earlier sibling here,
// but |node| (where the forward pass below starts) is left unchanged
// while firstOffsetInNode takes the offset found in the sibling.
// Confirm this is intended.
mSoftBegin.mNode = prevNode;
if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
&newOffset)) {
break;
}
prevNode = prevNode->GetPreviousSibling();
}
}
}
firstOffsetInNode = newOffset;
mSoftBegin.mOffset = newOffset;
}
break;
}
checkBeforeOffset = INT32_MAX;
if (IsBreakElement(node)) {
// Since GetPreviousContent follows tree *preorder*, we're about to traverse
// up out of 'node'. Since node induces breaks (e.g., it's a block),
// don't bother trying to look outside it, just stop now.
break;
}
// GetPreviousContent below expects mRootNode to be an ancestor of node.
if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
break;
}
node = node->GetPreviousContent(mRootNode);
}
// Now build up the string moving forward through the DOM until we reach
// the soft end and *then* see a DOM word separator, a non-inline-element
// boundary, or the hard end node.
mSoftText.Truncate();
mSoftTextDOMMapping.Clear();
bool seenSoftEnd = false;
// NOTE(review): a comment that used to sit here talked about keeping a
// string outside the loop so its heap allocation could be reused; no such
// variable is declared at this point any more.
while (node) {
if (node == mSoftEnd.mNode) {
seenSoftEnd = true;
}
// Set when this node is the last one that should contribute text.
bool exit = false;
if (IsTextNode(node)) {
nsIContent* content = static_cast<nsIContent*>(node);
NS_ASSERTION(content, "Where is our content?");
const nsTextFragment* textFragment = content->GetText();
NS_ASSERTION(textFragment, "Where is our text?");
// By default the whole rest of the node contributes.
int32_t lastOffsetInNode = textFragment->GetLength();
if (seenSoftEnd) {
// check whether we can stop after this; in the soft end node itself only
// characters at or after the soft end offset are considered
for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
i < int32_t(textFragment->GetLength()); ++i) {
if (IsDOMWordSeparator(textFragment->CharAt(i))) {
exit = true;
// stop at the first separator after the soft end point
lastOffsetInNode = i;
break;
}
}
}
if (firstOffsetInNode < lastOffsetInNode) {
int32_t len = lastOffsetInNode - firstOffsetInNode;
// Record where this piece of mSoftText came from before appending it.
mSoftTextDOMMapping.AppendElement(
DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
mozilla::fallible);
if (!ok) {
// probably out of memory, remove from mSoftTextDOMMapping
mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);
exit = true;
}
}
// Only the first text node starts at a non-zero offset.
firstOffsetInNode = 0;
}
if (exit)
break;
CheckLeavingBreakElementClosure closure = { false };
node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
// We left, or are entering, a break element (e.g., block). Maybe we can
// stop now.
if (seenSoftEnd)
break;
// Record the break
mSoftText.Append(' ');
}
}
#ifdef DEBUG_SPELLCHECK
printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
#endif
}
// Rebuilds mRealWords from mSoftText.
//
// The soft text is cut into "DOM words" at every IsDOMWordSeparator
// character, and each DOM word is passed to SplitDOMWord, which appends the
// real words it contains. The first failure from SplitDOMWord is returned
// as-is (mRealWords may then hold a partial list).
nsresult
mozInlineSpellWordUtil::BuildRealWords()
{
  mRealWords.Clear();

  // Start of the DOM word currently being scanned, or -1 between words.
  int32_t domWordStart = -1;
  const int32_t softTextLength = int32_t(mSoftText.Length());
  for (int32_t pos = 0; pos < softTextLength; ++pos) {
    if (!IsDOMWordSeparator(mSoftText.CharAt(pos))) {
      if (domWordStart < 0) {
        domWordStart = pos;
      }
      continue;
    }

    // A separator: close the DOM word in progress, if any.
    if (domWordStart < 0) {
      continue;
    }
    nsresult rv = SplitDOMWord(domWordStart, pos);
    if (NS_FAILED(rv)) {
      return rv;
    }
    domWordStart = -1;
  }

  // The text may end in the middle of a DOM word.
  if (domWordStart >= 0) {
    nsresult rv = SplitDOMWord(domWordStart, mSoftText.Length());
    if (NS_FAILED(rv)) {
      return rv;
    }
  }

  return NS_OK;
}
/*********** DOM/realwords<->mSoftText mapping functions ************/
// Maps a DOM position to an offset into mSoftText.
//
// Only the first mapping entry for aNodeOffset's node is consulted. Returns
// -1 when the soft text is not valid, when the node contributed no text, or
// when the offset falls outside the part of the node that was contributed.
int32_t
mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
{
  if (!mSoftTextValid) {
    NS_ERROR("Soft text must be valid if we're to map into it");
    return -1;
  }

  const int32_t mappingCount = int32_t(mSoftTextDOMMapping.Length());
  for (int32_t i = 0; i < mappingCount; ++i) {
    const DOMTextMapping& entry = mSoftTextDOMMapping[i];
    if (entry.mNodeOffset.mNode != aNodeOffset.mNode) {
      continue;
    }

    // Allow offsets at either end of the string, in particular, allow the
    // offset that's at the end of the contributed string
    const int32_t offsetInContributedString =
      aNodeOffset.mOffset - entry.mNodeOffset.mOffset;
    const bool withinContribution =
      offsetInContributedString >= 0 &&
      offsetInContributedString <= entry.mLength;
    if (withinContribution) {
      return entry.mSoftTextOffset + offsetInContributedString;
    }
    return -1;
  }

  return -1;
}
namespace {

// Comparator for BinarySearchIf that steers the search to the first element
// whose mSoftTextOffset is larger than the offset given at construction.
template<class T>
class FirstLargerOffset
{
  int32_t mSoftTextOffset;

public:
  explicit FirstLargerOffset(int32_t aSoftTextOffset)
    : mSoftTextOffset(aSoftTextOffset)
  {}

  int operator()(const T& t) const {
    // We want the first larger offset, so never return 0 (which would
    // short-circuit evaluation before finding the last such offset).
    if (mSoftTextOffset < t.mSoftTextOffset) {
      return -1;
    }
    return 1;
  }
};

// Finds in aContainer the last element whose mSoftTextOffset is not greater
// than aSoftTextOffset and stores its index in *aIndex. When every element
// has a greater offset, *aIndex is 0.
//
// Returns false (leaving *aIndex unwritten) only for an empty container.
template<class T>
bool
FindLastNongreaterOffset(const nsTArray<T>& aContainer, int32_t aSoftTextOffset, size_t* aIndex)
{
  if (aContainer.Length() == 0) {
    return false;
  }

  BinarySearchIf(aContainer, 0, aContainer.Length(),
                 FirstLargerOffset<T>(aSoftTextOffset), aIndex);

  if (*aIndex == 0) {
    // Every mapping had offset greater than aSoftTextOffset.
    MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
    return true;
  }

  // There was at least one mapping with offset <= aSoftTextOffset. Step back
  // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
  *aIndex -= 1;
  return true;
}

} // namespace
// Maps an offset into mSoftText back to a DOM position.
//
// With HINT_END, an offset that coincides with the end of the previous
// mapping is reported as the end of that mapping rather than as the start of
// the next one. Returns NodeOffset(nullptr, -1) when the soft text is not
// valid, when there are no mappings, or when the offset is not covered by
// the selected mapping.
mozInlineSpellWordUtil::NodeOffset
mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
                                                       DOMMapHint aHint)
{
  NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
  if (!mSoftTextValid) {
    return NodeOffset(nullptr, -1);
  }

  // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
  size_t index;
  if (!FindLastNongreaterOffset(mSoftTextDOMMapping, aSoftTextOffset, &index)) {
    return NodeOffset(nullptr, -1);
  }

  // If we're doing HINT_END, then we may want to return the end of the
  // previous mapping instead of the start of this mapping
  if (aHint == HINT_END && index > 0) {
    const DOMTextMapping& previous = mSoftTextDOMMapping[index - 1];
    if (previous.mSoftTextOffset + previous.mLength == aSoftTextOffset) {
      return NodeOffset(previous.mNodeOffset.mNode,
                        previous.mNodeOffset.mOffset + previous.mLength);
    }
  }

  // We allow ourselves to return the end of this mapping even if we're
  // doing HINT_BEGIN. This will only happen if there is no mapping which this
  // point is the start of. I'm not 100% sure this is OK...
  const DOMTextMapping& current = mSoftTextDOMMapping[index];
  const int32_t offsetInMapping = aSoftTextOffset - current.mSoftTextOffset;
  if (offsetInMapping < 0 || offsetInMapping > current.mLength) {
    return NodeOffset(nullptr, -1);
  }
  return NodeOffset(current.mNodeOffset.mNode,
                    current.mNodeOffset.mOffset + offsetInMapping);
}
int32_t
mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,
DOMMapHint aHint, bool aSearchForward)
{
NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
if (!mSoftTextValid)
return -1;
// Find the last word, if any, such that mSoftTextOffset <= aSoftTextOffset
size_t index;
bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);
if (!found) {
return -1;
}
// 'index' is now the last word, if any, such that
// mSoftTextOffset <= aSoftTextOffset.
// If we're doing HINT_END, then we may want to return the end of the
// the previous word instead of the start of this word
if (aHint == HINT_END && index > 0) {
const RealWord& word = mRealWords[index - 1];
if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
return index - 1;
}
// We allow ourselves to return the end of this word even if we're
// doing HINT_START. This will only happen if there is no word which this
// point is the start of. I'm not 100% sure this is OK...
const RealWord& word = mRealWords[index];
int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength))
return index;
if (aSearchForward) {
if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
// All words have mSoftTextOffset > aSoftTextOffset
return 0;
}
// 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
// Word index+1, if it exists, will be the first with
// mSoftTextOffset > aSoftTextOffset.
if (index + 1 < mRealWords.Length())
return index + 1;
}
return -1;
}
/*********** Word Splitting ************/
// Classification of a character (or of the current scan position) inside a
// DOM word while it is being split into real words.
enum CharClass {
// The character is part of a word.
CHAR_CLASS_WORD,
// The character separates words. WordSplitState::ClassifyCharacter also
// returns this for the index one past the end of the DOM word.
CHAR_CLASS_SEPARATOR,
// The scan position has reached the end of the DOM word; set by
// WordSplitState::Advance and used as the initial state.
CHAR_CLASS_END_OF_INPUT };
// Encapsulates DOM-word to real-word splitting: a cursor over one DOM word
// (a whitespace-free slice of the soft text) together with the class of the
// character under the cursor.
struct MOZ_STACK_CLASS WordSplitState
{
// The word util that created this state.
mozInlineSpellWordUtil* mWordUtil;
// The DOM word being split (aLen characters of aString starting at aStart).
const nsDependentSubstring mDOMWordText;
// Current cursor position, as an index into mDOMWordText.
int32_t mDOMWordOffset;
// Class of the character at mDOMWordOffset. The constructor leaves this as
// CHAR_CLASS_END_OF_INPUT; callers classify the first character themselves.
CharClass mCurCharClass;
WordSplitState(mozInlineSpellWordUtil* aWordUtil,
const nsString& aString, int32_t aStart, int32_t aLen)
: mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
// Classifies the character at aIndex. aRecurse allows looking at the
// neighbouring characters to resolve conditional punctuation.
CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
// Moves the cursor one character forward and updates mCurCharClass.
void Advance();
// Advances while the current character is a separator.
void AdvanceThroughSeparators();
// Advances while the current character is a word character.
void AdvanceThroughWord();
// Returns true if a special word like an email address or URL starts at the
// current position and runs to the end of the DOM word. This allows
// arbitrary word breaking rules to be used for these special entities, as
// long as they can not contain whitespace.
bool IsSpecialWord();
// Similar to IsSpecialWord except that this takes a split word as
// input. This checks for things that do not require special word-breaking
// rules.
bool ShouldSkipWord(int32_t aStart, int32_t aLength);
};
// WordSplitState::ClassifyCharacter
//
// Classifies the character at aIndex of the DOM word as CHAR_CLASS_WORD or
// CHAR_CLASS_SEPARATOR (this function never returns
// CHAR_CLASS_END_OF_INPUT). aIndex may equal the length of the DOM word, in
// which case the result is CHAR_CLASS_SEPARATOR.
//
// aRecurse controls whether conditional punctuation may inspect its
// neighbours; every nested call passes false, so the look-around is at most
// one level deep.
CharClass
WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
{
NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
"Index out of range");
if (aIndex == int32_t(mDOMWordText.Length()))
return CHAR_CLASS_SEPARATOR;
// this will classify the character, we want to treat "ignorable" characters
// such as soft hyphens, and also ZWJ and ZWNJ as word characters.
nsIUGenCategory::nsUGenCategory
charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
if (charCategory == nsIUGenCategory::kLetter ||
IsIgnorableCharacter(mDOMWordText[aIndex]) ||
mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
mDOMWordText[aIndex] == 0x200D /* ZWJ */)
return CHAR_CLASS_WORD;
// If conditional punctuation is surrounded immediately on both sides by word
// characters it also counts as a word character.
if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
if (!aRecurse) {
// not allowed to look around, this punctuation counts like a separator
return CHAR_CLASS_SEPARATOR;
}
// check the left-hand character
if (aIndex == 0)
return CHAR_CLASS_SEPARATOR;
if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
return CHAR_CLASS_SEPARATOR;
// If the previous character is a word-char, make sure that it's not a
// special dot character.
if (mDOMWordText[aIndex - 1] == '.')
return CHAR_CLASS_SEPARATOR;
// now we know left char is a word-char, check the right-hand character
if (aIndex == int32_t(mDOMWordText.Length()) - 1)
return CHAR_CLASS_SEPARATOR;
if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
return CHAR_CLASS_SEPARATOR;
// If the next character is a word-char, make sure that it's not a
// special dot character.
if (mDOMWordText[aIndex + 1] == '.')
return CHAR_CLASS_SEPARATOR;
// char on either side is a word, this counts as a word
return CHAR_CLASS_WORD;
}
// The dot character, if appearing at the end of a word, should
// be considered part of that word. Example: "etc.", or
// abbreviations
// NOTE(review): as written, this accepts a '.' only when the preceding
// character is NOT classified as a word character (and is not itself a
// '.'), which reads as the opposite of the "etc." example above. Confirm
// the intended condition before relying on or changing it.
if (aIndex > 0 &&
mDOMWordText[aIndex] == '.' &&
mDOMWordText[aIndex - 1] != '.' &&
ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
return CHAR_CLASS_WORD;
}
// all other punctuation
if (charCategory == nsIUGenCategory::kSeparator ||
charCategory == nsIUGenCategory::kOther ||
charCategory == nsIUGenCategory::kPunctuation ||
charCategory == nsIUGenCategory::kSymbol) {
// Don't break on hyphens, as hunspell handles them on its own.
if (aIndex > 0 &&
mDOMWordText[aIndex] == '-' &&
mDOMWordText[aIndex - 1] != '-' &&
ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
// A hyphen only stays inside a word when both the previous and the next
// characters are word characters; a trailing hyphen is a separator.
if (aIndex == int32_t(mDOMWordText.Length()) - 1)
return CHAR_CLASS_SEPARATOR;
if (mDOMWordText[aIndex + 1] != '.' &&
ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
return CHAR_CLASS_WORD;
}
return CHAR_CLASS_SEPARATOR;
}
// any other character counts as a word
return CHAR_CLASS_WORD;
}
// WordSplitState::Advance
//
//    Moves the cursor to the next character of the DOM word and refreshes
//    mCurCharClass: CHAR_CLASS_END_OF_INPUT once the cursor has reached the
//    end, otherwise the class of the character now under the cursor.
//    Must not be called when the cursor is already at the end.
void
WordSplitState::Advance()
{
  NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
  NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
               "Length beyond end");

  ++mDOMWordOffset;
  const bool reachedEnd =
    mDOMWordOffset >= static_cast<int32_t>(mDOMWordText.Length());
  mCurCharClass = reachedEnd ? CHAR_CLASS_END_OF_INPUT
                             : ClassifyCharacter(mDOMWordOffset, true);
}
// WordSplitState::AdvanceThroughSeparators
//
//    Skips forward until the current character is no longer a separator,
//    i.e. until a word character or the end of input is reached.
void
WordSplitState::AdvanceThroughSeparators()
{
  while (mCurCharClass == CHAR_CLASS_SEPARATOR) {
    Advance();
  }
}
// WordSplitState::AdvanceThroughWord
//
//    Skips forward until the current character is no longer a word
//    character, i.e. until a separator or the end of input is reached.
void
WordSplitState::AdvanceThroughWord()
{
  while (mCurCharClass == CHAR_CLASS_WORD) {
    Advance();
  }
}
// WordSplitState::IsSpecialWord
//
//    Reports whether the text from the current position to the end of the
//    DOM word looks like an email address or a URL:
//      * an '@' with unambiguous word characters on both sides,
//      * a first ':' immediately followed by '/', or
//      * a first ':' preceded by one of a few well-known protocol names.
bool
WordSplitState::IsSpecialWord()
{
  const int32_t textLength = static_cast<int32_t>(mDOMWordText.Length());

  // Search for email addresses. We simply define these as any sequence of
  // characters with an '@' character in the middle. The DOM word is already
  // split on whitespace, so we know that everything to the end is the address
  int32_t firstColon = -1;
  for (int32_t i = mDOMWordOffset; i < textLength; ++i) {
    const char16_t ch = mDOMWordText[i];
    if (ch == '@') {
      // Only accept this if there are unambiguous word characters (don't
      // bother recursing to disambiguate apostrophes) on each side. This
      // prevents classifying, e.g. "@home" as an email address.
      //
      // Requiring the '@' to be in the middle works, but the inline
      // spellchecker doesn't like it: type "fhsgfh@" and you get a
      // misspelled word followed by a symbol; type another letter
      // ("fhsgfh@g") and that first word needs to be unmarked, but the
      // inline spellchecker only checks the current position for potentially
      // removing a spelling range.
      if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
          i < textLength - 1 &&
          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
        return true;
      }
    } else if (ch == ':' && firstColon < 0) {
      firstColon = i;

      // If the first colon is followed by a slash, consider it a URL
      // This will catch things like asdf://foo.com
      if (firstColon < textLength - 1 &&
          mDOMWordText[firstColon + 1] == '/') {
        return true;
      }
    }
  }

  // Check the text before the first colon against some known protocols. It
  // is impossible to check against all protocols, especially since you can
  // plug in new protocols. We also don't want to waste time here checking
  // against a lot of obscure protocols.
  if (firstColon > mDOMWordOffset) {
    static const char* const kKnownProtocols[] = {
      "http", "https", "news", "file", "javascript", "data", "ftp"
    };
    nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
                                firstColon - mDOMWordOffset));
    for (const char* known : kKnownProtocols) {
      if (protocol.EqualsIgnoreCase(known)) {
        return true;
      }
    }
  }

  // not anything special
  return false;
}
// WordSplitState::ShouldSkipWord
//
//    Reports whether the split word occupying aLength characters of the DOM
//    word starting at aStart should be skipped by the spellchecker. Currently
//    that is the case when it contains any character of the Unicode "number"
//    general category.
bool
WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
{
  const int32_t wordEnd = aStart + aLength;
  for (int32_t pos = aStart; pos < wordEnd; ++pos) {
    // a single digit anywhere in the word is enough
    const bool isNumber =
      unicode::GetGenCategory(mDOMWordText[pos]) == nsIUGenCategory::kNumber;
    if (isNumber) {
      return true;
    }
  }

  // not special
  return false;
}
// mozInlineSpellWordUtil::SplitDOMWord
//
//    Splits the DOM word mSoftText[aStart, aEnd) into real words and appends
//    them to mRealWords.
//
//    If, after skipping leading separators, the rest of the DOM word is a
//    special word (email address, URL), it is appended as a single real word
//    flagged as not checkable. Otherwise each maximal run of word characters
//    becomes a real word, flagged as checkable unless ShouldSkipWord says so.
//
//    Returns NS_ERROR_OUT_OF_MEMORY when appending fails, NS_OK otherwise.
nsresult
mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)
{
  WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
  state.mCurCharClass = state.ClassifyCharacter(0, true);

  state.AdvanceThroughSeparators();
  if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT &&
      state.IsSpecialWord()) {
    // Everything from here to the end of the DOM word is one special word.
    int32_t specialWordLength =
      state.mDOMWordText.Length() - state.mDOMWordOffset;
    if (!mRealWords.AppendElement(
          RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
          fallible)) {
      return NS_ERROR_OUT_OF_MEMORY;
    }
    return NS_OK;
  }

  while (true) {
    state.AdvanceThroughSeparators();
    if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) {
      break;
    }

    // save the beginning of the word
    const int32_t wordOffset = state.mDOMWordOffset;

    // find the end of the word
    state.AdvanceThroughWord();
    const int32_t wordLen = state.mDOMWordOffset - wordOffset;

    const bool checkable = !state.ShouldSkipWord(wordOffset, wordLen);
    if (!mRealWords.AppendElement(
          RealWord(aStart + wordOffset, wordLen, checkable), fallible)) {
      return NS_ERROR_OUT_OF_MEMORY;
    }
  }

  return NS_OK;
}