Fix an issue with the html5 tokenizer and tree builder (java htmlparser)

2026-05-26 22:48:47 +00:00 · 2020-01-15 22:07:59 -05:00
parent 7938a0ac1b
commit 3338198f76
2 changed files with 35 additions and 18 deletions
@@ -680,6 +680,22 @@ public class Tokenizer implements Locator {
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
+     */
+    public void setState(int specialTokenizerState) {
+        this.stateSave = specialTokenizerState; // requested state is recorded in stateSave
+        this.endTagExpectation = null; // this overload carries no expected end tag name
+        this.endTagExpectationAsArray = null; // array form is cleared together with the name
+    }
+
+    // [NOCPP[
+
+    /**
+     * Sets the tokenizer state and the associated element name. This should
+     * only ever be used to put the tokenizer into one of the states that have
+     * a special end tag expectation. For use from the tokenizer test harness.
+     *
+     * @param specialTokenizerState
+     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
@@ -695,6 +711,8 @@ public class Tokenizer implements Locator {
        endTagExpectationToArray();
    }

+    // ]NOCPP]
+
    /**
     * Sets the tokenizer state and the associated element name. This should
     * only ever used to put the tokenizer into one of the states that have
@@ -3749,11 +3767,17 @@ public class Tokenizer implements Locator {
                        c = checkChar(buf, pos);
                        /*
                         * ASSERT! when entering this state, set index to 0 and
-                         * call clearStrBufBeforeUse() assert (contentModelElement !=
-                         * null); Let's implement the above without lookahead.
-                         * strBuf is the 'temporary buffer'.
+                         * call clearStrBufBeforeUse(); Let's implement the above
+                         * without lookahead. strBuf is the 'temporary buffer'.
                         */
-                        if (index < endTagExpectationAsArray.length) {
+                        if (endTagExpectationAsArray == null) {
+                            tokenHandler.characters(Tokenizer.LT_SOLIDUS,
+                                    0, 2);
+                            cstart = pos;
+                            reconsume = true;
+                            state = transition(state, returnState, reconsume, pos);
+                            continue stateloop;
+                        } else if (index < endTagExpectationAsArray.length) {
                            char e = endTagExpectationAsArray[index];
                            char folded = c;
                            if (c >= 'A' && c <= 'Z') {
@@ -640,8 +640,7 @@ public abstract class TreeBuilder<T> implements TokenHandler,
                );
                currentPtr++;
                stack[currentPtr] = node;
-                tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
-                        contextName);
+                tokenizer.setState(Tokenizer.DATA);
                // The frameset-ok flag is set even though <frameset> never
                // ends up being allowed as HTML frameset in the fragment case.
                mode = FRAMESET_OK;
@@ -671,8 +670,7 @@ public abstract class TreeBuilder<T> implements TokenHandler,
                );
                currentPtr++;
                stack[currentPtr] = node;
-                tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
-                        contextName);
+                tokenizer.setState(Tokenizer.DATA);
                // The frameset-ok flag is set even though <frameset> never
                // ends up being allowed as HTML frameset in the fragment case.
                mode = FRAMESET_OK;
@@ -691,23 +689,18 @@ public abstract class TreeBuilder<T> implements TokenHandler,
                resetTheInsertionMode();
                formPointer = getFormPointerForContext(contextNode);
                if ("title" == contextName || "textarea" == contextName) {
-                    tokenizer.setStateAndEndTagExpectation(Tokenizer.RCDATA,
-                            contextName);
+                    tokenizer.setState(Tokenizer.RCDATA);
                } else if ("style" == contextName || "xmp" == contextName
                        || "iframe" == contextName || "noembed" == contextName
                        || "noframes" == contextName
                        || (scriptingEnabled && "noscript" == contextName)) {
-                    tokenizer.setStateAndEndTagExpectation(Tokenizer.RAWTEXT,
-                            contextName);
+                    tokenizer.setState(Tokenizer.RAWTEXT);
                } else if ("plaintext" == contextName) {
-                    tokenizer.setStateAndEndTagExpectation(Tokenizer.PLAINTEXT,
-                            contextName);
+                    tokenizer.setState(Tokenizer.PLAINTEXT);
                } else if ("script" == contextName) {
-                    tokenizer.setStateAndEndTagExpectation(
-                            Tokenizer.SCRIPT_DATA, contextName);
+                    tokenizer.setState(Tokenizer.SCRIPT_DATA);
                } else {
-                    tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
-                            contextName);
+                    tokenizer.setState(Tokenizer.DATA);
                }
            }
            contextName = null;