From e1d66093d5b6141ce28d74c9efe36fb6990767a2 Mon Sep 17 00:00:00 2001
From: Alec Murphy
Date: Sun, 15 May 2022 13:18:43 -0400
Subject: [PATCH] Add named character entity &copy;

Add states HTML_STATE_AFTER_DOCTYPE_NAME, HTML_STATE_BOGUS_DOCTYPE, HTML_STATE_SELF_CLOSING_START_TAG
---
 Src/HTMLTokenizer.HC | 72 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/Src/HTMLTokenizer.HC b/Src/HTMLTokenizer.HC
index 9a0cb64..c564bae 100644
--- a/Src/HTMLTokenizer.HC
+++ b/Src/HTMLTokenizer.HC
@@ -153,6 +153,11 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
     @recalculate_temp_buffer_size(t);
     return;
   }
+  if (!StrICmp(t->tempBuffer.data, "&copy;")) {
+    StrCpy(t->tempBuffer.data, "(c)");
+    @recalculate_temp_buffer_size(t);
+    return;
+  }
   if (!StrICmp(t->tempBuffer.data, "&nbsp;")) {
     StrCpy(t->tempBuffer.data, " ");
     @recalculate_temp_buffer_size(t);
@@ -499,6 +504,46 @@ U0 @tokenizer_html_state_doctype_name(Tokenizer *t) {
   }
 }
 
+U0 @tokenizer_html_state_after_doctype_name(Tokenizer *t) {
+  @consume_next_input_char(t);
+  switch (t->currentInputChar) {
+  case '\n':
+  case '\r':
+  case '\t':
+  case ' ':
+    // Ignore the character.
+    break;
+  case 'A' ... 'Z':
+    // NOTE(review): the spec's "after DOCTYPE name" state has no uppercase
+    // rule. This case only switches back to the DOCTYPE name state; it does
+    // not create a token or lowercase the character. Confirm this is intended.
+    t->state = HTML_STATE_DOCTYPE_NAME;
+    break;
+  case '>':
+    // Switch to the data state. Emit the current DOCTYPE token.
+    t->state = HTML_STATE_DATA;
+    break;
+  default:
+    // Reconsume in the bogus DOCTYPE state.
+    t->inputBuffer.pos--;
+    t->state = HTML_STATE_BOGUS_DOCTYPE;
+    break;
+  }
+}
+
+U0 @tokenizer_html_state_bogus_doctype(Tokenizer *t) {
+  @consume_next_input_char(t);
+  switch (t->currentInputChar) {
+  case '>':
+    // Switch to the data state. Emit the DOCTYPE token.
+    t->state = HTML_STATE_DATA;
+    break;
+  default:
+    // Ignore the character.
+    break;
+  }
+}
+
 U0 @tokenizer_html_state_tag_name(Tokenizer *t) {
   @consume_next_input_char(t);
   switch (t->currentInputChar) {
@@ -775,6 +820,24 @@ U0 @tokenizer_html_state_after_attribute_name(Tokenizer *t) {
   }
 }
 
+U0 @tokenizer_html_state_self_closing_start_tag(Tokenizer *t) {
+  @consume_next_input_char(t);
+  switch (t->currentInputChar) {
+  case '>':
+    // Set the self-closing flag of the current tag token. Switch to the data
+    // state. Emit the current tag token.
+    @emit_current_node(t);
+    t->state = HTML_STATE_DATA;
+    break;
+  default:
+    // This is an unexpected-solidus-in-tag parse error. Reconsume in the before
+    // attribute name state.
+    t->inputBuffer.pos--;
+    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+    break;
+  }
+}
+
 U0 @tokenizer_html_state_character_reference(Tokenizer *t) {
   // Set the temporary buffer to the empty string.
   @empty_temp_buffer(t);
@@ -929,6 +992,15 @@ Node *@html_tokenize_and_create_node_tree(U8 *buffer, I64 size,
     case HTML_STATE_NUMERIC_CHARACTER_REFERENCE:
       @tokenizer_html_state_numeric_character_reference(&t);
       break;
+    case HTML_STATE_AFTER_DOCTYPE_NAME:
+      @tokenizer_html_state_after_doctype_name(&t);
+      break;
+    case HTML_STATE_BOGUS_DOCTYPE:
+      @tokenizer_html_state_bogus_doctype(&t);
+      break;
+    case HTML_STATE_SELF_CLOSING_START_TAG:
+      @tokenizer_html_state_self_closing_start_tag(&t);
+      break;
     case HTML_STATE_INVALID:
     default:
       @debug("\n$FG,0$HTML Tokenization error: Invalid or unimplemented "