// HTML tokenizer and node-tree builder.
//
// A state machine modelled on the WHATWG HTML tokenization states walks the
// input one character at a time and attaches element / text nodes to a tree
// rooted at a synthetic "Document" node. Only a subset of the states is
// implemented; DOCTYPE and comment tokens are recognised and discarded.

#define GROWABLE_STRING_INCREMENT_SIZE 16

// Size in bytes of Tokenizer.tempBuffer.data, including the terminating NUL.
#define HTML_TEMP_BUFFER_SIZE 512

// Size in bytes of Node.tagName, including the terminating NUL.
#define HTML_TAG_NAME_SIZE 32

// Bytes stored in text nodes in place of a literal '&' and '<'. Anything
// flushed through the temporary buffer is fed back through the data state, so
// the raw characters would be re-interpreted as markup.
// NOTE(review): consumers of Node.text presumably map these back - confirm.
#define HTML_CHAR_AMPERSAND 0x11
#define HTML_CHAR_LESS_THAN 0x12

#define HTML_STATE_INVALID 0
#define HTML_STATE_DATA 1
#define HTML_STATE_RCDATA 2
#define HTML_STATE_RAWTEXT 3
#define HTML_STATE_SCRIPT_DATA 4
#define HTML_STATE_PLAINTEXT 5
#define HTML_STATE_TAG_OPEN 6
#define HTML_STATE_END_TAG_OPEN 7
#define HTML_STATE_TAG_NAME 8
#define HTML_STATE_RCDATA_LESS_THAN_SIGN 9
#define HTML_STATE_RCDATA_END_TAG_OPEN 10
#define HTML_STATE_RCDATA_END_TAG_NAME 11
#define HTML_STATE_RAWTEXT_LESS_THAN_SIGN 12
#define HTML_STATE_RAWTEXT_END_TAG_OPEN 13
#define HTML_STATE_RAWTEXT_END_TAG_NAME 14
#define HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN 15
#define HTML_STATE_SCRIPT_DATA_END_TAG_OPEN 16
#define HTML_STATE_SCRIPT_DATA_END_TAG_NAME 17
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START 18
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH 19
#define HTML_STATE_SCRIPT_DATA_ESCAPED 20
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH 21
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH 22
#define HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN 23
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN 24
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME 25
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START 26
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED 27
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH 28
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH 29
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN 30
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END 31
#define HTML_STATE_BEFORE_ATTRIBUTE_NAME 32
#define HTML_STATE_ATTRIBUTE_NAME 33
#define HTML_STATE_AFTER_ATTRIBUTE_NAME 34
#define HTML_STATE_BEFORE_ATTRIBUTE_VALUE 35
#define HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED 36
#define HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED 37
#define HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED 38
#define HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED 39
#define HTML_STATE_SELF_CLOSING_START_TAG 40
#define HTML_STATE_BOGUS_COMMENT 41
#define HTML_STATE_MARKUP_DECLARATION_OPEN 42
#define HTML_STATE_COMMENT_START 43
#define HTML_STATE_COMMENT_START_DASH 44
#define HTML_STATE_COMMENT 45
#define HTML_STATE_COMMENT_LESS_THAN_SIGN 46
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG 47
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH 48
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH 49
#define HTML_STATE_COMMENT_END_DASH 50
#define HTML_STATE_COMMENT_END 51
#define HTML_STATE_COMMENT_END_BANG 52
#define HTML_STATE_DOCTYPE 53
#define HTML_STATE_BEFORE_DOCTYPE_NAME 54
#define HTML_STATE_DOCTYPE_NAME 55
#define HTML_STATE_AFTER_DOCTYPE_NAME 56
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD 57
#define HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER 58
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED 59
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED 60
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER 61
#define HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS 62
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD 63
#define HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER 64
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED 65
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED 66
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER 67
#define HTML_STATE_BOGUS_DOCTYPE 68
#define HTML_STATE_CDATA_SECTION 69
#define HTML_STATE_CDATA_SECTION_BRACKET 70
#define HTML_STATE_CDATA_SECTION_END 71
#define HTML_STATE_CHARACTER_REFERENCE 72
#define HTML_STATE_NAMED_CHARACTER_REFERENCE 73
#define HTML_STATE_AMBIGUOUS_AMPERSAND 74
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE 75
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START 76
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START 77
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE 78
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE 79
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END 80

// One node of the parsed tree: an element, a text run ("InternalTextNode"),
// the synthetic root ("Document"), or a transient end-tag token whose tagName
// starts with '/'.
class Node : JsonElement {
  Node *parentNode;               // node this one was appended to (NULL on root)
  U8 tagName[HTML_TAG_NAME_SIZE]; // NUL-terminated tag name
  JsonObject *attributes;         // attribute name -> string value
  JsonArray *children;            // child nodes, in document order
  U8 *text;                       // growable string; used by text nodes
};

// A byte buffer with a read cursor.
class InputBuffer {
  U8 *data;
  I64 size;
  I64 pos;
};

// Complete tokenizer / tree-builder state.
class Tokenizer {
  InputBuffer inputBuffer;    // document being tokenized (not owned)
  I64 state;                  // current HTML_STATE_*
  I64 returnState;            // state to resume after a character reference
  U8 currentInputChar;        // most recently consumed character
  JsonKey *currentAttribute;  // attribute being built, NULL when none pending
  Node *appendNode;           // element that receives newly emitted nodes
  Node *currentNode;          // node / tag token being built
  Node *originNode;           // the "Document" root
  I64 nodeTreeDepth;          // indentation counter used by @dump_node
  I64 dataStateCounter;       // chars emitted into the current text node
  InputBuffer tempBuffer;     // character-reference scratch buffer
  Bool consumeTempBuffer;     // TRUE while tempBuffer is being replayed
  I64 numOfImgNodes;          // how many times a tag name spelled "img"
};

// Allocates an empty, zero-filled growable string.
U8 *@init_growable_string() {
  return CAlloc(GROWABLE_STRING_INCREMENT_SIZE);
}

// Appends `char` to the growable string `s` and returns the string to use from
// now on. When the rounded size (per roundUp, defined elsewhere) increases, a
// larger zero-filled buffer is allocated, the old contents are copied and `s`
// is freed, so callers must always keep the returned pointer.
U8 *@append_char_to_growable_string(U8 *s, I64 char) {
  // The length is needed several times; measure it once.
  I64 len = StrLen(s);
  I64 oldBufSize = roundUp(len, GROWABLE_STRING_INCREMENT_SIZE - 1);
  I64 newBufSize = roundUp(len + 1, GROWABLE_STRING_INCREMENT_SIZE - 1);
  U8 *newBuf;
  if (newBufSize <= oldBufSize) {
    // Still fits; the buffer is zero-filled so the terminator is in place.
    s[len] = char;
    return s;
  }
  newBuf = CAlloc(newBufSize * 2);
  StrCpy(newBuf, s);
  newBuf[len] = char;
  Free(s);
  return newBuf;
}

// Clears the temporary buffer and rewinds its cursor.
U0 @empty_temp_buffer(Tokenizer *t) {
  MemSet(t->tempBuffer.data, NULL, HTML_TEMP_BUFFER_SIZE);
  t->tempBuffer.size = 0;
  t->tempBuffer.pos = 0;
}

// Re-derives the temporary buffer size from its NUL-terminated contents and
// rewinds its cursor, ready for the contents to be replayed.
U0 @recalculate_temp_buffer_size(Tokenizer *t) {
  t->tempBuffer.size = StrLen(t->tempBuffer.data);
  t->tempBuffer.pos = 0;
}

// Replaces the temporary buffer contents with `s` (which must fit).
U0 @set_temp_buffer_to_string(Tokenizer *t, U8 *s) {
  StrCpy(t->tempBuffer.data, s);
  @recalculate_temp_buffer_size(t);
}

// Replaces a named character reference held in the temporary buffer (in the
// form "&name;", compared case-insensitively) with its replacement text.
// Unrecognised names become " ? ".
U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
  U8 *ref = t->tempBuffer.data;
  if (!StrICmp(ref, "&amp;")) {
    @set_temp_buffer_to_string(t, "\x11");
    return;
  }
  if (!StrICmp(ref, "&aring;")) {
    @set_temp_buffer_to_string(t, "\xc5");
    return;
  }
  if (!StrICmp(ref, "&bull;")) {
    StrPrint(t->tempBuffer.data, "%c", 254);
    @recalculate_temp_buffer_size(t);
    return;
  }
  if (!StrICmp(ref, "&copy;")) {
    @set_temp_buffer_to_string(t, "(c)");
    return;
  }
  if (!StrICmp(ref, "&ensp;") || !StrICmp(ref, "&emsp;")) {
    @set_temp_buffer_to_string(t, " ");
    return;
  }
  if (!StrICmp(ref, "&mdash;")) {
    @set_temp_buffer_to_string(t, "-");
    return;
  }
  if (!StrICmp(ref, "&nbsp;")) {
    @set_temp_buffer_to_string(t, " ");
    return;
  }
  if (!StrICmp(ref, "&lt;")) {
    @set_temp_buffer_to_string(t, "\x12");
    return;
  }
  if (!StrICmp(ref, "&gt;")) {
    @set_temp_buffer_to_string(t, ">");
    return;
  }
  if (!StrICmp(ref, "&quot;")) {
    @set_temp_buffer_to_string(t, "\"");
    return;
  }
  if (!StrICmp(ref, "&zerowidthspace;")) {
    @set_temp_buffer_to_string(t, "");
    return;
  }
  // Unimplemented named character reference.
  @set_temp_buffer_to_string(t, " ? ");
}

// Lookup table: ASCII hex digit -> value. Every other byte maps to 0.
I64 @hex_table_i;
I64 @hex_table[256];
MemSet(&@hex_table, NULL, sizeof(I64) * 256);
for (@hex_table_i = '0'; @hex_table_i < ':'; @hex_table_i++) {
  @hex_table[@hex_table_i] = @hex_table_i - '0';
}
for (@hex_table_i = 'A'; @hex_table_i < 'G'; @hex_table_i++) {
  @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'A');
}
for (@hex_table_i = 'a'; @hex_table_i < 'g'; @hex_table_i++) {
  @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'a');
}

// Replaces a decimal character reference held in the temporary buffer (in the
// form "&#NNN;") with its replacement text. Unsupported code points become
// " ? ".
U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
  I64 charCode;
  t->tempBuffer.data[StrLen(t->tempBuffer.data) - 1] = NULL; // chop off semicolon
  charCode = Str2I64(t->tempBuffer.data + 2);                // skip "&#"
  // '&' and '<' must never be replayed literally: the data state would treat
  // them as the start of a reference / tag. Use the same placeholders as
  // "&amp;" and "&lt;".
  if (charCode == '&') {
    @set_temp_buffer_to_string(t, "\x11");
    return;
  }
  if (charCode == '<') {
    @set_temp_buffer_to_string(t, "\x12");
    return;
  }
  switch (charCode) {
  case 32...127:
    StrPrint(t->tempBuffer.data, "%c", charCode);
    @recalculate_temp_buffer_size(t);
    break;
  case 956:
    StrPrint(t->tempBuffer.data, "%c", 230);
    @recalculate_temp_buffer_size(t);
    break;
  case 8217:
    @set_temp_buffer_to_string(t, "'");
    break;
  case 8230:
    @set_temp_buffer_to_string(t, "...");
    break;
  case 9660:
    @set_temp_buffer_to_string(t, "");
    break;
  case 10006:
    @set_temp_buffer_to_string(t, "x");
    break;
  default:
    // Unimplemented decimal character reference.
    @set_temp_buffer_to_string(t, " ? ");
    break;
  }
}

// Converts a hexadecimal character reference held in the temporary buffer (in
// the form "&#xHHH;") to its decimal form and resolves that.
U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
  I64 dec_char = 0;
  U8 *ch;
  t->tempBuffer.data[StrLen(t->tempBuffer.data) - 1] = NULL; // chop off semicolon
  ch = t->tempBuffer.data + 3;                               // skip "&#x"
  // Stop if the accumulator goes negative (overflow).
  while (*ch && dec_char >= 0) {
    dec_char = (dec_char << 4) | @hex_table[*ch++];
  }
  StrPrint(t->tempBuffer.data, "&#%d;", dec_char);
  @recalculate_temp_buffer_size(t);
  @replace_temp_buffer_with_dec_character_reference(t);
}

// Resolves the numeric character reference held in the temporary buffer,
// choosing hexadecimal when the third character is 'x' or 'X'.
U0 @replace_temp_buffer_with_numeric_character_reference(Tokenizer *t) {
  switch (t->tempBuffer.data[2]) {
  case 'x':
  case 'X':
    @replace_temp_buffer_with_hex_character_reference(t);
    break;
  default:
    @replace_temp_buffer_with_dec_character_reference(t);
    break;
  }
}

// Appends `char` to the temporary buffer. Characters that would not leave
// room for the terminating NUL are dropped instead of overrunning the buffer.
U0 @append_char_to_temp_buffer(Tokenizer *t, I64 char) {
  I64 len = StrLen(t->tempBuffer.data);
  if (len >= HTML_TEMP_BUFFER_SIZE - 1)
    return;
  t->tempBuffer.data[len] = char;
  t->tempBuffer.size++;
}

// Allocates a node named `tagName` with empty attributes, children and text.
// `tagName` must fit in Node.tagName.
Node *@create_new_node(U8 *tagName) {
  Node *node = CAlloc(sizeof(Node));
  StrCpy(node->tagName, tagName);
  node->attributes = Json.CreateObject();
  node->children = Json.CreateArray();
  node->text = @init_growable_string;
  node->type = JSON_HTML;
  return node;
}

// Prepares `t` to tokenize `size` bytes at `data`. Allocates the temporary
// buffer and the "Document" root node. Every field is initialised because the
// caller's Tokenizer lives on the stack.
U0 @init_tokenizer(Tokenizer *t, U8 *data, I64 size) {
  t->inputBuffer.data = data;
  t->inputBuffer.size = size;
  t->inputBuffer.pos = 0;
  t->state = HTML_STATE_DATA;
  t->returnState = HTML_STATE_DATA;
  t->currentInputChar = 0;
  t->currentAttribute = NULL;
  t->tempBuffer.data = CAlloc(HTML_TEMP_BUFFER_SIZE);
  t->tempBuffer.size = 0; // the temporary buffer starts out empty
  t->tempBuffer.pos = 0;
  t->originNode = @create_new_node("Document");
  t->appendNode = t->originNode;
  t->currentNode = t->originNode;
  t->consumeTempBuffer = FALSE;
  t->nodeTreeDepth = 0;
  t->dataStateCounter = 0;
  t->numOfImgNodes = 0;
}

// Loads the next character into t->currentInputChar: from the temporary
// buffer while it is being replayed and has characters left, otherwise from
// the input buffer.
U0 @consume_next_input_char(Tokenizer *t) {
  if (t->consumeTempBuffer) {
    if (t->tempBuffer.pos < t->tempBuffer.size) {
      t->currentInputChar = t->tempBuffer.data[t->tempBuffer.pos++];
      return;
    }
    // Replay finished; fall back to the real input.
    t->consumeTempBuffer = FALSE;
  }
  t->currentInputChar = t->inputBuffer.data[t->inputBuffer.pos++];
}

// TRUE while there is something left to tokenize: either unreplayed temporary
// buffer contents or a non-NUL character inside the input bounds.
Bool @tokenizer_has_pending_input(Tokenizer *t) {
  if (t->consumeTempBuffer && t->tempBuffer.pos < t->tempBuffer.size)
    return TRUE;
  if (t->inputBuffer.pos >= t->inputBuffer.size)
    return FALSE;
  return t->inputBuffer.data[t->inputBuffer.pos] != 0;
}

// Appends t->currentInputChar to the text node being built, starting a new
// "InternalTextNode" when no text run is in progress.
U0 @emit_current_character(Tokenizer *t) {
  if (!t->dataStateCounter)
    t->currentNode = @create_new_node("InternalTextNode");
  t->currentNode->text = @append_char_to_growable_string(t->currentNode->text,
                                                         t->currentInputChar);
  t->dataStateCounter++;
}

// TRUE when `node` never becomes the append target: text nodes and the listed
// void elements (compared case-insensitively).
Bool @node_is_self_closing(Node *node) {
  U8 *name = node->tagName;
  if (!StrICmp(name, "InternalTextNode"))
    return TRUE;
  if (!StrICmp(name, "area") || !StrICmp(name, "base") || !StrICmp(name, "br"))
    return TRUE;
  if (!StrICmp(name, "col") || !StrICmp(name, "embed") || !StrICmp(name, "hr"))
    return TRUE;
  if (!StrICmp(name, "img") || !StrICmp(name, "input") ||
      !StrICmp(name, "link"))
    return TRUE;
  if (!StrICmp(name, "meta") || !StrICmp(name, "param") ||
      !StrICmp(name, "source"))
    return TRUE;
  if (!StrICmp(name, "track") || !StrICmp(name, "wbr"))
    return TRUE;
  return FALSE;
}

// Emits t->currentNode into the tree.
//
// End tags (tagName starting with '/') close the nearest open element of that
// name by moving the append target to its parent; an end tag with no matching
// open element is ignored. Any other node is appended to the current append
// target and, unless it is self-closing, becomes the new append target.
U0 @emit_current_node(Tokenizer *t) {
  Node *match;
  JsonItem *nodeItem;
  U8 *endTagName;
  if (t->currentNode->tagName[0] == '/') {
    endTagName = t->currentNode->tagName + 1;
    match = t->appendNode;
    // Walk up the open elements looking for the one being closed.
    while (match && StrICmp(match->tagName, endTagName)) {
      if (match == t->originNode)
        match = NULL; // reached the root: the closing tag is invalid
      else
        match = match->parentNode;
    }
    // The root has no parent and can never be closed, so the append target is
    // never allowed to become NULL.
    if (match && match->parentNode)
      t->appendNode = match->parentNode;
    // NOTE(review): the end-tag node is not linked into the tree and is not
    // released here.
    return;
  }
  nodeItem = CAlloc(sizeof(JsonItem));
  t->currentNode->parentNode = t->appendNode;
  nodeItem->value = t->currentNode;
  Json.AppendItem(t->appendNode->children, nodeItem);
  if (!@node_is_self_closing(t->currentNode))
    t->appendNode = t->currentNode;
}

// Stores the pending attribute (if any) on the current node and clears it so
// it can neither be stored twice nor leak onto a later tag.
// NOTE(review): the JsonKey and its two strings are not freed because it is
// not visible here whether Json.Set copies them.
U0 @set_current_attribute_on_current_node(Tokenizer *t) {
  if (!t->currentAttribute)
    return;
  Json.Set(t->currentNode->attributes, t->currentAttribute->name,
           t->currentAttribute->value, JSON_STRING);
  t->currentAttribute = NULL;
}

// Starts a new attribute with an empty name and an empty value.
U0 @start_new_attribute(Tokenizer *t) {
  t->currentAttribute = CAlloc(sizeof(JsonKey));
  t->currentAttribute->name = @init_growable_string;
  t->currentAttribute->value = @init_growable_string;
}

// Appends `char` to the current tag token's name, silently truncating names
// that would overflow Node.tagName. End-tag names carry a leading '/', so
// they get one extra byte: a truncated start tag and its end tag then still
// compare equal. Bumps the image counter when the name spells "img".
U0 @append_char_to_current_tag_name(Tokenizer *t, I64 char) {
  U8 *name = t->currentNode->tagName;
  I64 len = StrLen(name);
  I64 maxLen = HTML_TAG_NAME_SIZE - 2;
  if (name[0] == '/')
    maxLen++;
  if (len >= maxLen)
    return;
  name[len] = char;
  if (!StrICmp(name, "img"))
    t->numOfImgNodes++;
}

// Data state.
U0 @tokenizer_html_state_data(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '&':
    // Set the return state to the data state. Switch to the character
    // reference state.
    t->returnState = HTML_STATE_DATA;
    t->state = HTML_STATE_CHARACTER_REFERENCE;
    break;
  case '<':
    // Finish the text run in progress, then switch to the tag open state.
    if (t->dataStateCounter)
      @emit_current_node(t);
    t->dataStateCounter = 0;
    t->state = HTML_STATE_TAG_OPEN;
    break;
  default:
    // Emit the current input character as a character token.
    @emit_current_character(t);
    break;
  }
}

// Tag open state.
U0 @tokenizer_html_state_tag_open(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '!':
    // Switch to the markup declaration open state.
    t->state = HTML_STATE_MARKUP_DECLARATION_OPEN;
    break;
  case '/':
    // Switch to the end tag open state.
    t->state = HTML_STATE_END_TAG_OPEN;
    break;
  case 'A' ... 'Z':
  case 'a' ... 'z':
    // Create a new start tag token, set its tag name to the empty string.
    // Reconsume in the tag name state.
    t->currentNode = @create_new_node("");
    t->currentAttribute = NULL;
    t->inputBuffer.pos--;
    t->state = HTML_STATE_TAG_NAME;
    break;
  case '?':
    // This is an unexpected-question-mark-instead-of-tag-name parse error.
    // Reconsume in the bogus comment state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_BOGUS_COMMENT;
    break;
  default:
    // This is an invalid-first-character-of-tag-name parse error. Emit a
    // less-than sign character token (as its placeholder byte), then
    // reconsume the current character in the data state.
    t->currentInputChar = HTML_CHAR_LESS_THAN;
    @emit_current_character(t);
    t->inputBuffer.pos--;
    t->state = HTML_STATE_DATA;
    break;
  }
}

// Markup declaration open state: recognises "--" (comment) and "DOCTYPE"
// (case-insensitive); anything else is a bogus comment. Look-ahead never
// reads past the end of the input buffer.
U0 @tokenizer_html_state_markup_declaration_open(Tokenizer *t) {
  U8 buf[8];
  U8 *next = t->inputBuffer.data + t->inputBuffer.pos;
  I64 remaining = t->inputBuffer.size - t->inputBuffer.pos;
  if (remaining >= 2 && next[0] == '-' && next[1] == '-') {
    // Consume those two characters and switch to the comment start state.
    t->inputBuffer.pos += 2;
    t->state = HTML_STATE_COMMENT_START;
    return;
  }
  if (remaining >= 7) {
    MemCpy(buf, next, 7);
    buf[7] = NULL;
    if (!StrICmp(buf, "DOCTYPE")) {
      // Consume those characters and switch to the DOCTYPE state.
      t->inputBuffer.pos += 7;
      t->state = HTML_STATE_DOCTYPE;
      return;
    }
  }
  t->state = HTML_STATE_BOGUS_COMMENT;
}

// DOCTYPE state.
U0 @tokenizer_html_state_doctype(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Switch to the before DOCTYPE name state.
    t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
    break;
  default:
    // '>' or a missing-whitespace-before-doctype-name parse error: reconsume
    // in the before DOCTYPE name state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
    break;
  }
}

// Before DOCTYPE name state. DOCTYPE tokens are not represented in the tree,
// so nothing is stored or emitted.
U0 @tokenizer_html_state_before_doctype_name(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Ignore the character.
    break;
  case '>':
    // This is a missing-doctype-name parse error. Switch to the data state.
    // The '>' belongs to the DOCTYPE and must not end up in document text.
    t->state = HTML_STATE_DATA;
    break;
  default:
    // First character of the name. Switch to the DOCTYPE name state.
    t->state = HTML_STATE_DOCTYPE_NAME;
    break;
  }
}

// DOCTYPE name state.
U0 @tokenizer_html_state_doctype_name(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Switch to the after DOCTYPE name state.
    t->state = HTML_STATE_AFTER_DOCTYPE_NAME;
    break;
  case '>':
    // Switch to the data state.
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Part of the DOCTYPE name; the name is not stored.
    break;
  }
}

// After DOCTYPE name state.
U0 @tokenizer_html_state_after_doctype_name(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Ignore the character.
    break;
  case 'A' ... 'Z':
    // Switch back to the DOCTYPE name state.
    t->state = HTML_STATE_DOCTYPE_NAME;
    break;
  case '>':
    // Switch to the data state.
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Reconsume in the bogus DOCTYPE state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_BOGUS_DOCTYPE;
    break;
  }
}

// Bogus DOCTYPE state: skip everything up to and including '>'.
U0 @tokenizer_html_state_bogus_doctype(Tokenizer *t) {
  @consume_next_input_char(t);
  if (t->currentInputChar == '>')
    t->state = HTML_STATE_DATA;
}

// Tag name state.
U0 @tokenizer_html_state_tag_name(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Switch to the before attribute name state.
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    break;
  case '/':
    // Switch to the self-closing start tag state.
    t->state = HTML_STATE_SELF_CLOSING_START_TAG;
    break;
  case '>':
    // Switch to the data state. Emit the current tag token.
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    break;
  case 'A' ... 'Z':
    // Append the lowercase version of the current input character (add
    // 0x0020 to the character's code point) to the tag name.
    @append_char_to_current_tag_name(t, t->currentInputChar + 0x20);
    break;
  default:
    // Append the current input character to the tag name.
    @append_char_to_current_tag_name(t, t->currentInputChar);
    break;
  }
}

// Before attribute name state.
U0 @tokenizer_html_state_before_attribute_name(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Ignore the character.
    break;
  case '/':
  case '>':
    // Reconsume in the after attribute name state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
    break;
  case '=':
    // This is an unexpected-equals-sign-before-attribute-name parse error.
    // Start a new attribute whose name is the current input character.
    // Switch to the attribute name state.
    @start_new_attribute(t);
    t->currentAttribute->name = @append_char_to_growable_string(
        t->currentAttribute->name, t->currentInputChar);
    t->state = HTML_STATE_ATTRIBUTE_NAME;
    break;
  default:
    // Start a new attribute. Reconsume in the attribute name state.
    @start_new_attribute(t);
    t->inputBuffer.pos--;
    t->state = HTML_STATE_ATTRIBUTE_NAME;
    break;
  }
}

// Attribute name state.
U0 @tokenizer_html_state_attribute_name(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
  case '/':
  case '>':
    // Reconsume in the after attribute name state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
    break;
  case '=':
    // Switch to the before attribute value state.
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
    break;
  case 'A' ... 'Z':
    // Append the lowercase version of the current input character.
    t->currentAttribute->name = @append_char_to_growable_string(
        t->currentAttribute->name, t->currentInputChar + 0x20);
    break;
  default:
    // Append the current input character to the attribute name. This also
    // covers '"', '\'' and '<' (unexpected-character-in-attribute-name).
    t->currentAttribute->name = @append_char_to_growable_string(
        t->currentAttribute->name, t->currentInputChar);
    break;
  }
}

// Before attribute value state.
U0 @tokenizer_html_state_before_attribute_value(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Ignore the character.
    break;
  case '"':
    // Switch to the attribute value (double-quoted) state.
    t->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
    break;
  case '\'':
    // Switch to the attribute value (single-quoted) state.
    t->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
    break;
  case '>':
    // This is a missing-attribute-value parse error. Keep the attribute with
    // its empty value, emit the current tag token and switch to the data
    // state.
    @set_current_attribute_on_current_node(t);
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Reconsume in the attribute value (unquoted) state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
    break;
  }
}

// Attribute value (double-quoted) state.
// NOTE: character references inside attribute values are not decoded.
U0 @tokenizer_html_state_attribute_value_double_quoted(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '"':
    // Store the attribute. Switch to the after attribute value (quoted)
    // state.
    @set_current_attribute_on_current_node(t);
    t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
    break;
  default:
    // Append the current input character to the current attribute's value.
    t->currentAttribute->value = @append_char_to_growable_string(
        t->currentAttribute->value, t->currentInputChar);
    break;
  }
}

// Attribute value (unquoted) state.
// NOTE: character references inside attribute values are not decoded.
U0 @tokenizer_html_state_attribute_value_unquoted(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // The value is complete: store the attribute, then switch to the before
    // attribute name state.
    @set_current_attribute_on_current_node(t);
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    break;
  case '>':
    // Store the attribute, emit the current tag token and switch to the data
    // state.
    @set_current_attribute_on_current_node(t);
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Append the current input character to the current attribute's value.
    t->currentAttribute->value = @append_char_to_growable_string(
        t->currentAttribute->value, t->currentInputChar);
    break;
  }
}

// Attribute value (single-quoted) state.
// NOTE: character references inside attribute values are not decoded.
U0 @tokenizer_html_state_attribute_value_single_quoted(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\'':
    // Store the attribute. Switch to the after attribute value (quoted)
    // state.
    @set_current_attribute_on_current_node(t);
    t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
    break;
  default:
    // Append the current input character to the current attribute's value.
    t->currentAttribute->value = @append_char_to_growable_string(
        t->currentAttribute->value, t->currentInputChar);
    break;
  }
}

// After attribute value (quoted) state.
U0 @tokenizer_html_state_after_attribute_value_quoted(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Switch to the before attribute name state.
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    break;
  case '/':
    // Switch to the self-closing start tag state.
    t->state = HTML_STATE_SELF_CLOSING_START_TAG;
    break;
  case '>':
    // Switch to the data state. Emit the current tag token.
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    break;
  default:
    // This is a missing-whitespace-between-attributes parse error. Reconsume
    // in the before attribute name state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    break;
  }
}

// End tag open state.
U0 @tokenizer_html_state_end_tag_open(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case 'A' ... 'Z':
  case 'a' ... 'z':
    // Create a new end tag token (its name starts with '/'). Reconsume in the
    // tag name state.
    t->currentNode = @create_new_node("/");
    t->currentAttribute = NULL;
    t->inputBuffer.pos--;
    t->state = HTML_STATE_TAG_NAME;
    break;
  case '>':
    // This is a missing-end-tag-name parse error. Switch to the data state.
    t->state = HTML_STATE_DATA;
    break;
  default:
    // This is an invalid-first-character-of-tag-name parse error. Reconsume
    // in the bogus comment state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_BOGUS_COMMENT;
    break;
  }
}

// After attribute name state. Every exit other than '=' stores the pending
// attribute first, so valueless attributes are kept.
U0 @tokenizer_html_state_after_attribute_name(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case ' ':
    // Ignore the character.
    break;
  case '/':
    // Switch to the self-closing start tag state.
    @set_current_attribute_on_current_node(t);
    t->state = HTML_STATE_SELF_CLOSING_START_TAG;
    break;
  case '=':
    // Switch to the before attribute value state.
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
    break;
  case '>':
    // Switch to the data state. Emit the current tag token.
    @set_current_attribute_on_current_node(t);
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Start a new attribute in the current tag token. Reconsume in the
    // attribute name state.
    @set_current_attribute_on_current_node(t);
    @start_new_attribute(t);
    t->inputBuffer.pos--;
    t->state = HTML_STATE_ATTRIBUTE_NAME;
    break;
  }
}

// Self-closing start tag state.
U0 @tokenizer_html_state_self_closing_start_tag(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '>':
    // Switch to the data state. Emit the current tag token.
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    break;
  default:
    // This is an unexpected-solidus-in-tag parse error. Reconsume in the
    // before attribute name state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    break;
  }
}

// Character reference state.
U0 @tokenizer_html_state_character_reference(Tokenizer *t) {
  // Set the temporary buffer to the empty string and append '&' to it.
  @empty_temp_buffer(t);
  @append_char_to_temp_buffer(t, '&');
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case 'A' ... 'Z':
  case 'a' ... 'z':
    // Reconsume in the named character reference state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_NAMED_CHARACTER_REFERENCE;
    break;
  case '#':
    // Append the current input character to the temporary buffer. Switch to
    // the numeric character reference state.
    @append_char_to_temp_buffer(t, '#');
    t->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE;
    break;
  default:
    // A bare ampersand. Flush it as the placeholder byte (a literal '&'
    // replayed through the data state would re-enter this state and be lost)
    // and reconsume the current character in the return state.
    t->tempBuffer.data[0] = HTML_CHAR_AMPERSAND;
    t->inputBuffer.pos--;
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
    break;
  }
}

// Named character reference state: collects "&name" in the temporary buffer
// until ';' arrives, then substitutes the replacement text. Any character
// that cannot be part of a name ends the attempt and the collected text is
// flushed as ordinary text.
U0 @tokenizer_html_state_named_character_reference(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case ';':
    @append_char_to_temp_buffer(t, ';');
    @replace_temp_buffer_with_named_character_reference(t);
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
    break;
  case '0' ... '9':
  case 'A' ... 'Z':
  case 'a' ... 'z':
    @append_char_to_temp_buffer(t, t->currentInputChar);
    break;
  default:
    // Not a character reference after all (e.g. "AT&T rocks"). Flush what was
    // collected, with '&' as its placeholder byte, and reconsume the current
    // character in the return state.
    t->tempBuffer.data[0] = HTML_CHAR_AMPERSAND;
    @recalculate_temp_buffer_size(t);
    t->inputBuffer.pos--;
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
    break;
  }
}

// Numeric character reference state: collects "&#..." in the temporary buffer
// and resolves it at ';' or at the first character that cannot belong to the
// number (missing semicolon).
U0 @tokenizer_html_state_numeric_character_reference(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '0' ... '9':
  case 'A' ... 'F':
  case 'a' ... 'f':
  case 'x':
  case 'X':
    @append_char_to_temp_buffer(t, t->currentInputChar);
    break;
  case ';':
    @append_char_to_temp_buffer(t, ';');
    @replace_temp_buffer_with_numeric_character_reference(t);
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
    break;
  default:
    // Missing semicolon: reconsume the current character later and resolve
    // the reference as if the semicolon had been present (the resolvers chop
    // the final character off the buffer).
    t->inputBuffer.pos--;
    @append_char_to_temp_buffer(t, ';');
    @replace_temp_buffer_with_numeric_character_reference(t);
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
    break;
  }
}

// Bogus comment state: skip everything up to and including '>'.
U0 @tokenizer_html_state_bogus_comment(Tokenizer *t) {
  @consume_next_input_char(t);
  if (t->currentInputChar == '>')
    t->state = HTML_STATE_DATA;
}

// Comment start state. Comment data is not stored anywhere.
U0 @tokenizer_html_state_comment_start(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '-':
    // Switch to the comment start dash state.
    t->state = HTML_STATE_COMMENT_START_DASH;
    break;
  case '>':
    // This is an abrupt-closing-of-empty-comment parse error. Switch to the
    // data state.
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Reconsume in the comment state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_COMMENT;
    break;
  }
}

// Comment start dash state (reached by "<!---").
U0 @tokenizer_html_state_comment_start_dash(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '-':
    // Switch to the comment end state.
    t->state = HTML_STATE_COMMENT_END;
    break;
  case '>':
    // This is an abrupt-closing-of-empty-comment parse error. Switch to the
    // data state.
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Reconsume in the comment state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_COMMENT;
    break;
  }
}

// Comment state.
U0 @tokenizer_html_state_comment(Tokenizer *t) {
  @consume_next_input_char(t);
  if (t->currentInputChar == '-') {
    // Switch to the comment end dash state.
    t->state = HTML_STATE_COMMENT_END_DASH;
  }
  // Anything else is comment data, which is discarded.
}

// Comment end dash state.
U0 @tokenizer_html_state_comment_end_dash(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '-':
    // Switch to the comment end state.
    t->state = HTML_STATE_COMMENT_END;
    break;
  default:
    // Reconsume in the comment state. Comment data is not kept, so there is
    // nothing to append and the input buffer is left untouched.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_COMMENT;
    break;
  }
}

// Comment end state.
U0 @tokenizer_html_state_comment_end(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '>':
    // Switch to the data state.
    t->state = HTML_STATE_DATA;
    break;
  case '!':
    // Switch to the comment end bang state.
    t->state = HTML_STATE_COMMENT_END_BANG;
    break;
  case '-':
    // One more dash of comment data; stay in this state.
    break;
  default:
    // Reconsume in the comment state. Comment data is not kept, so there is
    // nothing to append and the input buffer is left untouched.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_COMMENT;
    break;
  }
}

// Comment end bang state (reached by "--!").
U0 @tokenizer_html_state_comment_end_bang(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '-':
    // Switch to the comment end dash state.
    t->state = HTML_STATE_COMMENT_END_DASH;
    break;
  case '>':
    // This is an incorrectly-closed-comment parse error. Switch to the data
    // state.
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Reconsume in the comment state.
    t->inputBuffer.pos--;
    t->state = HTML_STATE_COMMENT;
    break;
  }
}

// Prints `node` (unless it is a text node or the root) indented by
// t->nodeTreeDepth dashes, then recurses into its children.
U0 @dump_node(Tokenizer *t, Node *node) {
  I64 i;
  if (StrICmp(node->tagName, "InternalTextNode") &&
      StrICmp(node->tagName, "Document")) {
    for (i = 0; i < t->nodeTreeDepth; i++)
      "-";
    "<%s> : parentNode: <%s 0x%08x>\n", node->tagName,
        node->parentNode->tagName, node->parentNode;
  }
  if (node->children->length) {
    t->nodeTreeDepth += 2;
    for (i = 0; i < node->children->length; i++)
      @dump_node(t, Json.ArrayIndex(node->children, i));
    t->nodeTreeDepth -= 2;
  }
}

// Prints the whole tree. The depth starts at -2 so that the root's children
// are printed without indentation.
U0 @dump_node_tree(Tokenizer *t) {
  t->nodeTreeDepth = -2;
  @dump_node(t, t->originNode);
  "\n";
}

// When TRUE, every tokenizer step is traced to port 0x504.
Bool tokenizer_debug = FALSE;

// Tokenizes `size` bytes of HTML at `buffer` (processing also stops at the
// first NUL byte) and returns the root "Document" node of the resulting tree.
// When `num_of_images` is not NULL it receives the number of "img" tag names
// seen. The input buffer is only read, never modified.
Node *@html_tokenize_and_create_node_tree(U8 *buffer, I64 size,
                                          I64 *num_of_images) {
  Tokenizer t;
  U8 buf[512];
  U8 *buf2;
  no_warn buf, buf2;
  @init_tokenizer(&t, buffer, size);
  while (@tokenizer_has_pending_input(&t)) {
    if (tokenizer_debug) {
      StrPrint(buf, "pos: %d, char: %c, state: %d\n", t.inputBuffer.pos,
               t.inputBuffer.data[t.inputBuffer.pos], t.state);
      buf2 = &buf;
      while (*buf2)
        OutU8(0x504, *buf2++);
    }
    switch (t.state) {
    case HTML_STATE_DATA:
      @tokenizer_html_state_data(&t);
      break;
    case HTML_STATE_TAG_OPEN:
      @tokenizer_html_state_tag_open(&t);
      break;
    case HTML_STATE_MARKUP_DECLARATION_OPEN:
      @tokenizer_html_state_markup_declaration_open(&t);
      break;
    case HTML_STATE_DOCTYPE:
      @tokenizer_html_state_doctype(&t);
      break;
    case HTML_STATE_BEFORE_DOCTYPE_NAME:
      @tokenizer_html_state_before_doctype_name(&t);
      break;
    case HTML_STATE_DOCTYPE_NAME:
      @tokenizer_html_state_doctype_name(&t);
      break;
    case HTML_STATE_AFTER_DOCTYPE_NAME:
      @tokenizer_html_state_after_doctype_name(&t);
      break;
    case HTML_STATE_BOGUS_DOCTYPE:
      @tokenizer_html_state_bogus_doctype(&t);
      break;
    case HTML_STATE_TAG_NAME:
      @tokenizer_html_state_tag_name(&t);
      break;
    case HTML_STATE_END_TAG_OPEN:
      @tokenizer_html_state_end_tag_open(&t);
      break;
    case HTML_STATE_SELF_CLOSING_START_TAG:
      @tokenizer_html_state_self_closing_start_tag(&t);
      break;
    case HTML_STATE_BEFORE_ATTRIBUTE_NAME:
      @tokenizer_html_state_before_attribute_name(&t);
      break;
    case HTML_STATE_ATTRIBUTE_NAME:
      @tokenizer_html_state_attribute_name(&t);
      break;
    case HTML_STATE_AFTER_ATTRIBUTE_NAME:
      @tokenizer_html_state_after_attribute_name(&t);
      break;
    case HTML_STATE_BEFORE_ATTRIBUTE_VALUE:
      @tokenizer_html_state_before_attribute_value(&t);
      break;
    case HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED:
      @tokenizer_html_state_attribute_value_double_quoted(&t);
      break;
    case HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED:
      @tokenizer_html_state_attribute_value_single_quoted(&t);
      break;
    case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
      @tokenizer_html_state_attribute_value_unquoted(&t);
      break;
    case HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED:
      @tokenizer_html_state_after_attribute_value_quoted(&t);
      break;
    case HTML_STATE_CHARACTER_REFERENCE:
      @tokenizer_html_state_character_reference(&t);
      break;
    case HTML_STATE_NAMED_CHARACTER_REFERENCE:
      @tokenizer_html_state_named_character_reference(&t);
      break;
    case HTML_STATE_NUMERIC_CHARACTER_REFERENCE:
      @tokenizer_html_state_numeric_character_reference(&t);
      break;
    case HTML_STATE_BOGUS_COMMENT:
      @tokenizer_html_state_bogus_comment(&t);
      break;
    case HTML_STATE_COMMENT_START:
      @tokenizer_html_state_comment_start(&t);
      break;
    case HTML_STATE_COMMENT_START_DASH:
      @tokenizer_html_state_comment_start_dash(&t);
      break;
    case HTML_STATE_COMMENT:
      @tokenizer_html_state_comment(&t);
      break;
    case HTML_STATE_COMMENT_END_DASH:
      @tokenizer_html_state_comment_end_dash(&t);
      break;
    case HTML_STATE_COMMENT_END:
      @tokenizer_html_state_comment_end(&t);
      break;
    case HTML_STATE_COMMENT_END_BANG:
      @tokenizer_html_state_comment_end_bang(&t);
      break;
    case HTML_STATE_INVALID:
    default:
      @debug("\n$FG,0$HTML Tokenization error: Invalid or unimplemented "
             "state\nInputBuffer position: %d\nState: %d$FD$\n",
             t.inputBuffer.pos, t.state);
      "\n";
      PressAKey;
      // Recover in the data state; staying in an unhandled state would
      // report the same error forever without consuming any input.
      t.state = HTML_STATE_DATA;
      break;
    }
  }
  // Text that runs up to the end of the input has no following '<' to
  // trigger its emission, so emit it here.
  if (t.dataStateCounter)
    @emit_current_node(&t);
  Free(t.tempBuffer.data);
  if (num_of_images)
    *num_of_images = t.numOfImgNodes;
  return t.originNode;
}