diff --git a/Src/HTMLTokenizer.HC b/Src/HTMLTokenizer.HC
index 858cd27..d7320b0 100644
--- a/Src/HTMLTokenizer.HC
+++ b/Src/HTMLTokenizer.HC
@@ -158,6 +158,16 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
     @recalculate_temp_buffer_size(t);
     return;
   }
+  if (!StrICmp(t->tempBuffer.data, " ")) {
+    StrCpy(t->tempBuffer.data, " ");
+    @recalculate_temp_buffer_size(t);
+    return;
+  }
+  if (!StrICmp(t->tempBuffer.data, "—")) {
+    StrCpy(t->tempBuffer.data, "-");
+    @recalculate_temp_buffer_size(t);
+    return;
+  }
   if (!StrICmp(t->tempBuffer.data, " ")) {
     StrCpy(t->tempBuffer.data, " ");
     @recalculate_temp_buffer_size(t);
@@ -187,12 +197,20 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
   PressAKey;
 }
 
-U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
-  // TODO: convert hex to dec in buffer and call
-  // @replace_temp_buffer_with_dec_character_reference
-  no_warn t;
-  "unimplemented: @replace_temp_buffer_with_hex_character_reference\n";
-  PressAKey;
+I64 @hex_table_i;
+I64 @hex_table[256];
+MemSet(&@hex_table, NULL, sizeof(I64) * 256);
+
+for (@hex_table_i = '0'; @hex_table_i < ':'; @hex_table_i++) {
+  @hex_table[@hex_table_i] = @hex_table_i - '0';
+}
+
+for (@hex_table_i = 'A'; @hex_table_i < 'G'; @hex_table_i++) {
+  @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'A');
+}
+
+for (@hex_table_i = 'a'; @hex_table_i < 'g'; @hex_table_i++) {
+  @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'a');
 }
 
 U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
@@ -205,10 +223,29 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
     StrPrint(t->tempBuffer.data, "%c", charCode);
     @recalculate_temp_buffer_size(t);
     break;
+  case 956:
+    StrPrint(t->tempBuffer.data, "%c", 230);
+    @recalculate_temp_buffer_size(t);
+    break;
+  case 8217:
+    StrCpy(t->tempBuffer.data, "'");
+    @recalculate_temp_buffer_size(t);
+    return;
+    break;
+  case 8230:
+    StrCpy(t->tempBuffer.data, "...");
+    @recalculate_temp_buffer_size(t);
+    return;
+    break;
   case 9660:
     StrCpy(t->tempBuffer.data, "");
     @recalculate_temp_buffer_size(t);
     return;
     break;
+  case 10006:
+    StrCpy(t->tempBuffer.data, "x");
+    @recalculate_temp_buffer_size(t);
+    return;
+    break;
   default:
     @debug("\n$FG,0$HTML Tokenization error: Unimplemented decimal character "
@@ -222,6 +259,35 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
   }
 }
 
+// Convert the hexadecimal character reference held in the temp buffer
+// (e.g. "&#x2014;") to decimal ("&#8212;") and pass it on to
+// @replace_temp_buffer_with_dec_character_reference, which substitutes the
+// character. Digits are decoded with @hex_table; any byte that is not a hex
+// digit decodes as 0.
+U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
+  I64 dec_char = 0;
+  I64 len = StrLen(t->tempBuffer.data);
+  U8 *ch;
+
+  // Chop off the semicolon, but only when it is present, so that an
+  // unterminated reference keeps its last hex digit.
+  if (len > 0 && t->tempBuffer.data[len - 1] == ';') {
+    t->tempBuffer.data[len - 1] = NULL;
+  }
+
+  // Skip the three-character "&#x" prefix, then fold in four bits per digit.
+  // The sign test ends the loop once an over-long value overflows.
+  ch = t->tempBuffer.data + 3;
+  while (*ch && dec_char >= 0) {
+    dec_char = (dec_char << 4) | @hex_table[*ch++];
+  }
+
+  StrPrint(t->tempBuffer.data, "&#%d;", dec_char);
+  @recalculate_temp_buffer_size(t);
+
+  @replace_temp_buffer_with_dec_character_reference(t);
+}
+
 U0 @replace_temp_buffer_with_numeric_character_reference(Tokenizer *t) {
   switch (t->tempBuffer.data[2]) {
   case 'x':
@@ -718,6 +784,37 @@ U0 @tokenizer_html_state_attribute_value_double_quoted(Tokenizer *t) {
   }
 }
 
+// Attribute value (unquoted) state: consumes one input character, then ends
+// the value on whitespace, emits the tag on '>', or appends the character to
+// the current attribute's value.
+U0 @tokenizer_html_state_attribute_value_unquoted(Tokenizer *t) {
+  @consume_next_input_char(t);
+  switch (t->currentInputChar) {
+  case '\n':
+  case '\r':
+  case '\t':
+  case 0x0C: // form feed
+  case ' ':
+    // Switch to the before attribute name state.
+    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+    break;
+  // TODO: '&' should set the return state to
+  // HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED and switch to
+  // HTML_STATE_CHARACTER_REFERENCE. For now it falls into the default case
+  // and is appended to the value unchanged.
+  case '>':
+    // Switch to the data state. Emit the current tag token.
+    @emit_current_node(t);
+    t->state = HTML_STATE_DATA;
+    break;
+  default:
+    // Append the current input character to the current attribute's value.
+    t->currentAttribute->value = @append_char_to_growable_string(
+        t->currentAttribute->value, t->currentInputChar);
+    break;
+  }
+}
+
 U0 @tokenizer_html_state_attribute_value_single_quoted(Tokenizer *t) {
   @consume_next_input_char(t);
   switch (t->currentInputChar) {
@@ -942,11 +1039,13 @@ U0 @tokenizer_html_state_comment_start(Tokenizer *t) {
 U0 @tokenizer_html_state_comment(Tokenizer *t) {
   @consume_next_input_char(t);
   switch (t->currentInputChar) {
+    /*
   case '<':
     // Append the current input character to the comment token's data. Switch to
     // the comment less-than sign state.
     t->state = HTML_STATE_COMMENT_LESS_THAN_SIGN;
     break;
+    */
   case '-':
     // Switch to the comment end dash state.
     t->state = HTML_STATE_COMMENT_END_DASH;
@@ -1121,6 +1220,9 @@ Node *@html_tokenize_and_create_node_tree(U8 *buffer, I64 size,
     case HTML_STATE_COMMENT_END:
       @tokenizer_html_state_comment_end(&t);
       break;
+    case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
+      @tokenizer_html_state_attribute_value_unquoted(&t);
+      break;
     case HTML_STATE_INVALID:
     default:
       @debug("\n$FG,0$HTML Tokenization error: Invalid or unimplemented "