Add state HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED, convert hex character references to dec, add some more character references

This commit is contained in:
Alec Murphy
2022-05-30 11:46:24 -04:00
parent 0d487fd495
commit cfb9c081be
+105 -6
View File
@@ -158,6 +158,16 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
@recalculate_temp_buffer_size(t);
return;
}
if (!StrICmp(t->tempBuffer.data, " ")) {
StrCpy(t->tempBuffer.data, " ");
@recalculate_temp_buffer_size(t);
return;
}
if (!StrICmp(t->tempBuffer.data, "—")) {
StrCpy(t->tempBuffer.data, "-");
@recalculate_temp_buffer_size(t);
return;
}
if (!StrICmp(t->tempBuffer.data, " ")) {
StrCpy(t->tempBuffer.data, " ");
@recalculate_temp_buffer_size(t);
@@ -187,12 +197,20 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
PressAKey;
}
U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
// TODO: convert hex to dec in buffer and call
// @replace_temp_buffer_with_dec_character_reference
no_warn t;
"unimplemented: @replace_temp_buffer_with_hex_character_reference\n";
PressAKey;
// Lookup table from an ASCII byte to the value of the hexadecimal digit it
// represents. Every entry is first cleared to 0, so bytes that are not hex
// digits map to 0.
I64 @hex_table_i;
I64 @hex_table[256];
MemSet(&@hex_table, 0, 256 * sizeof(I64));
// '0'..'9' -> 0..9
for (@hex_table_i = '0'; @hex_table_i <= '9'; @hex_table_i++) {
  @hex_table[@hex_table_i] = @hex_table_i - '0';
}
// 'A'..'F' -> 10..15
for (@hex_table_i = 'A'; @hex_table_i <= 'F'; @hex_table_i++) {
  @hex_table[@hex_table_i] = @hex_table_i - 'A' + 10;
}
// 'a'..'f' -> 10..15
for (@hex_table_i = 'a'; @hex_table_i <= 'f'; @hex_table_i++) {
  @hex_table[@hex_table_i] = @hex_table_i - 'a' + 10;
}
U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
@@ -205,11 +223,30 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
StrPrint(t->tempBuffer.data, "%c", charCode);
@recalculate_temp_buffer_size(t);
break;
case 956:
StrPrint(t->tempBuffer.data, "%c", 230);
@recalculate_temp_buffer_size(t);
break;
case 8217:
StrCpy(t->tempBuffer.data, "'");
@recalculate_temp_buffer_size(t);
return;
break;
case 8230:
StrCpy(t->tempBuffer.data, "...");
@recalculate_temp_buffer_size(t);
return;
break;
case 9660:
StrCpy(t->tempBuffer.data, "");
@recalculate_temp_buffer_size(t);
return;
break;
case 10006:
StrCpy(t->tempBuffer.data, "x");
@recalculate_temp_buffer_size(t);
return;
break;
default:
@debug("\n$FG,0$HTML Tokenization error: Unimplemented decimal character "
@@ -222,6 +259,32 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
}
}
// Rewrite the hexadecimal character reference held in t->tempBuffer as the
// equivalent decimal reference ("&#<n>;") and hand it to
// @replace_temp_buffer_with_dec_character_reference for substitution.
//
// t: tokenizer whose tempBuffer.data holds the reference text; the buffer is
//    overwritten in place.
//
// NOTE(review): the last character is dropped unconditionally on the
// assumption that it is the terminating ';' -- confirm that callers never
// pass a reference without one, otherwise the final hex digit is lost.
// NOTE(review): the decimal form can be one character longer than the hex
// form (e.g. 5 hex digits -> 7 decimal digits) -- confirm tempBuffer.data has
// room for it.
U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
  I64 dec_char = 0;
  t->tempBuffer.data[StrLen(t->tempBuffer.data) - 1] =
      NULL; // chop off semicolon
  // Skip the first three characters (presumably the "&#x" prefix) to reach
  // the first hex digit.
  U8 *ch = t->tempBuffer.data + 3;
  // Shift in four bits per character via @hex_table. The >= 0 test stops the
  // loop once the accumulated value has become negative.
  while (*ch && dec_char >= 0) {
    dec_char = (dec_char << 4) | @hex_table[*ch++];
  }
  StrPrint(t->tempBuffer.data, "&#%d;", dec_char);
  @recalculate_temp_buffer_size(t);
  @replace_temp_buffer_with_dec_character_reference(t);
}
U0 @replace_temp_buffer_with_numeric_character_reference(Tokenizer *t) {
switch (t->tempBuffer.data[2]) {
case 'x':
@@ -718,6 +781,37 @@ U0 @tokenizer_html_state_attribute_value_double_quoted(Tokenizer *t) {
}
}
// Tokenizer handler for the "attribute value (unquoted)" state.
// Consumes one input character and then either ends the attribute value,
// emits the current tag, or appends the character to the value of the
// current attribute.
//
// t: tokenizer state; t->state and t->currentAttribute->value may be updated.
U0 @tokenizer_html_state_attribute_value_unquoted(Tokenizer *t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
  case '\n':
  case '\r':
  case '\t':
  case 0x0C: // U+000C FORM FEED also terminates an unquoted value
  case ' ':
    // Switch to the before attribute name state.
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    break;
  // TODO: '&' is not handled here yet, so it is appended literally by the
  // default case. When character references are enabled for this state, set
  // t->returnState = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED and switch
  // t->state to HTML_STATE_CHARACTER_REFERENCE.
  case '>':
    // Switch to the data state. Emit the current tag token.
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    break;
  default:
    // Append the current input character to the current attribute's value.
    t->currentAttribute->value = @append_char_to_growable_string(
        t->currentAttribute->value, t->currentInputChar);
    break;
  }
}
U0 @tokenizer_html_state_attribute_value_single_quoted(Tokenizer *t) {
@consume_next_input_char(t);
switch (t->currentInputChar) {
@@ -942,11 +1036,13 @@ U0 @tokenizer_html_state_comment_start(Tokenizer *t) {
U0 @tokenizer_html_state_comment(Tokenizer *t) {
@consume_next_input_char(t);
switch (t->currentInputChar) {
/*
case '<':
// Append the current input character to the comment token's data. Switch to
// the comment less-than sign state.
t->state = HTML_STATE_COMMENT_LESS_THAN_SIGN;
break;
*/
case '-':
// Switch to the comment end dash state.
t->state = HTML_STATE_COMMENT_END_DASH;
@@ -1121,6 +1217,9 @@ Node *@html_tokenize_and_create_node_tree(U8 *buffer, I64 size,
case HTML_STATE_COMMENT_END:
@tokenizer_html_state_comment_end(&t);
break;
case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
@tokenizer_html_state_attribute_value_unquoted(&t);
break;
case HTML_STATE_INVALID:
default:
@debug("\n$FG,0$HTML Tokenization error: Invalid or unimplemented "