mirror of
https://git.checksum.fail/alec/Web.git
synced 2026-05-26 19:15:49 +00:00
Add state HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED, convert hex character references to dec, add some more character references
This commit is contained in:
+105
-6
@@ -158,6 +158,16 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
|
||||
@recalculate_temp_buffer_size(t);
|
||||
return;
|
||||
}
|
||||
if (!StrICmp(t->tempBuffer.data, " ")) {
|
||||
StrCpy(t->tempBuffer.data, " ");
|
||||
@recalculate_temp_buffer_size(t);
|
||||
return;
|
||||
}
|
||||
if (!StrICmp(t->tempBuffer.data, "—")) {
|
||||
StrCpy(t->tempBuffer.data, "-");
|
||||
@recalculate_temp_buffer_size(t);
|
||||
return;
|
||||
}
|
||||
if (!StrICmp(t->tempBuffer.data, " ")) {
|
||||
StrCpy(t->tempBuffer.data, " ");
|
||||
@recalculate_temp_buffer_size(t);
|
||||
@@ -187,12 +197,20 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
|
||||
PressAKey;
|
||||
}
|
||||
|
||||
U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
|
||||
// TODO: convert hex to dec in buffer and call
|
||||
// @replace_temp_buffer_with_dec_character_reference
|
||||
no_warn t;
|
||||
"unimplemented: @replace_temp_buffer_with_hex_character_reference\n";
|
||||
PressAKey;
|
||||
// Lookup table mapping a byte to its hexadecimal digit value.
// Entries for '0'-'9', 'A'-'F' and 'a'-'f' hold 0..15; every other entry is 0,
// so a lookup alone cannot tell '0' apart from a byte that is not a hex digit.
I64 @hex_table_i;
I64 @hex_table[256];

// Clear all 256 entries before filling in the digit ranges.
MemSet(&@hex_table, 0, 256 * sizeof(I64));

// Decimal digits: '0'..'9' -> 0..9
@hex_table_i = '0';
while (@hex_table_i <= '9') {
  @hex_table[@hex_table_i] = @hex_table_i - '0';
  @hex_table_i++;
}

// Upper-case letters: 'A'..'F' -> 10..15
@hex_table_i = 'A';
while (@hex_table_i <= 'F') {
  @hex_table[@hex_table_i] = (@hex_table_i - 'A') + 10;
  @hex_table_i++;
}

// Lower-case letters: 'a'..'f' -> 10..15
@hex_table_i = 'a';
while (@hex_table_i <= 'f') {
  @hex_table[@hex_table_i] = (@hex_table_i - 'a') + 10;
  @hex_table_i++;
}
|
||||
|
||||
U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
|
||||
@@ -205,11 +223,30 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
|
||||
StrPrint(t->tempBuffer.data, "%c", charCode);
|
||||
@recalculate_temp_buffer_size(t);
|
||||
break;
|
||||
case 956:
|
||||
StrPrint(t->tempBuffer.data, "%c", 230);
|
||||
@recalculate_temp_buffer_size(t);
|
||||
break;
|
||||
case 8217:
|
||||
StrCpy(t->tempBuffer.data, "'");
|
||||
@recalculate_temp_buffer_size(t);
|
||||
return;
|
||||
break;
|
||||
case 8230:
|
||||
StrCpy(t->tempBuffer.data, "...");
|
||||
@recalculate_temp_buffer_size(t);
|
||||
return;
|
||||
break;
|
||||
case 9660:
|
||||
StrCpy(t->tempBuffer.data, "");
|
||||
@recalculate_temp_buffer_size(t);
|
||||
return;
|
||||
break;
|
||||
case 10006:
|
||||
StrCpy(t->tempBuffer.data, "x");
|
||||
@recalculate_temp_buffer_size(t);
|
||||
return;
|
||||
break;
|
||||
default:
|
||||
|
||||
@debug("\n$FG,0$HTML Tokenization error: Unimplemented decimal character "
|
||||
@@ -222,6 +259,32 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
|
||||
}
|
||||
}
|
||||
|
||||
// Rewrites the hexadecimal character reference held in t->tempBuffer
// (e.g. "&#x41;") into its decimal form ("&#65;") in place, then hands the
// buffer to @replace_temp_buffer_with_dec_character_reference for the actual
// replacement.
//
// t: tokenizer whose tempBuffer holds the text of the reference.
//
// Digits are read after the three-byte "&#x" prefix and parsing stops at the
// first byte that is not a hex digit, so the result is the same whether or not
// the reference carries its trailing ';', and an empty or truncated buffer is
// never indexed out of bounds.
U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
  I64 dec_char = 0;
  I64 skipped = 0;
  U8 *ch = t->tempBuffer.data;

  // Step over the prefix one byte at a time so a buffer shorter than the
  // prefix leaves ch on the terminator instead of past it.
  while (skipped < 3 && *ch) {
    ch++;
    skipped++;
  }

  // @hex_table holds 0 for every byte that is not a hex digit, so '0' has to
  // be accepted explicitly; anything else with a zero entry (';', the
  // terminator, stray characters) ends the number.
  // The dec_char >= 0 guard stops accumulating once the sign bit is reached.
  while (dec_char >= 0 && (@hex_table[*ch] || *ch == '0')) {
    dec_char = (dec_char << 4) | @hex_table[*ch];
    ch++;
  }

  // NOTE(review): the decimal spelling can be longer than the hex one
  // (e.g. "&#xFFFFF;" -> "&#1048575;"); confirm tempBuffer.data has room
  // for the rewritten text.
  StrPrint(t->tempBuffer.data, "&#%d;", dec_char);
  @recalculate_temp_buffer_size(t);

  @replace_temp_buffer_with_dec_character_reference(t);
}
|
||||
|
||||
U0 @replace_temp_buffer_with_numeric_character_reference(Tokenizer *t) {
|
||||
switch (t->tempBuffer.data[2]) {
|
||||
case 'x':
|
||||
@@ -718,6 +781,37 @@ U0 @tokenizer_html_state_attribute_value_double_quoted(Tokenizer *t) {
|
||||
}
|
||||
}
|
||||
|
||||
// Tokenizer step for the "attribute value (unquoted)" state.
//
// Consumes one input character and then:
//   '>'                    -> emits the current tag token, switches to the
//                             data state
//   '\n' '\r' '\t' ' '     -> switches to the before attribute name state
//   anything else          -> appended to the current attribute's value
//
// NOTE(review): character references ('&') are not handled here yet and fall
// through to the append path. The previously disabled case set the return
// state to HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED, which looks wrong for
// this state - presumably it should return to
// HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED; confirm before enabling.
U0 @tokenizer_html_state_attribute_value_unquoted(Tokenizer *t) {
  @consume_next_input_char(t);

  // End of the tag: emit it and go back to plain data.
  if (t->currentInputChar == '>') {
    @emit_current_node(t);
    t->state = HTML_STATE_DATA;
    return;
  }

  // Whitespace terminates the unquoted value.
  if (t->currentInputChar == '\n' || t->currentInputChar == '\r' ||
      t->currentInputChar == '\t' || t->currentInputChar == ' ') {
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    return;
  }

  // Every other character belongs to the attribute value.
  t->currentAttribute->value = @append_char_to_growable_string(
      t->currentAttribute->value, t->currentInputChar);
}
|
||||
|
||||
U0 @tokenizer_html_state_attribute_value_single_quoted(Tokenizer *t) {
|
||||
@consume_next_input_char(t);
|
||||
switch (t->currentInputChar) {
|
||||
@@ -942,11 +1036,13 @@ U0 @tokenizer_html_state_comment_start(Tokenizer *t) {
|
||||
U0 @tokenizer_html_state_comment(Tokenizer *t) {
|
||||
@consume_next_input_char(t);
|
||||
switch (t->currentInputChar) {
|
||||
/*
|
||||
case '<':
|
||||
// Append the current input character to the comment token's data. Switch to
|
||||
// the comment less-than sign state.
|
||||
t->state = HTML_STATE_COMMENT_LESS_THAN_SIGN;
|
||||
break;
|
||||
*/
|
||||
case '-':
|
||||
// Switch to the comment end dash state.
|
||||
t->state = HTML_STATE_COMMENT_END_DASH;
|
||||
@@ -1121,6 +1217,9 @@ Node *@html_tokenize_and_create_node_tree(U8 *buffer, I64 size,
|
||||
case HTML_STATE_COMMENT_END:
|
||||
@tokenizer_html_state_comment_end(&t);
|
||||
break;
|
||||
case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
|
||||
@tokenizer_html_state_attribute_value_unquoted(&t);
|
||||
break;
|
||||
case HTML_STATE_INVALID:
|
||||
default:
|
||||
@debug("\n$FG,0$HTML Tokenization error: Invalid or unimplemented "
|
||||
|
||||
Reference in New Issue
Block a user