diff --git a/Src/HTMLTokenizer.HC b/Src/HTMLTokenizer.HC
index 858cd27..d7320b0 100644
--- a/Src/HTMLTokenizer.HC
+++ b/Src/HTMLTokenizer.HC
@@ -158,6 +158,16 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
@recalculate_temp_buffer_size(t);
return;
}
+ if (!StrICmp(t->tempBuffer.data, " ")) {
+ StrCpy(t->tempBuffer.data, " ");
+ @recalculate_temp_buffer_size(t);
+ return;
+ }
+ if (!StrICmp(t->tempBuffer.data, "—")) {
+ StrCpy(t->tempBuffer.data, "-");
+ @recalculate_temp_buffer_size(t);
+ return;
+ }
if (!StrICmp(t->tempBuffer.data, " ")) {
StrCpy(t->tempBuffer.data, " ");
@recalculate_temp_buffer_size(t);
@@ -187,12 +197,20 @@ U0 @replace_temp_buffer_with_named_character_reference(Tokenizer *t) {
PressAKey;
}
-U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
- // TODO: convert hex to dec in buffer and call
- // @replace_temp_buffer_with_dec_character_reference
- no_warn t;
- "unimplemented: @replace_temp_buffer_with_hex_character_reference\n";
- PressAKey;
+I64 @hex_table_i; // Loop index for filling @hex_table below.
+I64 @hex_table[256]; // Maps a character code to its hexadecimal digit value.
+MemSet(&@hex_table, NULL, sizeof(I64) * 256); // Every entry starts out as 0.
+
+for (@hex_table_i = '0'; @hex_table_i < ':'; @hex_table_i++) { // '0'..'9'
+ @hex_table[@hex_table_i] = @hex_table_i - '0';
+}
+
+for (@hex_table_i = 'A'; @hex_table_i < 'G'; @hex_table_i++) { // 'A'..'F'
+ @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'A');
+}
+
+for (@hex_table_i = 'a'; @hex_table_i < 'g'; @hex_table_i++) { // 'a'..'f'
+ @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'a');
}
U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
@@ -205,11 +223,30 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
StrPrint(t->tempBuffer.data, "%c", charCode);
@recalculate_temp_buffer_size(t);
break;
+ case 956:
+ StrPrint(t->tempBuffer.data, "%c", 230);
+ @recalculate_temp_buffer_size(t);
+ break;
+ case 8217:
+ StrCpy(t->tempBuffer.data, "'");
+ @recalculate_temp_buffer_size(t);
+ return;
+ break;
+ case 8230:
+ StrCpy(t->tempBuffer.data, "...");
+ @recalculate_temp_buffer_size(t);
+ return;
+ break;
case 9660:
StrCpy(t->tempBuffer.data, "");
@recalculate_temp_buffer_size(t);
return;
break;
+ case 10006:
+ StrCpy(t->tempBuffer.data, "x");
+ @recalculate_temp_buffer_size(t);
+ return;
+ break;
default:
@debug("\n$FG,0$HTML Tokenization error: Unimplemented decimal character "
@@ -222,6 +259,32 @@ U0 @replace_temp_buffer_with_dec_character_reference(Tokenizer *t) {
}
}
+// Converts the hexadecimal character reference in the temp buffer to decimal
+// and delegates to @replace_temp_buffer_with_dec_character_reference. The
+// first three characters are skipped and the final character (expected to be
+// ';') is dropped; characters that are not hex digits contribute 0 via
+// @hex_table.
+U0 @replace_temp_buffer_with_hex_character_reference(Tokenizer *t) {
+  I64 dec_char = 0;
+
+  // Chop off the last character (expected to be the semicolon).
+  t->tempBuffer.data[StrLen(t->tempBuffer.data) - 1] = NULL;
+
+  U8 *ch = t->tempBuffer.data + 3;
+
+  // Fold in one nibble per character; the >= 0 guard stops the loop once the
+  // accumulated value has spilled into the sign bit.
+  while (*ch && dec_char >= 0) {
+    dec_char = (dec_char << 4) | @hex_table[*ch++];
+  }
+
+  // NOTE(review): the buffer is rewritten as "<decimal>;" with no prefix --
+  // confirm this is the layout the decimal handler parses.
+  StrPrint(t->tempBuffer.data, "%d;", dec_char);
+  @recalculate_temp_buffer_size(t);
+  @replace_temp_buffer_with_dec_character_reference(t);
+}
+
U0 @replace_temp_buffer_with_numeric_character_reference(Tokenizer *t) {
switch (t->tempBuffer.data[2]) {
case 'x':
@@ -718,6 +781,37 @@ U0 @tokenizer_html_state_attribute_value_double_quoted(Tokenizer *t) {
}
}
+// Attribute value (unquoted) state. Consumes one input character, then
+// either ends the value, finishes the tag, or appends the character to the
+// current attribute's value.
+U0 @tokenizer_html_state_attribute_value_unquoted(Tokenizer *t) {
+  @consume_next_input_char(t);
+
+  // Whitespace ends the value: switch to the before attribute name
+  // state.
+  if (t->currentInputChar == '\n' || t->currentInputChar == '\r' ||
+      t->currentInputChar == '\t' || t->currentInputChar == ' ') {
+    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+    return;
+  }
+
+  // '>' closes the tag: emit the current node, then go to the data state.
+  if (t->currentInputChar == '>') {
+    @emit_current_node(t);
+    t->state = HTML_STATE_DATA;
+    return;
+  }
+
+  // TODO: '&' is not special-cased yet, so it is appended like any other
+  // character. The disabled draft switched to HTML_STATE_CHARACTER_REFERENCE
+  // with returnState set to the single-quoted state, which looks like a
+  // copy-paste slip -- verify before enabling.
+
+  // Anything else is appended to the current attribute's value.
+  t->currentAttribute->value = @append_char_to_growable_string(
+      t->currentAttribute->value, t->currentInputChar);
+}
+
U0 @tokenizer_html_state_attribute_value_single_quoted(Tokenizer *t) {
@consume_next_input_char(t);
switch (t->currentInputChar) {
@@ -942,11 +1036,13 @@ U0 @tokenizer_html_state_comment_start(Tokenizer *t) {
U0 @tokenizer_html_state_comment(Tokenizer *t) {
@consume_next_input_char(t);
switch (t->currentInputChar) {
+ /*
case '<':
// Append the current input character to the comment token's data. Switch to
// the comment less-than sign state.
t->state = HTML_STATE_COMMENT_LESS_THAN_SIGN;
break;
+ */
case '-':
// Switch to the comment end dash state.
t->state = HTML_STATE_COMMENT_END_DASH;
@@ -1121,6 +1217,9 @@ Node *@html_tokenize_and_create_node_tree(U8 *buffer, I64 size,
case HTML_STATE_COMMENT_END:
@tokenizer_html_state_comment_end(&t);
break;
+ case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
+ @tokenizer_html_state_attribute_value_unquoted(&t);
+ break;
case HTML_STATE_INVALID:
default:
@debug("\n$FG,0$HTML Tokenization error: Invalid or unimplemented "