Skip to content

Commit 5f8363b

Browse files
committed
Go back to using goto instead of the boolean flag valid_entity. Optimize the condition.
1 parent 24ff722 commit 5f8363b

File tree

1 file changed

+40
-35
lines changed

1 file changed

+40
-35
lines changed

ext/standard/html.c

+40-35
Original file line numberDiff line numberDiff line change
@@ -852,39 +852,38 @@ static void traverse_for_entities(
852852

853853
unsigned code = 0, code2 = 0;
854854
const char *entity_end_ptr = NULL;
855-
bool valid_entity = true;
856855

857856
if (current_ptr[1] == '#') {
858857
/* Processing numeric entity */
859858
const char *num_start = current_ptr + 2;
860859
entity_end_ptr = num_start;
861860
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
862-
valid_entity = false;
861+
goto invalid_incomplete_entity;
863862
}
864-
if (valid_entity && !all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) {
863+
if (!all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) {
865864
/* If we're in htmlspecialchars_decode, we're only decoding entities
866865
* that represent &, <, >, " and '. Is this one of them? */
867-
valid_entity = false;
868-
} else if (valid_entity && (!unicode_cp_is_allowed(code, doctype) ||
869-
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))) {
866+
goto invalid_incomplete_entity;
867+
} else if (!unicode_cp_is_allowed(code, doctype) ||
868+
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)) {
870869
/* are we allowed to decode this entity in this document type?
871870
* HTML 5 is the only that has a character that cannot be used in
872871
* a numeric entity but is allowed literally (U+000D). The
873872
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874-
valid_entity = false;
873+
goto invalid_incomplete_entity;
875874
}
876875
} else {
877-
/* Processing named entity */
876+
/* Processing named entity */
878877
const char *name_start = current_ptr + 1;
879878
/* Search for ';' */
880879
const size_t max_search_len = MIN(LONGEST_ENTITY_LENGTH + 1, input_end - name_start);
881880
const char *semi_colon_ptr = memchr(name_start, ';', max_search_len);
882881
if (!semi_colon_ptr) {
883-
valid_entity = false;
882+
goto invalid_incomplete_entity;
884883
} else {
885884
const size_t name_len = semi_colon_ptr - name_start;
886885
if (name_len == 0) {
887-
valid_entity = false;
886+
goto invalid_incomplete_entity;
888887
} else {
889888
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
890889
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
@@ -895,7 +894,7 @@ static void traverse_for_entities(
895894
* hack to support it */
896895
code = (unsigned)'\'';
897896
} else {
898-
valid_entity = false;
897+
goto invalid_incomplete_entity;
899898
}
900899
}
901900
entity_end_ptr = semi_colon_ptr;
@@ -904,45 +903,51 @@ static void traverse_for_entities(
904903
}
905904

906905
/* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
907-
if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') {
908-
*output_ptr++ = *current_ptr++;
909-
continue;
906+
if (entity_end_ptr == NULL) {
907+
goto invalid_incomplete_entity;
910908
}
911909

912910
/* Check if quotes are allowed for entities representing ' or " */
913911
if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
914912
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
915913
{
916-
valid_entity = false;
914+
goto invalid_complete_entity;
917915
}
918916

919917
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
920918
* the call is needed to ensure the codepoint <= U+00FF) */
921-
if (valid_entity && charset != cs_utf_8) {
919+
if (charset != cs_utf_8) {
922920
/* replace unicode code point */
923-
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
924-
valid_entity = false;
921+
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) {
922+
goto invalid_complete_entity;
923+
}
925924
}
926925

927-
if (valid_entity) {
928-
/* Write the parsed entity into the output buffer */
929-
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
930-
if (code2) {
931-
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
932-
}
933-
/* Move current_ptr past the semicolon */
934-
current_ptr = entity_end_ptr + 1;
926+
/* Write the parsed entity into the output buffer */
927+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
928+
if (code2) {
929+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
930+
}
931+
/* Move current_ptr past the semicolon */
932+
current_ptr = entity_end_ptr + 1;
933+
continue;
934+
935+
invalid_incomplete_entity:
936+
/* If the entity is invalid at parse stage or entity_end_ptr was never found, copy '&' as normal */
937+
*output_ptr++ = *current_ptr++;
938+
continue;
939+
940+
invalid_complete_entity:
941+
/* If the entity became invalid after we found entity_end_ptr */
942+
if (entity_end_ptr) {
943+
const size_t len = entity_end_ptr - current_ptr;
944+
memcpy(output_ptr, current_ptr, len);
945+
output_ptr += len;
946+
current_ptr = entity_end_ptr;
935947
} else {
936-
/* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
937-
if (entity_end_ptr) {
938-
const size_t len = entity_end_ptr - current_ptr;
939-
memcpy(output_ptr, current_ptr, len);
940-
output_ptr += len;
941-
current_ptr = entity_end_ptr;
942-
} else {
943-
*output_ptr++ = *current_ptr++;
944-
}
948+
*output_ptr++ = *current_ptr++;
945949
}
950+
continue;
946951
}
947952

948953
*output_ptr = '\0';

0 commit comments

Comments
 (0)