Skip to content

Commit

Permalink
Provisionally: add back CDATA and PI nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed Jan 12, 2024
1 parent d80f2b9 commit 2051d38
Showing 1 changed file with 103 additions and 1 deletion.
104 changes: 103 additions & 1 deletion src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1664,6 +1664,24 @@ private function parse_next_tag() {
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;

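// Identify nodes that would be CDATA if HTML had CDATA sections.
// NOTE(review): only the single `]` immediately before `>` is verified below,
// yet the length adjustment strips two trailing bytes as if the token ended
// in `]]>`. Input such as `<![CDATA[]>` would yield a text length of -1.
// Consider also requiring `']' === $html[ $closer_at - 2 ]`.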
if (
$this->token_length >= 10 &&
'[' === $html[ $this->token_starts_at + 2 ] &&
'C' === $html[ $this->token_starts_at + 3 ] &&
'D' === $html[ $this->token_starts_at + 4 ] &&
'A' === $html[ $this->token_starts_at + 5 ] &&
'T' === $html[ $this->token_starts_at + 6 ] &&
'A' === $html[ $this->token_starts_at + 7 ] &&
'[' === $html[ $this->token_starts_at + 8 ] &&
']' === $html[ $closer_at - 1 ]
) {
$this->parser_state = self::STATE_CDATA_NODE;
$this->text_starts_at += 7;
$this->text_length -= 9;
}

return true;
}

Expand Down Expand Up @@ -1700,6 +1718,41 @@ private function parse_next_tag() {
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;

/*
* Identify a Processing Instruction node were HTML to have them.
*
* XML allows for more target names, but this code only identifies
* a subset. This is more or less okay because ultimately these are
* HTML comments in the DOM and this safely supports _some_ kinds
* of PI Nodes without getting lost while parsing.
*
* This code identifies processing instruction nodes whose target
* name can be represented in single-byte UTF-8 / 7-bit ASCII.
*
* > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
* [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
* [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
* [#x10000-#xEFFFF]
* > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
*
* @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
*/
if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
$comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );

if ( 0 < $pi_target_length ) {
$pi_target_length += strspn( $comment_text,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );

$this->parser_state = self::STATE_PI_NODE;
$this->tag_name_starts_at = $this->token_starts_at + 2;
$this->tag_name_length = $pi_target_length;
$this->text_starts_at += $pi_target_length;
$this->text_length -= $pi_target_length + 1;
}
}

return true;
}

Expand Down Expand Up @@ -2507,6 +2560,9 @@ public function get_token_type() {
case self::STATE_DOCTYPE:
return '#doctype';

case self::STATE_PI_NODE:
return '#processing-instruction';

default:
return $this->get_token_name();
}
Expand Down Expand Up @@ -2540,6 +2596,12 @@ public function get_token_name() {
case self::STATE_TEXT_NODE:
return '#text';

case self::STATE_CDATA_NODE:
return '#cdata-section';

case self::STATE_PI_NODE:
return substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );

case self::STATE_COMMENT:
return '#comment';

Expand Down Expand Up @@ -2580,7 +2642,15 @@ public function get_modifiable_text() {
$at = $this->text_starts_at;
$length = $this->text_length;
$text = substr( $this->html, $at, $length );
$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );

if (
self::STATE_CDATA_NODE === $this->parser_state ||
self::STATE_PI_NODE === $this->parser_state
) {
return $text;
}

$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );

if ( empty( $text ) ) {
return '';
Expand Down Expand Up @@ -3135,6 +3205,38 @@ private function matches() {
*/
const STATE_TEXT_NODE = 'STATE_TEXT_NODE';

/**
* Parser CDATA Node State.
*
* Indicates that the parser has found a CDATA node and it's possible
* to read and modify its modifiable text. Note that in HTML there are
* no CDATA nodes outside foreign elements (SVG and MathML). Outside
* of foreign elements, they are treated as HTML comments. Nonetheless,
* the Tag Processor still recognizes them as they appear in the HTML
* stream and exposes them for inspection and modification.
*
* Tokens in this state report `#cdata-section` as their token name.
*
* @since 6.5.0
*
* @access private
*/
const STATE_CDATA_NODE = 'STATE_CDATA_NODE';

/**
* Parser Processing Instruction State.
*
* Indicates that the parser has found a Processing Instruction and
* it's possible to read and modify its modifiable text. Note that in
* HTML there are no Processing Instruction nodes and they are treated
* as HTML comments. Nonetheless, the Tag Processor still recognizes
* them as they appear in the HTML stream and exposes them for
* inspection and modification.
*
* Tokens in this state report `#processing-instruction` as their token
* type and the instruction's target as their token name. Only targets
* that start with an ASCII letter, `:`, or `_` and continue with ASCII
* letters, digits, `:`, `_`, `-`, or `.` are recognized.
*
* @since 6.5.0
*
* @access private
*/
const STATE_PI_NODE = 'STATE_PI_NODE';

/**
* Indicates that the parser has found an HTML comment and it's
* possible to read and modify its modifiable text.
Expand Down

0 comments on commit 2051d38

Please sign in to comment.