Major refactor: Introduce and use WP_Token_Set

In order to clarify the main loop of `_esc_attr_single_pass_utf8` I've moved the named character reference lookup outside of the function and into a new high-performance token set class dubbed `WP_Token_Set`. I created this class to retain the performance perks brought by the optimized data format. There are two lookup sets, though, because WordPress traditionally has its own custom set based on HTML4, but I would like to see us allow everything that HTML5 allows, including the common `&apos;` so we don't have to keep writing `&#39;` (because that doesn't stand out as clearly as the name does). Performance in this change is even better than it was previously because I've removed the substitutions from the lookup table, and that removes both iteration and working memory. In order to provide the reverse function, decoding these entities, it would probably be best to create two separate tables, or add a fixed byte length and offset value as a lookup into another table so that we can avoid reintroducing the double crawling scan that we had before.
dmsnell · Oct 3, 2023 · b5caa5c · b5caa5c
1 parent 49af2fd
commit b5caa5c
Show file tree

Hide file tree

Showing 6 changed files with 1,941 additions and 954 deletions.
diff --git a/src/wp-includes/class-wp-token-set.php b/src/wp-includes/class-wp-token-set.php
@@ -0,0 +1,197 @@
+<?php
+
+/**
+ * Compact set of byte-string tokens supporting membership checks and
+ * "read the longest token starting at this offset" lookups.
+ *
+ * Words longer than KEY_LENGTH bytes are grouped by their first KEY_LENGTH
+ * bytes; the remainder of each word is stored in its group's string as a
+ * single length byte followed by the remaining bytes. Words of KEY_LENGTH
+ * bytes or fewer are stored in one packed string of fixed-width slots,
+ * padded on the right with NUL bytes.
+ *
+ * Because NUL is the padding byte, short words containing NUL bytes cannot
+ * be told apart from shorter words and are not supported.
+ */
+class WP_Token_Set {
+	/**
+	 * Number of leading bytes used as the group key for long words, and
+	 * the width in bytes of each slot in the short-word string.
+	 */
+	const KEY_LENGTH = 2;
+
+	/**
+	 * Words of this many bytes or more are rejected by `from_array()`;
+	 * the remainder of each long word must fit its one-byte length prefix.
+	 */
+	const MAX_LENGTH = 256;
+
+	/**
+	 * Stores an optimized form of the word set, where words are grouped
+	 * by first two letters and then collapsed into a string.
+	 *
+	 * @var array
+	 */
+	private $large_words = array();
+
+	/**
+	 * Stores an optimized row of short words, where every entry is two
+	 * bytes long and zero-extended if the word is only a single byte.
+	 *
+	 * @var string
+	 */
+	private $small_words = '';
+
+	/**
+	 * Builds a token set from a list of words.
+	 *
+	 * Duplicate words are stored once. Within each group, and within the
+	 * short words, entries are ordered longest-first so that `read_token()`
+	 * finds the longest match before any shorter one.
+	 *
+	 * @param array $words List of string words to store.
+	 * @return WP_Token_Set|null The set, or null if any word is not a string
+	 *                           or is MAX_LENGTH bytes or longer.
+	 */
+	public static function from_array( $words ) {
+		$set = new WP_Token_Set();
+
+		// Start by grouping words.
+
+		$groups = array();
+		$shorts = array();
+		$seen   = array();
+		foreach ( $words as $word ) {
+			if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) {
+				return null;
+			}
+
+			// Storing a word twice only wastes space and scanning time.
+			if ( isset( $seen[ $word ] ) ) {
+				continue;
+			}
+			$seen[ $word ] = true;
+
+			if ( self::KEY_LENGTH >= strlen( $word ) ) {
+				$shorts[] = $word;
+				continue;
+			}
+
+			$group = substr( $word, 0, self::KEY_LENGTH );
+
+			if ( ! isset( $groups[ $group ] ) ) {
+				$groups[ $group ] = array();
+			}
+
+			$groups[ $group ][] = substr( $word, self::KEY_LENGTH );
+		}
+
+		// Sort the words by longest-first, then alphabetical.
+
+		usort( $shorts, array( self::class, 'longest_first_then_alphabetical' ) );
+		foreach ( $groups as $group_key => $group ) {
+			usort( $groups[ $group_key ], array( self::class, 'longest_first_then_alphabetical' ) );
+		}
+
+		// Finally construct the optimized lookups.
+
+		foreach ( $shorts as $word ) {
+			$set->small_words .= str_pad( $word, self::KEY_LENGTH, "\x00" );
+		}
+
+		foreach ( $groups as $group => $group_words ) {
+			$group_string = '';
+
+			foreach ( $group_words as $word ) {
+				// One length byte, then the bytes following the group key.
+				$group_string .= chr( strlen( $word ) ) . $word;
+			}
+
+			$set->large_words[ $group ] = $group_string;
+		}
+
+		return $set;
+	}
+
+	/**
+	 * Builds a token set directly from already-optimized lookup data, such
+	 * as the source emitted by `precomputed_php_source_table()`.
+	 *
+	 * The data is stored as given, without validation.
+	 *
+	 * @param array  $large_words Map of group key to packed group string.
+	 * @param string $small_words Packed string of NUL-padded short words.
+	 * @return WP_Token_Set
+	 */
+	public static function from_precomputed_table( $large_words, $small_words ) {
+		$set = new WP_Token_Set();
+
+		$set->large_words = $large_words;
+		$set->small_words = $small_words;
+
+		return $set;
+	}
+
+	/**
+	 * Indicates whether the given word is a member of the set.
+	 *
+	 * @param string $word Word to look for; must match a stored word exactly.
+	 * @return bool
+	 */
+	public function contains( $word ) {
+		if ( self::KEY_LENGTH >= strlen( $word ) ) {
+			return $this->has_small_word( $word );
+		}
+
+		$group_key = substr( $word, 0, self::KEY_LENGTH );
+		if ( ! isset( $this->large_words[ $group_key ] ) ) {
+			return false;
+		}
+
+		$group        = $this->large_words[ $group_key ];
+		$group_length = strlen( $group );
+		$slug         = substr( $word, self::KEY_LENGTH );
+		$length       = strlen( $slug );
+		$at           = 0;
+		while ( $at < $group_length ) {
+			$token_length = ord( $group[ $at++ ] );
+			if ( $token_length === $length && 0 === substr_compare( $group, $slug, $at, $token_length ) ) {
+				return true;
+			}
+
+			$at += $token_length;
+		}
+
+		return false;
+	}
+
+	/**
+	 * Finds the token from the set that starts at the given byte offset.
+	 *
+	 * Long words are tried first, in their stored order, and then short
+	 * words from widest to narrowest, so the longest match wins when the
+	 * set was built by `from_array()`.
+	 *
+	 * @param string $text   Text in which to look for a token.
+	 * @param int    $offset Byte offset into `$text` at which the token must start.
+	 * @return string|false The matched token, or false if no token starts there.
+	 */
+	public function read_token( $text, $offset = 0 ) {
+		$text_length = strlen( $text );
+		if ( $offset < 0 || $offset >= $text_length ) {
+			return false;
+		}
+
+		// Only the bytes from the offset onward can take part in a match.
+		$remaining = $text_length - $offset;
+
+		// Search for a long word first, if enough text remains, and if that fails, a short one.
+		if ( self::KEY_LENGTH < $remaining ) {
+			$group_key = substr( $text, $offset, self::KEY_LENGTH );
+
+			// A missing group rules out long words only; short words are still tried below.
+			if ( isset( $this->large_words[ $group_key ] ) ) {
+				$group        = $this->large_words[ $group_key ];
+				$group_length = strlen( $group );
+				$slug_at      = $offset + self::KEY_LENGTH;
+				$at           = 0;
+				while ( $at < $group_length ) {
+					$token_length = ord( $group[ $at++ ] );
+					$token        = substr( $group, $at, $token_length );
+
+					if ( 0 === substr_compare( $text, $token, $slug_at, $token_length ) ) {
+						return $group_key . $token;
+					}
+
+					$at += $token_length;
+				}
+			}
+		}
+
+		// Perhaps a short word then: try the widest candidate first.
+		for ( $length = min( self::KEY_LENGTH, $remaining ); $length > 0; $length-- ) {
+			$candidate = substr( $text, $offset, $length );
+
+			// NUL is the padding byte, so such a candidate would alias a shorter word.
+			if ( false !== strpos( $candidate, "\x00" ) ) {
+				continue;
+			}
+
+			if ( $this->has_small_word( $candidate ) ) {
+				return $candidate;
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Generates PHP source code which rebuilds this set through
+	 * `from_precomputed_table()` without re-running `from_array()`.
+	 *
+	 * Each group is preceded by a comment line listing its words, written
+	 * with a leading `&`. String data is emitted with `\`, `"`, `$` and
+	 * non-printable bytes escaped so the generated literals decode back to
+	 * the stored bytes.
+	 *
+	 * @param string $indent Whitespace used for one level of indentation.
+	 * @return string PHP source for a call expression, ending in `);` and a newline.
+	 */
+	public function precomputed_php_source_table( $indent = "\t" ) {
+		$i1 = $indent;
+		$i2 = $indent . $indent;
+
+		$output  = self::class . "::from_precomputed_table(\n";
+		$output .= $i1 . "array(\n";
+
+		foreach ( $this->large_words as $prefix => $group ) {
+			// Numeric-looking prefixes come back from the array as integers.
+			$prefix       = (string) $prefix;
+			$group_length = strlen( $group );
+			$comment_line = "{$i2}//";
+			$data_line    = "{$i2}'" . addcslashes( $prefix, "'\\" ) . "' => \"";
+			$at           = 0;
+			while ( $at < $group_length ) {
+				$length = ord( $group[ $at++ ] );
+				$token  = self::escape_for_double_quotes( substr( $group, $at, $length ) );
+				$at    += $length;
+
+				$comment_line .= ' &' . self::escape_for_double_quotes( $prefix ) . $token;
+				// Always two hex digits so a following token byte is never read as part of the escape.
+				$data_line .= sprintf( '\\x%02x', $length ) . $token;
+			}
+			$comment_line .= "\n";
+			$data_line    .= "\",\n";
+
+			$output .= $comment_line;
+			$output .= $data_line;
+		}
+
+		$output .= "{$i1}),\n";
+		// Double quotes are required here: `\x00` is not an escape inside single quotes.
+		$small_text = self::escape_for_double_quotes( $this->small_words );
+		$output    .= "{$i1}\"{$small_text}\"\n";
+		$output    .= ");\n";
+
+		return $output;
+	}
+
+	/**
+	 * Checks the short-word string for a word, comparing whole slots only
+	 * so that bytes straddling two neighboring words never match.
+	 *
+	 * @param string $word Word of at most KEY_LENGTH bytes.
+	 * @return bool
+	 */
+	private function has_small_word( $word ) {
+		$needle = str_pad( $word, self::KEY_LENGTH, "\x00" );
+		$last   = strlen( $this->small_words ) - self::KEY_LENGTH;
+
+		for ( $at = 0; $at <= $last; $at += self::KEY_LENGTH ) {
+			if ( 0 === substr_compare( $this->small_words, $needle, $at, self::KEY_LENGTH ) ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Escapes raw bytes for use inside a double-quoted PHP string literal.
+	 *
+	 * @param string $bytes Raw bytes.
+	 * @return string Printable ASCII which decodes to `$bytes` inside double quotes.
+	 */
+	private static function escape_for_double_quotes( $bytes ) {
+		$escaped = '';
+		$length  = strlen( $bytes );
+
+		for ( $i = 0; $i < $length; $i++ ) {
+			$byte = $bytes[ $i ];
+			$code = ord( $byte );
+
+			if ( '\\' === $byte || '"' === $byte || '$' === $byte ) {
+				$escaped .= '\\' . $byte;
+			} elseif ( $code < 0x20 || $code > 0x7E ) {
+				$escaped .= sprintf( '\\x%02x', $code );
+			} else {
+				$escaped .= $byte;
+			}
+		}
+
+		return $escaped;
+	}
+
+	/**
+	 * Sort callback placing longer strings first and ordering strings of
+	 * equal length by `strcmp()`.
+	 *
+	 * @param string $a First string.
+	 * @param string $b Second string.
+	 * @return int Negative, zero, or positive, as expected by `usort()`.
+	 */
+	private static function longest_first_then_alphabetical( $a, $b ) {
+		if ( $a === $b ) {
+			return 0;
+		}
+
+		$la = strlen( $a );
+		$lb = strlen( $b );
+
+		// Longer strings are less-than for comparison's sake.
+		if ( $la !== $lb ) {
+			return $lb - $la;
+		}
+
+		return strcmp( $a, $b );
+	}
+}
diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
@@ -923,16 +923,38 @@ function seems_utf8( $str ) {
  *  - Truncates superfluous leading zeros in numeric character references: e.g. `&#x0003F;` becomes `&#3F;`.
  *  - Leaves valid character references untouched: e.g. `&hellip;` remains `&hellip;`.
  *
- * @param string $text Unescaped raw string input bound for an HTML attribute.
+ * @param string              $text               Unescaped raw string input bound for an HTML attribute.
+ * @param string|WP_Token_Set $allowable_entities 'legacy' for legacy WordPress allowables,
+ *                                                'html5' for everything in the HTML5 spec,
+ *                                                or a WP_Token_Set for custom needs.
  * @return string
  */
-function _esc_attr_single_pass_utf8( $text ) {
-	global $allowedentitynames, $named_character_reference_lookup_table;
+function _esc_attr_single_pass_utf8( $text, $allowable_entities = 'legacy' ) {
+	global $html4_named_character_entity_set;
+	global $html5_named_character_entity_set;
 
 	if ( 0 === strlen( $text ) ) {
 		return $text;
 	}
 
+	switch ( $allowable_entities ) {
+		case 'legacy':
+			$entity_set = $html4_named_character_entity_set;
+			break;
+
+		case 'html5':
+			$entity_set = $html5_named_character_entity_set;
+			break;
+
+		default:
+			if ( $allowable_entities instanceof WP_Token_Set ) {
+				$entity_set = $allowable_entities;
+				break;
+			} else {
+				return $text;
+			}
+	}
+
 	$at     = 0;
 	$output = '';
 	$length = strlen( $text );
@@ -1099,118 +1121,46 @@ function _esc_attr_single_pass_utf8( $text ) {
 				// Advance past the `&`.
 				++$name_at;
 
-				// &Aacute; -> group "Aa" (skip & since we know it's there).
-				$group_key = substr( $text, $name_at, 2 );
-				// Match cannot form a named character reference.
-				if ( ! array_key_exists( $group_key, $named_character_reference_lookup_table ) ) {
+				$name = $entity_set->read_token( $text, $name_at );
+				if ( false === $name ) {
 					$output .= '&amp;';
 					++$at;
 					break;
 				}
 
-				$name_at += 2;
-				$group    = $named_character_reference_lookup_table[ $group_key ];
-
-				$i = 0;
-				while ( $i < strlen( $group ) ) {
-					/*
-					 * Extract name and substitution information from group string.
-					 *
-					 * Example:
-					 *
-					 * For group "qu", during lookup that will find "&quot;"
-					 *
-					 * ┌─────┬────┬──────┬────┬──────────────┬────┬─────┐
-					 * │ ... │ N5 │ Name │ S5 │ Substitution │ N6 │ ... │
-					 * ├─────┼────┼──────┼────┼──────────────┼────┼─────┤
-					 * │ ... │ 03 │ ot;  │ 01 │ "            │ 03 │ ... │
-					 * └─────┴────┴──────┴────┴──────────────┴────┴─────┘
-					 *         ^^          ^^
-					 *          |           |
-					 *          |           ╰ The substitution is one byte,
-					 *          |             even though it's represented in
-					 *          |             the string literal as "\x22", which
-					 *          |             is done for the sake of avoiding
-					 *          |             quoting issues in PHP.
-					 *          |
-					 *          ╰ The "ot;" is three bytes (the finishing of &quo̱t;).
-					 *
-					 * The part of the group string this represents follows:
-					 * > ...\x03ot;\x01\x22\x03...
-					 *
-					 * So we can see that we read a single character and interpret
-					 * it as a byte containing the length of the bytes in the name,
-					 * then we read the name, then the byte after that indicates how
-					 * many bytes are in the substitution string for that name, then
-					 * we start the next name pair until we reach the end of the
-					 * group string.
-					 *
-					 */
-					$name_length = ord( $group[ $i++ ] );
-					$name        = substr( $group, $i, $name_length );
-					$i          += $name_length;
-					$sub_length  = ord( $group[ $i++ ] );
-					$i          += $sub_length;
-
-					// The end of the document came mid-name or the name is not a match.
-					if ( $name_at + $name_length > $length || 0 !== substr_compare( $text, $name, $name_at, $name_length ) ) {
-						continue;
-					}
-
-					$name_at += $name_length;
+				$name_at += strlen( $name );
 
-//					$semicolon_delta = ';' === $name[ $name_length - 1 ] ? -1 : 0;
-//					$reference_name  = substr( $text, $at + 1, $name_at - ( $at + 1 ) + $semicolon_delta );
-
-					/*
-					 * Some names are not allowed by WordPress, even though they are permitted by HTML.
-					 *
-					 * @TODO: Is there a reason these are limited, or was it simply that not all of the
-					 *        original named character references were added? Is there a reason not to
-					 *        allow all of them? There don't seem to be plugins changing this list.
-					 */
-//					if ( ! in_array( $reference_name, $allowedentitynames, true ) ) {
-//						$output .= '&amp;' . substr( $text, $at + 1, $name_at - ( $at + 1 ) );
-//						$at      = $name_at;
-//						break 2;
-//					}
-
-					// If we have an un-ambiguous ampersand we can safely leave it in.
-					if ( ';' === $text[ $name_at - 1 ] ) {
-						$output .= substr( $text, $at, $name_at - $at );
-						$at      = $name_at;
-						break 2;
-					}
-
-					/*
-					 * At this point though have matched an entry in the named
-					 * character reference table but the match doesn't end in `;`.
-					 * We need to determine if the next letter makes it an ambiguous.
-					 */
-					$ambiguous_follower = (
-						$name_at < $length &&
-						(
-							ctype_alnum( $text[ $name_at ] ) ||
-							'=' === $text[ $name_at ]
-						)
-					);
+				// If we have an un-ambiguous ampersand we can safely leave it in.
+				if ( ';' === $text[ $name_at - 1 ] ) {
+					$output .= substr( $text, $at, $name_at - $at );
+					$at      = $name_at;
+					break;
+				}
 
-					// It's non-ambiguous, safe to leave it in.
-					if ( ! $ambiguous_follower ) {
-						$output .= substr( $text, $at, $name_at - $at );
-						$at      = $name_at;
-						break 2;
-					}
+				/*
+				 * At this point though have matched an entry in the named
+				 * character reference table but the match doesn't end in `;`.
+				 * We need to determine if the next letter makes it an ambiguous.
+				 */
+				$ambiguous_follower = (
+					$name_at < $length &&
+					(
+						ctype_alnum( $text[ $name_at ] ) ||
+						'=' === $text[ $name_at ]
+					)
+				);
 
-					// Ambiguous ampersands are not allowed in an attribute, escape it.
-					$output .= '&amp;' . substr( $text, $at + 1, $name_at - ( $at + 1 ) );
+				// It's non-ambiguous, safe to leave it in.
+				if ( ! $ambiguous_follower ) {
+					$output .= substr( $text, $at, $name_at - $at );
 					$at      = $name_at;
-					break 2;
+					break;
 				}
 
-				// The character wasn't found in the groups.
-				$output .= '&amp;';
-				++$at;
+				// Ambiguous ampersands are not allowed in an attribute, escape it.
+				$output .= '&amp;' . substr( $text, $at + 1, $name_at - ( $at + 1 ) );
+				$at      = $name_at;
+				break;
 		}
 	}