From 24d857e036aad1b8a8c7930a6128371baec0959f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Fri, 31 May 2024 23:14:44 +0200 Subject: [PATCH 1/4] Use atomic group for regex keywords matching -2.5% improved runtime --- src/Tokenizer.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index b683731..80f9e7b 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -770,24 +770,24 @@ public function __construct() }; // Set up regular expressions - $this->regexBoundaries = '(' . implode( + $this->regexBoundaries = '(?>' . implode( '|', $this->quoteRegex($this->boundaries), ) . ')'; - $this->regexReserved = '(' . implode( + $this->regexReserved = '(?>' . implode( '|', $this->quoteRegex($sortByLengthFx($this->reserved)), ) . ')'; - $this->regexReservedToplevel = str_replace(' ', '\s+', '(' . implode( + $this->regexReservedToplevel = str_replace(' ', '\s+', '(?>' . implode( '|', $this->quoteRegex($sortByLengthFx($this->reservedToplevel)), ) . ')'); - $this->regexReservedNewline = str_replace(' ', '\s+', '(' . implode( + $this->regexReservedNewline = str_replace(' ', '\s+', '(?>' . implode( '|', $this->quoteRegex($sortByLengthFx($this->reservedNewline)), ) . ')'); - $this->regexFunction = '(' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')'; + $this->regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')'; } /** From 7d3ff7b665c212c43ab46b3e5012953a104d8d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Fri, 31 May 2024 23:29:52 +0200 Subject: [PATCH 2/4] Build regexes only once -63% improved runtime --- src/Tokenizer.php | 52 +++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 80f9e7b..1a98073 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -720,11 +720,13 @@ final class Tokenizer // Regular expressions for tokenizing - private readonly string $regexBoundaries; - private readonly string $regexReserved; - private readonly string $regexReservedNewline; - private readonly string $regexReservedToplevel; - private readonly string $regexFunction; + private readonly string $nextTokenRegexNumber; + private readonly string $nextTokenRegexBoundaryCharacter; + private readonly string $nextTokenRegexReservedToplevel; + private readonly string $nextTokenRegexReservedNewline; + private readonly string $nextTokenRegexReserved; + private readonly string $nextTokenRegexFunction; + private readonly string $nextTokenRegexNonReserved; /** * Punctuation that can be used as a boundary between other tokens @@ -770,24 +772,31 @@ public function __construct() }; // Set up regular expressions - $this->regexBoundaries = '(?>' . implode( + $regexBoundaries = '(?>' . implode( '|', $this->quoteRegex($this->boundaries), ) . ')'; - $this->regexReserved = '(?>' . implode( + $regexReserved = '(?>' . implode( '|', $this->quoteRegex($sortByLengthFx($this->reserved)), ) . ')'; - $this->regexReservedToplevel = str_replace(' ', '\s+', '(?>' . implode( + $regexReservedToplevel = '(?>' . str_replace(' ', '\s+', implode( '|', $this->quoteRegex($sortByLengthFx($this->reservedToplevel)), - ) . ')'); - $this->regexReservedNewline = str_replace(' ', '\s+', '(?>' . implode( + )) . ')'; + $regexReservedNewline = '(?>' . str_replace(' ', '\s+', implode( '|', $this->quoteRegex($sortByLengthFx($this->reservedNewline)), - ) . ')'); + )) . ')'; + $regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')'; - $this->regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')'; + $this->nextTokenRegexNumber = '/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $regexBoundaries . ')/'; + $this->nextTokenRegexBoundaryCharacter = '/\G(' . $regexBoundaries . ')/'; + $this->nextTokenRegexReservedToplevel = '/\G(' . $regexReservedToplevel . ')($|\s|' . $regexBoundaries . ')/'; + $this->nextTokenRegexReservedNewline = '/\G(' . $regexReservedNewline . ')($|\s|' . $regexBoundaries . ')/'; + $this->nextTokenRegexReserved = '/\G(' . $regexReserved . ')($|\s|' . $regexBoundaries . ')/'; + $this->nextTokenRegexFunction = '/\G(' . $regexFunction . '[(]|\s|[)])/'; + $this->nextTokenRegexNonReserved = '/\G(.*?)($|\s|["\'`]|' . $regexBoundaries . ')/'; } /** @@ -883,7 +892,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok $value = $firstChar . $this->getNextQuotedString($string, $offset + 1); } else { // Non-quoted variable name - preg_match('/\G(' . $firstChar . '[\w.$]+)/', $string, $matches, 0, $offset); + preg_match('/\G([@:][\w.$]+)/', $string, $matches, 0, $offset); if ($matches) { $value = $matches[1]; } @@ -897,7 +906,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Number (decimal, binary, or hex) if ( preg_match( - '/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/', + $this->nextTokenRegexNumber, $string, $matches, 0, @@ -908,7 +917,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok } // Boundary Character (punctuation and symbols) - if (preg_match('/\G(' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset)) { + if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) { return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]); } @@ -918,7 +927,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Top Level Reserved Word if ( preg_match( - '/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/', + $this->nextTokenRegexReservedToplevel, $upper, $matches, 0, @@ -934,7 +943,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Newline Reserved Word if ( preg_match( - '/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/', + $this->nextTokenRegexReservedNewline, $upper, $matches, 0, @@ -950,7 +959,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Other Reserved Word if ( preg_match( - '/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/', + $this->nextTokenRegexReserved, $upper, $matches, 0, @@ -965,9 +974,8 @@ private function createNextToken(string $string, string $upper, int $offset, Tok } // A function must be succeeded by '(' - // this makes it so "count(" is considered a function, but "count" alone is not - // function - if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) { + // this makes it so "count(" is considered a function, but "count" alone is not function + if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) { return new Token( Token::TOKEN_TYPE_RESERVED, substr($string, $offset, strlen($matches[1]) - 1), @@ -975,7 +983,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok } // Non reserved word - preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset); + preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset); return new Token(Token::TOKEN_TYPE_WORD, $matches[1]); } From 19bb3ab1ae5ca5b975f0974fe084beb9bdc1db6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Mon, 3 Jun 2024 23:44:10 +0200 Subject: [PATCH 3/4] Minimize capturing groups in tokenizer regexes -4% improved runtime --- src/Tokenizer.php | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 1a98073..0fcb327 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -790,13 +790,13 @@ public function __construct() )) . ')'; $regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')'; - $this->nextTokenRegexNumber = '/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $regexBoundaries . ')/'; - $this->nextTokenRegexBoundaryCharacter = '/\G(' . $regexBoundaries . ')/'; - $this->nextTokenRegexReservedToplevel = '/\G(' . $regexReservedToplevel . ')($|\s|' . $regexBoundaries . ')/'; - $this->nextTokenRegexReservedNewline = '/\G(' . $regexReservedNewline . ')($|\s|' . $regexBoundaries . ')/'; - $this->nextTokenRegexReserved = '/\G(' . $regexReserved . ')($|\s|' . $regexBoundaries . ')/'; - $this->nextTokenRegexFunction = '/\G(' . $regexFunction . '[(]|\s|[)])/'; - $this->nextTokenRegexNonReserved = '/\G(.*?)($|\s|["\'`]|' . $regexBoundaries . ')/'; + $this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/'; + $this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/'; + $this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/'; + $this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/'; + $this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/'; + $this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/'; + $this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/'; } /** @@ -838,7 +838,6 @@ public function tokenize(string $string): Cursor */ private function createNextToken(string $string, string $upper, int $offset, Token|null $previous = null): Token { - $matches = []; // Whitespace if (preg_match('/\G\s+/', $string, $matches, 0, $offset)) { return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]); @@ -892,9 +891,9 @@ private function createNextToken(string $string, string $upper, int $offset, Tok $value = $firstChar . $this->getNextQuotedString($string, $offset + 1); } else { // Non-quoted variable name - preg_match('/\G([@:][\w.$]+)/', $string, $matches, 0, $offset); + preg_match('/\G[@:][\w.$]+/', $string, $matches, 0, $offset); if ($matches) { - $value = $matches[1]; + $value = $matches[0]; } } @@ -913,12 +912,12 @@ private function createNextToken(string $string, string $upper, int $offset, Tok $offset, ) ) { - return new Token(Token::TOKEN_TYPE_NUMBER, $matches[1]); + return new Token(Token::TOKEN_TYPE_NUMBER, $matches[0]); } // Boundary Character (punctuation and symbols) if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) { - return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]); + return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[0]); } // A reserved word cannot be preceded by a '.' @@ -936,7 +935,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok ) { return new Token( Token::TOKEN_TYPE_RESERVED_TOPLEVEL, - substr($string, $offset, strlen($matches[1])), + substr($string, $offset, strlen($matches[0])), ); } @@ -952,7 +951,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok ) { return new Token( Token::TOKEN_TYPE_RESERVED_NEWLINE, - substr($string, $offset, strlen($matches[1])), + substr($string, $offset, strlen($matches[0])), ); } @@ -968,7 +967,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok ) { return new Token( Token::TOKEN_TYPE_RESERVED, - substr($string, $offset, strlen($matches[1])), + substr($string, $offset, strlen($matches[0])), ); } } @@ -978,14 +977,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) { return new Token( Token::TOKEN_TYPE_RESERVED, - substr($string, $offset, strlen($matches[1]) - 1), + substr($string, $offset, strlen($matches[0])), ); } // Non reserved word preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset); - return new Token(Token::TOKEN_TYPE_WORD, $matches[1]); + return new Token(Token::TOKEN_TYPE_WORD, $matches[0]); } /** From fda3d6edc460d547bf7707e4f85742b0a9c7323e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Sun, 2 Jun 2024 02:51:32 +0200 Subject: [PATCH 4/4] Build regexes from lists using deduplicated function --- src/Tokenizer.php | 47 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 0fcb327..ee2ed86 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -771,24 +771,22 @@ public function __construct() return array_keys($valuesMap); }; + $buildRegexFromListFx = static function ($values) use ($sortByLengthFx) { + return '(?>' . implode( + '|', + array_map( + static fn ($v) => preg_quote($v, '/'), + $sortByLengthFx($values), + ), + ) . ')'; + }; + // Set up regular expressions - $regexBoundaries = '(?>' . implode( - '|', - $this->quoteRegex($this->boundaries), - ) . ')'; - $regexReserved = '(?>' . implode( - '|', - $this->quoteRegex($sortByLengthFx($this->reserved)), - ) . ')'; - $regexReservedToplevel = '(?>' . str_replace(' ', '\s+', implode( - '|', - $this->quoteRegex($sortByLengthFx($this->reservedToplevel)), - )) . ')'; - $regexReservedNewline = '(?>' . str_replace(' ', '\s+', implode( - '|', - $this->quoteRegex($sortByLengthFx($this->reservedNewline)), - )) . ')'; - $regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')'; + $regexBoundaries = $buildRegexFromListFx($this->boundaries); + $regexReserved = $buildRegexFromListFx($this->reserved); + $regexReservedToplevel = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedToplevel)); + $regexReservedNewline = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedNewline)); + $regexFunction = $buildRegexFromListFx($this->functions); $this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/'; $this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/'; @@ -987,21 +985,6 @@ private function createNextToken(string $string, string $upper, int $offset, Tok return new Token(Token::TOKEN_TYPE_WORD, $matches[0]); } - /** - * Helper function for building regular expressions for reserved words and boundary characters - * - * @param string[] $strings The strings to be quoted - * - * @return string[] The quoted strings - */ - private function quoteRegex(array $strings): array - { - return array_map( - static fn (string $string): string => preg_quote($string, '/'), - $strings, - ); - } - private function getNextQuotedString(string $string, int $offset): string { $ret = '';