Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve tokenizer regex matching #131

Merged
merged 4 commits into from
Jun 22, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 47 additions & 57 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -720,11 +720,13 @@ final class Tokenizer

// Regular expressions for tokenizing

private readonly string $regexBoundaries;
private readonly string $regexReserved;
private readonly string $regexReservedNewline;
private readonly string $regexReservedToplevel;
private readonly string $regexFunction;
private readonly string $nextTokenRegexNumber;
private readonly string $nextTokenRegexBoundaryCharacter;
private readonly string $nextTokenRegexReservedToplevel;
private readonly string $nextTokenRegexReservedNewline;
private readonly string $nextTokenRegexReserved;
private readonly string $nextTokenRegexFunction;
private readonly string $nextTokenRegexNonReserved;

/**
* Punctuation that can be used as a boundary between other tokens
Expand Down Expand Up @@ -769,25 +771,30 @@ public function __construct()
return array_keys($valuesMap);
};

// Set up regular expressions
$this->regexBoundaries = '(' . implode(
'|',
$this->quoteRegex($this->boundaries),
) . ')';
$this->regexReserved = '(' . implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reserved)),
) . ')';
$this->regexReservedToplevel = str_replace(' ', '\s+', '(' . implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reservedToplevel)),
) . ')');
$this->regexReservedNewline = str_replace(' ', '\s+', '(' . implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reservedNewline)),
) . ')');
$buildRegexFromListFx = static function ($values) use ($sortByLengthFx) {
return '(?>' . implode(
'|',
array_map(
static fn ($v) => preg_quote($v, '/'),
$sortByLengthFx($values),
),
) . ')';
};

$this->regexFunction = '(' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')';
// Set up regular expressions
$regexBoundaries = $buildRegexFromListFx($this->boundaries);
$regexReserved = $buildRegexFromListFx($this->reserved);
$regexReservedToplevel = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedToplevel));
$regexReservedNewline = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedNewline));
$regexFunction = $buildRegexFromListFx($this->functions);

$this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
$this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
$this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
$this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
}

/**
Expand Down Expand Up @@ -829,7 +836,6 @@ public function tokenize(string $string): Cursor
*/
private function createNextToken(string $string, string $upper, int $offset, Token|null $previous = null): Token
{
$matches = [];
// Whitespace
if (preg_match('/\G\s+/', $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]);
Expand Down Expand Up @@ -883,9 +889,9 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
$value = $firstChar . $this->getNextQuotedString($string, $offset + 1);
} else {
// Non-quoted variable name
preg_match('/\G(' . $firstChar . '[\w.$]+)/', $string, $matches, 0, $offset);
preg_match('/\G[@:][\w.$]+/', $string, $matches, 0, $offset);
if ($matches) {
$value = $matches[1];
$value = $matches[0];
}
}

Expand All @@ -897,19 +903,19 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Number (decimal, binary, or hex)
if (
preg_match(
'/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexNumber,
greg0ire marked this conversation as resolved.
Show resolved Hide resolved
$string,
$matches,
0,
$offset,
)
) {
return new Token(Token::TOKEN_TYPE_NUMBER, $matches[1]);
return new Token(Token::TOKEN_TYPE_NUMBER, $matches[0]);
}

// Boundary Character (punctuation and symbols)
if (preg_match('/\G(' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]);
if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[0]);
}

// A reserved word cannot be preceded by a '.'
Expand All @@ -918,7 +924,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Top Level Reserved Word
if (
preg_match(
'/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReservedToplevel,
$upper,
$matches,
0,
Expand All @@ -927,14 +933,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
) {
return new Token(
Token::TOKEN_TYPE_RESERVED_TOPLEVEL,
substr($string, $offset, strlen($matches[1])),
substr($string, $offset, strlen($matches[0])),
);
}

// Newline Reserved Word
if (
preg_match(
'/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReservedNewline,
$upper,
$matches,
0,
Expand All @@ -943,14 +949,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
) {
return new Token(
Token::TOKEN_TYPE_RESERVED_NEWLINE,
substr($string, $offset, strlen($matches[1])),
substr($string, $offset, strlen($matches[0])),
);
}

// Other Reserved Word
if (
preg_match(
'/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReserved,
$upper,
$matches,
0,
Expand All @@ -959,40 +965,24 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
) {
return new Token(
Token::TOKEN_TYPE_RESERVED,
substr($string, $offset, strlen($matches[1])),
substr($string, $offset, strlen($matches[0])),
);
}
}

// A function must be followed by '('
// this makes it so "count(" is considered a function, but "count" alone is not
// function
if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) {
// this makes it so "count(" is considered a function, but "count" alone is not a function
if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
return new Token(
Token::TOKEN_TYPE_RESERVED,
substr($string, $offset, strlen($matches[1]) - 1),
substr($string, $offset, strlen($matches[0])),
);
}

// Non reserved word
preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset);

return new Token(Token::TOKEN_TYPE_WORD, $matches[1]);
}
preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset);

/**
* Helper function for building regular expressions for reserved words and boundary characters
*
* @param string[] $strings The strings to be quoted
*
* @return string[] The quoted strings
*/
private function quoteRegex(array $strings): array
{
return array_map(
static fn (string $string): string => preg_quote($string, '/'),
$strings,
);
return new Token(Token::TOKEN_TYPE_WORD, $matches[0]);
}

private function getNextQuotedString(string $string, int $offset): string
Expand Down
Loading