Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve tokenizer regex matching #131

Merged
merged 4 commits into from
Jun 22, 2024
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 30 additions & 22 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -720,11 +720,13 @@ final class Tokenizer

// Regular expressions for tokenizing

private readonly string $regexBoundaries;
private readonly string $regexReserved;
private readonly string $regexReservedNewline;
private readonly string $regexReservedToplevel;
private readonly string $regexFunction;
private readonly string $nextTokenRegexNumber;
private readonly string $nextTokenRegexBoundaryCharacter;
private readonly string $nextTokenRegexReservedToplevel;
private readonly string $nextTokenRegexReservedNewline;
private readonly string $nextTokenRegexReserved;
private readonly string $nextTokenRegexFunction;
private readonly string $nextTokenRegexNonReserved;

/**
* Punctuation that can be used as a boundary between other tokens
Expand Down Expand Up @@ -770,24 +772,31 @@ public function __construct()
};

// Set up regular expressions
$this->regexBoundaries = '(' . implode(
$regexBoundaries = '(?>' . implode(
'|',
$this->quoteRegex($this->boundaries),
) . ')';
$this->regexReserved = '(' . implode(
$regexReserved = '(?>' . implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reserved)),
) . ')';
$this->regexReservedToplevel = str_replace(' ', '\s+', '(' . implode(
$regexReservedToplevel = '(?>' . str_replace(' ', '\s+', implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reservedToplevel)),
) . ')');
$this->regexReservedNewline = str_replace(' ', '\s+', '(' . implode(
)) . ')';
$regexReservedNewline = '(?>' . str_replace(' ', '\s+', implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reservedNewline)),
) . ')');
)) . ')';
$regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')';

$this->regexFunction = '(' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')';
$this->nextTokenRegexNumber = '/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $regexBoundaries . ')/';
$this->nextTokenRegexBoundaryCharacter = '/\G(' . $regexBoundaries . ')/';
$this->nextTokenRegexReservedToplevel = '/\G(' . $regexReservedToplevel . ')($|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReservedNewline = '/\G(' . $regexReservedNewline . ')($|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReserved = '/\G(' . $regexReserved . ')($|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexFunction = '/\G(' . $regexFunction . '[(]|\s|[)])/';
$this->nextTokenRegexNonReserved = '/\G(.*?)($|\s|["\'`]|' . $regexBoundaries . ')/';
}

/**
Expand Down Expand Up @@ -883,7 +892,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
$value = $firstChar . $this->getNextQuotedString($string, $offset + 1);
} else {
// Non-quoted variable name
preg_match('/\G(' . $firstChar . '[\w.$]+)/', $string, $matches, 0, $offset);
preg_match('/\G([@:][\w.$]+)/', $string, $matches, 0, $offset);
if ($matches) {
$value = $matches[1];
}
Expand All @@ -897,7 +906,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Number (decimal, binary, or hex)
if (
preg_match(
'/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexNumber,
greg0ire marked this conversation as resolved.
Show resolved Hide resolved
$string,
$matches,
0,
Expand All @@ -908,7 +917,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
}

// Boundary Character (punctuation and symbols)
if (preg_match('/\G(' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset)) {
if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]);
}

Expand All @@ -918,7 +927,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Top Level Reserved Word
if (
preg_match(
'/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReservedToplevel,
$upper,
$matches,
0,
Expand All @@ -934,7 +943,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Newline Reserved Word
if (
preg_match(
'/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReservedNewline,
$upper,
$matches,
0,
Expand All @@ -950,7 +959,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Other Reserved Word
if (
preg_match(
'/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReserved,
$upper,
$matches,
0,
Expand All @@ -965,17 +974,16 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
}

// A function must be succeeded by '('
// this makes it so "count(" is considered a function, but "count" alone is not
// function
if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) {
// this makes it so "count(" is considered a function, but "count" alone is not
if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
return new Token(
Token::TOKEN_TYPE_RESERVED,
substr($string, $offset, strlen($matches[1]) - 1),
);
}

// Non reserved word
preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset);
preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset);

return new Token(Token::TOKEN_TYPE_WORD, $matches[1]);
}
Expand Down