Skip to content

Commit 43ecce7

Browse files
committed
feat(engine-js): supports contiguous anchor simulation
1 parent b57415f commit 43ecce7

File tree

3 files changed

+79
-58
lines changed

3 files changed

+79
-58
lines changed

docs/references/engine-js-compat.md

+35-35
Original file line numberDiff line numberDiff line change
@@ -200,30 +200,30 @@ Languages that do not throw with the JavaScript RegExp engine, but will produc
200200

201201
| Language | Highlight Match | Patterns Parsable | Patterns Failed | Diff |
202202
| ------------- | :--------------------------------------------------------------------------------- | ----------------: | --------------: | ---: |
203-
| angular-html | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=angular-html) | 2 | - | 6 |
204-
| bash | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=bash) | 148 | - | 13 |
205-
| beancount | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=beancount) | 39 | - | 4 |
206-
| c | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=c) | 178 | - | 35 |
207-
| crystal | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=crystal) | 143 | - | 2 |
208-
| elixir | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=elixir) | 105 | - | 43 |
209-
| erlang | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=erlang) | 147 | - | 50 |
210-
| glsl | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=glsl) | 9 | - | 74 |
211-
| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 66 | - | 6 |
212-
| kusto | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=kusto) | 60 | - | 1 |
213-
| latex | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=latex) | 183 | - | 5 |
214-
| mermaid | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mermaid) | 129 | - | 2 |
215-
| nginx | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=nginx) | 104 | - | 3 |
216-
| objective-cpp | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=objective-cpp) | 309 | - | 22 |
217-
| php | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=php) | 342 | - | 37 |
218-
| po | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=po) | 23 | - | 11 |
219-
| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 91 | - | 6 |
220-
| rst | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=rst) | 64 | - | 4 |
203+
| angular-html | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=angular-html) | 2 | - | 330 |
204+
| bash | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=bash) | 148 | - | 56 |
205+
| beancount | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=beancount) | 39 | - | 171 |
206+
| c | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=c) | 178 | - | 209 |
207+
| crystal | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=crystal) | 143 | - | 40 |
208+
| elixir | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=elixir) | 105 | - | 179 |
209+
| erlang | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=erlang) | 147 | - | 470 |
210+
| glsl | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=glsl) | 9 | - | 306 |
211+
| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 66 | - | 48 |
212+
| kusto | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=kusto) | 60 | - | 40 |
213+
| latex | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=latex) | 183 | - | 25 |
214+
| mermaid | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mermaid) | 129 | - | 38 |
215+
| nginx | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=nginx) | 104 | - | 4 |
216+
| objective-cpp | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=objective-cpp) | 309 | - | 172 |
217+
| php | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=php) | 342 | - | 605 |
218+
| po | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=po) | 23 | - | 336 |
219+
| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 91 | - | 164 |
220+
| rst | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=rst) | 64 | - | 62 |
221221
| ruby | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=ruby) | 154 | - | 1 |
222-
| shellscript | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=shellscript) | 148 | - | 13 |
223-
| smalltalk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=smalltalk) | 35 | - | 8 |
224-
| splunk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=splunk) | 17 | - | 4 |
225-
| stata | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=stata) | 194 | - | 4 |
226-
| zsh | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=zsh) | 148 | - | 26 |
222+
| shellscript | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=shellscript) | 148 | - | 56 |
223+
| smalltalk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=smalltalk) | 35 | - | 40 |
224+
| splunk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=splunk) | 17 | - | 8 |
225+
| stata | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=stata) | 194 | - | 32 |
226+
| zsh | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=zsh) | 148 | - | 117 |
227227

228228
## Unsupported Languages
229229

@@ -234,17 +234,17 @@ Languages that throw with the JavaScript RegExp engine (contain syntaxes that
234234
| ada | ✅ OK | 201 | 1 | |
235235
| sass | ✅ OK | 67 | 2 | |
236236
| blade | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=blade) | 336 | 2 | |
237-
| mdc | ❌ Error | 37 | - | 22 |
237+
| mdc | ❌ Error | 37 | - | 377 |
238238
| powershell | ❌ Error | 87 | 1 | |
239-
| wolfram | ❌ Error | 500 | 1 | 2 |
240-
| razor | ❌ Error | 82 | 3 | 7 |
239+
| wolfram | ❌ Error | 500 | 1 | 12 |
240+
| razor | ❌ Error | 82 | 3 | 26 |
241241
| mdx | ❌ Error | 193 | 4 | |
242-
| swift | ❌ Error | 325 | 4 | 4 |
243-
| julia | ❌ Error | 90 | 5 | 5 |
244-
| kotlin | ❌ Error | 52 | 6 | 81 |
245-
| purescript | ❌ Error | 67 | 6 | 169 |
246-
| markdown | ❌ Error | 111 | 7 | 41 |
247-
| apex | ❌ Error | 173 | 14 | 44 |
248-
| haskell | ❌ Error | 136 | 21 | 3 |
249-
| cpp | ❌ Error | 238 | 22 | 5 |
250-
| csharp | ❌ Error | 278 | 33 | 34 |
242+
| swift | ❌ Error | 325 | 4 | 18 |
243+
| julia | ❌ Error | 90 | 5 | 49 |
244+
| kotlin | ❌ Error | 52 | 6 | 2986 |
245+
| purescript | ❌ Error | 67 | 6 | 1488 |
246+
| markdown | ❌ Error | 111 | 7 | 584 |
247+
| apex | ❌ Error | 173 | 14 | 242 |
248+
| haskell | ❌ Error | 136 | 21 | 12 |
249+
| cpp | ❌ Error | 238 | 22 | 25 |
250+
| csharp | ❌ Error | 278 | 33 | 232 |

packages/engine-javascript/src/index.ts

+23-10
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,18 @@ export function defaultJavaScriptRegexConstructor(pattern: string): RegExp {
4141

4242
export class JavaScriptScanner implements PatternScanner {
4343
regexps: (RegExp | null)[]
44+
contiguousAnchorSimulation: boolean[]
4445

4546
constructor(
4647
public patterns: string[],
4748
public cache: Map<string, RegExp | Error>,
4849
public forgiving: boolean,
4950
public regexConstructor: (pattern: string) => RegExp = defaultJavaScriptRegexConstructor,
5051
) {
51-
this.regexps = patterns.map((p) => {
52+
this.contiguousAnchorSimulation = Array.from({ length: patterns.length }, () => false)
53+
this.regexps = patterns.map((p, idx) => {
54+
if (p.startsWith('(^|\\G)') || p.startsWith('(\\G|^)'))
55+
this.contiguousAnchorSimulation[idx] = true
5256
const cached = cache?.get(p)
5357
if (cached) {
5458
if (cached instanceof RegExp) {
@@ -77,9 +81,9 @@ export class JavaScriptScanner implements PatternScanner {
7781
const str = typeof string === 'string'
7882
? string
7983
: string.content
80-
const pending: [index: number, match: RegExpExecArray][] = []
84+
const pending: [index: number, match: RegExpExecArray, offset: number][] = []
8185

82-
function toResult(index: number, match: RegExpExecArray) {
86+
function toResult(index: number, match: RegExpExecArray, offset = 0) {
8387
return {
8488
index,
8589
captureIndices: match.indices!.map((indice) => {
@@ -91,9 +95,9 @@ export class JavaScriptScanner implements PatternScanner {
9195
}
9296
}
9397
return {
94-
start: indice[0],
98+
start: indice[0] + offset,
9599
length: indice[1] - indice[0],
96-
end: indice[1],
100+
end: indice[1] + offset,
97101
}
98102
}),
99103
}
@@ -104,16 +108,25 @@ export class JavaScriptScanner implements PatternScanner {
104108
if (!regexp)
105109
continue
106110
try {
111+
let offset = 0
107112
regexp.lastIndex = startPosition
108-
const match = regexp.exec(str)
113+
let match = regexp.exec(str)
114+
115+
// If a regex starts with `(^|\\G)` or `(\\G|^)`, we simulate the behavior by cutting the string
116+
if (!match && this.contiguousAnchorSimulation[i]) {
117+
offset = startPosition
118+
regexp.lastIndex = 0
119+
match = regexp.exec(str.slice(startPosition))
120+
}
109121
if (!match)
110122
continue
123+
111124
// If the match is at the start position, return it immediately
112125
if (match.index === startPosition) {
113-
return toResult(i, match)
126+
return toResult(i, match, offset)
114127
}
115128
// Otherwise, store it for later
116-
pending.push([i, match])
129+
pending.push([i, match, offset])
117130
}
118131
catch (e) {
119132
if (this.forgiving)
@@ -125,9 +138,9 @@ export class JavaScriptScanner implements PatternScanner {
125138
// Find the closest match to the start position
126139
if (pending.length) {
127140
const minIndex = Math.min(...pending.map(m => m[1].index))
128-
for (const [i, match] of pending) {
141+
for (const [i, match, offset] of pending) {
129142
if (match.index === minIndex) {
130-
return toResult(i, match)
143+
return toResult(i, match, offset)
131144
}
132145
}
133146
}

scripts/report-engine-js-compat.ts

+21-13
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ async function run() {
6666
}
6767

6868
const highlightA = serializeTokens(shikiWasm, sample, lang)
69-
let highlightB: string | undefined
69+
let highlightB: { tokens: string, html: string } | undefined
7070
let highlightDiff: Diff[] = []
7171

7272
try {
@@ -103,13 +103,13 @@ async function run() {
103103
}
104104

105105
if (highlightMatch !== 'error')
106-
highlightMatch = highlightA === highlightB
106+
highlightMatch = highlightA.html === highlightB?.html
107107
highlightDiff = highlightB && highlightA !== highlightB
108-
? diffMain(highlightA, highlightB)
108+
? diffMain(highlightA.tokens, highlightB.tokens)
109109
: []
110110
diffCleanupSemantic(highlightDiff)
111111

112-
if (!highlightMatch) {
112+
if (highlightB && highlightMatch !== true) {
113113
console.log(c.yellow(`[${lang}] Mismatch`))
114114

115115
await fs.mkdir(new URL('./compares', import.meta.url), { recursive: true })
@@ -122,10 +122,10 @@ async function run() {
122122
'pre { flex: 1; margin: 0; padding: 0; }',
123123
'</style>',
124124
'<pre>',
125-
highlightA,
125+
highlightA.html,
126126
'</pre>',
127127
'<pre>',
128-
highlightB,
128+
highlightB?.html,
129129
'</pre>',
130130
].join('\n'),
131131
'utf-8',
@@ -143,8 +143,8 @@ async function run() {
143143
...highlightMatch === true
144144
? {}
145145
: {
146-
highlightA,
147-
highlightB,
146+
highlightA: highlightA.html,
147+
highlightB: highlightB?.html,
148148
},
149149
diff: highlightDiff,
150150
})
@@ -177,13 +177,13 @@ async function run() {
177177
['---', ':---', '---:', '---:', '---:'],
178178
...report
179179
.map((item) => {
180-
const diffCount = item.diff.filter(diff => diff[0] === 1).length
180+
const diffChars = item.diff.map(diff => diff[0] === 1 ? diff[1].length : 0).reduce((a, b) => a + b, 0)
181181
return [
182182
item.lang,
183183
item.highlightMatch === true ? '✅ OK' : item.highlightMatch === 'error' ? '❌ Error' : `[🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=${item.lang})`,
184184
item.patternsParsable === 0 ? '-' : item.patternsParsable.toString(),
185185
item.patternsFailed.length === 0 ? '-' : item.patternsFailed.length.toString(),
186-
diffCount ? diffCount.toString() : '',
186+
diffChars ? diffChars.toString() : '',
187187
] as [string, string, string, string, string]
188188
}),
189189
]
@@ -250,9 +250,17 @@ async function run() {
250250
}
251251

252252
/**
 * Highlights `sample` with the given highlighter and returns two serialized
 * views of the result, both rendered with the `vitesse-dark` theme.
 *
 * @param shiki - Highlighter used to produce both the tokens and the HTML.
 * @param sample - Source text to highlight.
 * @param lang - Language id passed through to the highlighter.
 * @returns `tokens`: one line per token, made of the token color padded to
 *   18 columns followed by the token content; `html`: the output of
 *   `codeToHtml` for the same sample.
 */
function serializeTokens(
  shiki: HighlighterGeneric<BundledLanguage, BundledTheme>,
  sample: string,
  lang: string,
): { tokens: string, html: string } {
  const options = { lang: lang as any, theme: 'vitesse-dark' } as const

  const tokens = shiki
    .codeToTokensBase(sample, options)
    .flat(1)
    // A token may carry no color; fall back to an empty string so the column
    // stays 18 characters wide instead of emitting the literal text "undefined".
    .map(t => (t.color ?? '').padEnd(18, ' ') + t.content)
    .join('\n')

  const html = shiki.codeToHtml(sample, options)

  return {
    tokens,
    html,
  }
}
257265

258266
function getPatternsOfGrammar(grammar: any): Set<string> {

0 commit comments

Comments
 (0)