-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpegex.lua
382 lines (347 loc) · 12.4 KB
/
pegex.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
-- Copyright (C) 2014 Chris Emerson <[email protected]>
-- See LICENSE for details (MIT license).
-- Support for regular expressions (parsed and implemented with LPeg).
local M = {}
local lpeg = require('lpeg')
local P = lpeg.P
local R = lpeg.R
local S = lpeg.S
local C = lpeg.C
local V = lpeg.V
local B = lpeg.B
local Carg = lpeg.Carg
local Cb = lpeg.Cb
local Cc = lpeg.Cc
local Cf = lpeg.Cf
local Cp = lpeg.Cp
local Cg = lpeg.Cg
local Ct = lpeg.Ct
local Cmt = lpeg.Cmt
-- We use the algorithm to convert from a regular expression to a Peg
-- expression from:
-- http://www.inf.puc-rio.br/~roberto/docs/ry10-01.pdf
local function sub1(x)
return x - 1
end
-- Parts of a regular expression, returning an LPEG pattern which matches it.
local _start = Cg(Cp(), "_start")
local _end = Cg(Cp()/sub1, "_end")
local mt = {
__index = {
match = function(t, s, index)
local result = t._pat:match(s, index)
if result == nil then return result end
-- Post-process to put the matches into a nicer form
local groups = nil
for k,v in pairs(result) do
if k:sub(1,1) == "s" then
local grpname= k:sub(2)
local endpos = result["e"..grpname]
if v and endpos then
if grpname:match("(%d+)") then
grpname = tonumber(grpname)
end
groups = groups or {}
groups[grpname] = {v,endpos}
result[k] = nil
result["e"..grpname] = nil
end
end
end
result.groups = groups
return result
end,
},
}
-- Make special character sets
local function make_b_s()
return { { " \t\v\n\r", [0]="set" }, [0]="charset" }
end
local function make_b_S()
return { { " \t\v\n\r", [0]="set" }, [0]="charset",
negate=true}
end
local function make_b_w()
return { [0]="charset",
{ [0]="range", "a", "z" },
{ [0]="range", "A", "Z" },
{ [0]="range", "0", "9" },
{ [0]="char", "_" },
}
end
local function make_b_W()
return { [0]="charset",
{ [0]="range", "a", "z" },
{ [0]="range", "A", "Z" },
{ [0]="range", "0", "9" },
{ [0]="char", "_" },
negate=true
}
end
local function make_b_d()
return { [0]="charset",
{ [0]="range", "0", "9" },
}
end
local function make_b_D()
return { [0]="charset",
{ [0]="range", "0", "9" },
negate=true,
}
end
local function make_charset(c)
return function() return { [0]="charset", { [0]="char", c } } end
end
local function make_char(c)
return function() return { [0]="char", c } end
end
local special = S"()\\?*+|.^$"
local any = P"." * Cc({[0] = "."})
-- Perl-style character classes
local b_s = P"\\s" / make_b_s
local b_S = P"\\S" / make_b_S
local b_w = P"\\w" / make_b_w
local b_W = P"\\W" / make_b_W
local b_d = P"\\d" / make_b_d
local b_D = P"\\D" / make_b_D
local b_t = P"\\t" / make_charset('\t')
local b_n = P"\\n" / make_charset('\n')
local b_r = P"\\r" / make_charset('\r')
local b_f = P"\\f" / make_charset('\f')
local b_e = P"\\e" / make_charset('\x1b')
local b_a = P"\\a" / make_charset('\x07')
local backcharset = b_s + b_S + b_w + b_W + b_d + b_D +
b_t + b_n + b_r + b_f + b_e + b_a
local charset_special = S"]-"
local charset_escapes = (b_t + b_n + b_r + b_f + b_e + b_a) /
function(c) return c[1] end
local charset_char = C(P(1) - charset_special) /
function(c) return { [0] = "char", c } end
local range = (C(P(1) - charset_special) * P"-" * C(P(1) - charset_special)) /
function(a,b) return { [0]="range", a, b } end
local charset = (P"[" *
Ct((Cg(P"^"*Cc(true), "negate") + P(0))
* (range + charset_escapes + charset_char)^0) *
P"]") /
function(x) x[0] = "charset" return x end
local char = C(P(1) - special) / function(c) return { [0] = "char", c } end
local escapechar = (P"\\" * C(special)) / function(c) return { [0] = "char", c } end
local backref = (P"\\" * C(R"19")) / function(c) return { tonumber(c), [0] = "backref" } end
local wordchar = R("AZ", "az", "09") + S("_")
local nonwordchar = 1 - wordchar
-- word boundaries
local word_start = P"\\<" * Cc({[0] = "\\<"})
local word_end = P"\\>" * Cc({[0] = "\\>"})
-- {n} etc. Returns two captures - (min, max); max can be nil (no max)
local count_exact = (P"{" * C(R"09" ^ 1) * P"}") / function(c) return tonumber(c), tonumber(c) end
local count_minmax = (P"{" * C(R"09" ^ 1) * P"," * C(R"09" ^ 1) * P"}") / function(min,max) return tonumber(min), tonumber(max) end
local count_min = (P"{" * C(R"09" ^ 1) * P",}") / function(c) return tonumber(c), nil end
local brace_count = count_exact + count_minmax + count_min
-- Grouping
local newgrp = (Cb("groups") * Cp()) /
function(groups, pos)
local grp = #groups+1
groups[grp] = {pos}
groups.open[#groups.open+1] = grp
end
-- endgrp leaves the group number or name as a capture
local endgrp = (Cb("groups") * Cp()) /
function(groups, pos)
local grp = groups.open[#groups.open]
groups.open[#groups.open] = nil
groups[grp][2] = pos
return grp
end
local bra = P"(" * newgrp
local ket = P")" * endgrp
local anonbra = P"(?:"
local anonket = P")"
local pattern = P{
"pattern",
-- A complete pattern, starting from an empty pattern.
pattern = Cg(Carg(1),"groups") * Ct((P"^"*Cg(Cc(1),"anchorstart") + P(0)) * V"subpat" * (P"$"*(-P(1))*Cg(Cc(1),"anchorend") + (-P(1)))) /
function(t) t[0] = "pattern" ; return t end,
-- A set of alternate branches
subpat = (V"branch" * (P"|" * V"branch") ^ 0) /
function(...) return { [0] = "alt", ... } end,
branch = V"concat",
-- A set of concatenated pieces
-- Pass a dummy capture to avoid the special case of no captures confusing
-- the function.
concat = Cc(nil) * (V"piece" ^ 0) /
function(_, ...) return { [0] = "concat", ... } end,
piece = V"atom_multi",
atom_multi = V"atom_plus" + V"atom_star" + V"atom_query" + V"atom_count" + V"atom",
atom_plus = (V"atom" * P"+") /
function(atom) return { [0] = "+", atom } end,
atom_star = (V"atom" * P"*") /
function(atom) return { [0] = "*", atom } end,
atom_query = (V"atom" * P"?") /
function(atom) return { [0] = "?", atom } end,
atom_count = (V"atom" * brace_count) /
function(atom, min, max) return { [0] = "{}", min=min, max=max, atom } end,
anongroup = (anonbra * V"subpat" * anonket),
group = (bra * V"subpat" * ket) /
function(subpat, grpname) return { [0] = "group", subpat, grpname } end,
atom = any + word_start + word_end + escapechar + charset + V"anongroup" + V"group" + char + backref + backcharset,
}
local function foldr(f, t, init)
local res = init
local start = #t
if res == nil then
res = t[start]
start = start - 1
end
for i=start,1,-1 do
res = f(t[i], res)
end
return res
end
local function map(f, t)
local result = {}
for i=1,#t do
result[i] = f(t[i])
end
return result
end
local function add(a,b)
return a+b
end
-- Convert a charset fragment to a PEG
local function charset_to_peg(charfrag)
local t = charfrag[0]
if t == "char" then
assert(#charfrag == 1)
return P(charfrag[1])
elseif t == "range" then
assert(#charfrag == 2)
return R(charfrag[1] .. charfrag[2])
elseif t == "set" then
return S(charfrag[1])
else
error("Got charset bit: "..tostring(t).."/"..tostring(t and t[0]))
end
end
local function re_to_peg(retab, k, patternProps)
local t = retab[0]
if t == "pattern" then
assert(#retab == 1)
local pat = re_to_peg(retab[1], k, patternProps)
-- If the pattern is anchored at the end, make it fail to match
-- if there's another byte. This must be done *before* wrapping
-- with the start/end markers, as once they've matched it's too late
-- to match the next item.
if retab.anchorend then
-- Disallow matching anything afterwards.
pat = pat * (-P(1))
end
-- Add match start/end markers
pat = _start * pat * _end
if not retab.anchorstart then
-- Match the pattern, or a character and try again.
pat = P{pat + 1*V(1)}
end
return pat
elseif t == "group" then
assert(#retab == 2)
-- print(debug.traceback())
patternProps.numGroups = patternProps.numGroups + 1
local grpname = tostring(retab[2])
local newk = Cg(Cp()/sub1, "e"..grpname) * k
local pat = re_to_peg(retab[1], newk, patternProps)
pat = Cg(Cp(), "s"..grpname) * pat
return pat
elseif t == "alt" then
if #retab == 1 then
return re_to_peg(retab[1], k, patternProps)
else
local parts = map(function(x)
return re_to_peg(x, k, patternProps)
end, retab)
return foldr(add, parts)
end
elseif t == "concat" then
return foldr(function(retab_f, k_f) return re_to_peg(retab_f, k_f, patternProps) end, retab, k)
elseif t == "char" then
assert(#retab == 1)
return P(retab[1]) * k
elseif t == "charset" then
local charset_pat = foldr(add, map(charset_to_peg, retab))
if retab.negate then
charset_pat = 1 - charset_pat
end
return charset_pat * k
elseif t == "*" then
return P{"A", A=re_to_peg(retab[1], V"A", patternProps) + k}
elseif t == "+" then
return re_to_peg(retab[1], P{"A", A=re_to_peg(retab[1], V"A", patternProps) + k}, patternProps)
elseif t == "." then
assert(#retab == 0)
return (P(1) - P"\n") * k
elseif t == "?" then
assert(#retab == 1)
return re_to_peg(retab[1], k, patternProps) + k
elseif t == "{}" then
assert(#retab == 1)
-- Rewrite this in terms of ? and *.
-- X{3,} => XXXX*
-- X{3,5} => XXXX?X?
local subpat = retab[1]
local min = retab.min
local max = retab.max
local rewritten = { [0] = "concat" }
for i=1,min do
rewritten[#rewritten+1] = subpat
end
if max == nil then
rewritten[#rewritten+1] = { [0] = "*", subpat }
else
local optional = { [0] = "?", subpat }
for i=min+1,max do
rewritten[#rewritten+1] = optional
end
end
return re_to_peg(rewritten, k, patternProps)
elseif t == "\\<" then
assert(#retab == 0)
return -B(wordchar) * #wordchar * k
elseif t == "\\>" then
assert(#retab == 0)
return B(wordchar) * (-#wordchar) * k
elseif t == "backref" then
local grpname = retab[1]
return Cmt(P(0) * Cb("s"..grpname) * Cb("e"..grpname),
function(subject, pos, s, e)
local backval = subject:sub(s, e)
local here = subject:sub(pos, pos+e-s)
if backval == here then
return pos+e-s+1
else
return false
end
end)
else
error("Not implemented op: " ..tostring(t) .. "/" .. tostring(retab))
end
end
function M.parse(re)
return pattern:match(re, 1, {open={}})
end
function M.compile(re)
-- Since the RE->Peg construction starts backwards (using the
-- continuation), it's more convenient to parse the regular expression
-- backwards.
local retab = M.parse(re)
if retab == nil then
error("Failed to parse regular expression: {"..re.."}", 2)
end
local patternProps = { numGroups = 0 }
local _pat = re_to_peg(retab, P(0), patternProps)
return setmetatable({
_pat = Ct(_pat),
numgroups = patternProps.numGroups
}, mt)
end
-- Increase match complexity
lpeg.setmaxstack(1000)
return M