-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfactory.c
177 lines (156 loc) · 5.44 KB
/
factory.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/* factory.c
*
* Create a regex core object from the tokens of tokens from the
* parser.
*/
#include <assert.h>
#include <stdlib.h>
#include <stdbool.h>
#include "factory.h"
#include "clist.h"
static core_t* _build_core(tlist_t* tokens, int index, clist_t* subs) {
assert(tokens);
assert(!tlist_empty(tokens));
core_t* core = core_new(index);
branch_t* branch = core_add_branch(core, NULL);
atom_t* curr = NULL;
do {
token_t* token = tlist_token(tokens);
switch (token->flag) {
/* Empty regular expression matches any string. Do nothing.
*/
case EMPTY:
break;
/* Alternator character. This creates a new branch within
* the core object.
*/
case ALTERNATOR:
branch = core_add_branch(core, branch);
break;
/* String case. Give the string to the new atom.
*/
case STRING:
curr = branch_add_atom(branch);
atom_set_string(curr, token->data.string);
break;
/* Character tree case. Same as literal case except that
* the tree is already made by the lexer.
*/
case CLASS:
curr = branch_add_atom(branch);
atom_set_class(curr, token->data.class);
atom_set_invert(curr, false);
break;
case NCLASS:
curr = branch_add_atom(branch);
atom_set_class(curr, token->data.class);
atom_set_invert(curr, true);
break;
/* Repetition cases. Sets a range in the current atom.
*/
case RANGE:
assert(curr);
atom_set_range(curr, token->data.range.begin,
token->data.range.end);
break;
/* Group case. Create a new core object and put it inside
* the new atom.
*/
case GROUP:
curr = branch_add_atom(branch);
atom_set_core(curr,
_build_core(token->data.group, token->ngr, subs), 0);
break;
case ATOMIC:
curr = branch_add_atom(branch);
atom_set_core(curr,
_build_core(token->data.group, token->ngr, subs), 1);
break;
/* Backreference case. Set the backreference group number
* inside the new atom.
*/
case REFERENCE:
curr = branch_add_atom(branch);
atom_set_reference(curr, token->ngr);
break;
/* Lookahead case. Set the group in the new atom and tree
* invert to be true in the negated case.
*/
case LOOKAHEAD:
curr = branch_add_atom(branch);
atom_set_core(curr, _build_core(token->data.group,
token->ngr, subs), 2);
atom_set_invert(curr, false);
break;
case NLOOKAHEAD:
curr = branch_add_atom(branch);
atom_set_core(curr, _build_core(token->data.group,
token->ngr, subs), 2);
atom_set_invert(curr, true);
break;
/* Subroutine call. We need to save the pointer to the new
* atom, along with the index of the core that atom will
* need to point, so that we can connect the atoms to their
* cores after the cores are built.
*/
case SUBROUTINE:
curr = branch_add_atom(branch);
subs = clist_new(subs, curr, token->ngr);
break;
/* Match the empty string the at one of the following
* locations: between a word character and a non-word
* character, between a word character and a null terminating
* character, and at the beginning of the string if the first
* character is a word character.
*/
case WORDANCH:
curr = branch_add_atom(branch);
atom_set_anchor(curr, 1);
break;
case NWORDANCH:
curr = branch_add_atom(branch);
atom_set_anchor(curr, 1);
atom_set_invert(curr, true);
break;
/* Match the empty string at the beginning of a string.
*/
case STANCH:
curr = branch_add_atom(branch);
atom_set_anchor(curr, 2);
atom_set_invert(curr, true);
break;
/* Match the empty string at the end of a string.
*/
case EDGEANCH:
curr = branch_add_atom(branch);
atom_set_anchor(curr, 2);
break;
/* Make the current atom lazy instead of greedy.
*/
case LAZY:
assert(curr);
atom_set_greedy(curr, false);
break;
/* Should have been converted already.
*/
case POSSESSIVE: case LITERAL: case NAME:
assert(false);
}
tlist_pop_front(tokens);
} while (!tlist_empty(tokens));
tlist_free(tokens);
return core;
}
core_t* build_core(tlist_t* tokens) {
clist_t* subs = clist_new(NULL, NULL, 0);
clist_t* cl = subs;
core_t* core = _build_core(tokens, 0, subs);
for (atom_t* curr = clist_pointer(cl); curr; ) {
atom_set_core(curr, core_find_core(core, clist_index(cl)), 4);
cl = clist_next(cl);
curr = !cl ? NULL : clist_pointer(cl);
}
clist_free(subs);
return core;
}
/********************************************************************/