1 | package syntax
|
---|
2 |
|
---|
3 | import (
|
---|
4 | "bytes"
|
---|
5 | "fmt"
|
---|
6 | "math"
|
---|
7 | )
|
---|
8 |
|
---|
9 | // similar to prog.go in the go regex package...also with comment 'may not belong in this package'
|
---|
10 |
|
---|
11 | // File provides operator constants for use by the Builder and the Machine.
|
---|
12 |
|
---|
13 | // Implementation notes:
|
---|
14 | //
|
---|
15 | // Regexps are built into RegexCodes, which contain an operation array,
|
---|
16 | // a string table, and some constants.
|
---|
17 | //
|
---|
18 | // Each operation is one of the codes below, followed by the integer
|
---|
19 | // operands specified for each op.
|
---|
20 | //
|
---|
21 | // Strings and sets are indices into a string table.
|
---|
22 |
|
---|
23 | type InstOp int
|
---|
24 |
|
---|
25 | const (
|
---|
26 | // lef/back operands description
|
---|
27 |
|
---|
28 | Onerep InstOp = 0 // lef,back char,min,max a {n}
|
---|
29 | Notonerep = 1 // lef,back char,min,max .{n}
|
---|
30 | Setrep = 2 // lef,back set,min,max [\d]{n}
|
---|
31 |
|
---|
32 | Oneloop = 3 // lef,back char,min,max a {,n}
|
---|
33 | Notoneloop = 4 // lef,back char,min,max .{,n}
|
---|
34 | Setloop = 5 // lef,back set,min,max [\d]{,n}
|
---|
35 |
|
---|
36 | Onelazy = 6 // lef,back char,min,max a {,n}?
|
---|
37 | Notonelazy = 7 // lef,back char,min,max .{,n}?
|
---|
38 | Setlazy = 8 // lef,back set,min,max [\d]{,n}?
|
---|
39 |
|
---|
40 | One = 9 // lef char a
|
---|
41 | Notone = 10 // lef char [^a]
|
---|
42 | Set = 11 // lef set [a-z\s] \w \s \d
|
---|
43 |
|
---|
44 | Multi = 12 // lef string abcd
|
---|
45 | Ref = 13 // lef group \#
|
---|
46 |
|
---|
47 | Bol = 14 // ^
|
---|
48 | Eol = 15 // $
|
---|
49 | Boundary = 16 // \b
|
---|
50 | Nonboundary = 17 // \B
|
---|
51 | Beginning = 18 // \A
|
---|
52 | Start = 19 // \G
|
---|
53 | EndZ = 20 // \Z
|
---|
54 | End = 21 // \Z
|
---|
55 |
|
---|
56 | Nothing = 22 // Reject!
|
---|
57 |
|
---|
58 | // Primitive control structures
|
---|
59 |
|
---|
60 | Lazybranch = 23 // back jump straight first
|
---|
61 | Branchmark = 24 // back jump branch first for loop
|
---|
62 | Lazybranchmark = 25 // back jump straight first for loop
|
---|
63 | Nullcount = 26 // back val set counter, null mark
|
---|
64 | Setcount = 27 // back val set counter, make mark
|
---|
65 | Branchcount = 28 // back jump,limit branch++ if zero<=c<limit
|
---|
66 | Lazybranchcount = 29 // back jump,limit same, but straight first
|
---|
67 | Nullmark = 30 // back save position
|
---|
68 | Setmark = 31 // back save position
|
---|
69 | Capturemark = 32 // back group define group
|
---|
70 | Getmark = 33 // back recall position
|
---|
71 | Setjump = 34 // back save backtrack state
|
---|
72 | Backjump = 35 // zap back to saved state
|
---|
73 | Forejump = 36 // zap backtracking state
|
---|
74 | Testref = 37 // backtrack if ref undefined
|
---|
75 | Goto = 38 // jump just go
|
---|
76 |
|
---|
77 | Prune = 39 // prune it baby
|
---|
78 | Stop = 40 // done!
|
---|
79 |
|
---|
80 | ECMABoundary = 41 // \b
|
---|
81 | NonECMABoundary = 42 // \B
|
---|
82 |
|
---|
83 | // Modifiers for alternate modes
|
---|
84 |
|
---|
85 | Mask = 63 // Mask to get unmodified ordinary operator
|
---|
86 | Rtl = 64 // bit to indicate that we're reverse scanning.
|
---|
87 | Back = 128 // bit to indicate that we're backtracking.
|
---|
88 | Back2 = 256 // bit to indicate that we're backtracking on a second branch.
|
---|
89 | Ci = 512 // bit to indicate that we're case-insensitive.
|
---|
90 | )
|
---|
91 |
|
---|
92 | type Code struct {
|
---|
93 | Codes []int // the code
|
---|
94 | Strings [][]rune // string table
|
---|
95 | Sets []*CharSet //character set table
|
---|
96 | TrackCount int // how many instructions use backtracking
|
---|
97 | Caps map[int]int // mapping of user group numbers -> impl group slots
|
---|
98 | Capsize int // number of impl group slots
|
---|
99 | FcPrefix *Prefix // the set of candidate first characters (may be null)
|
---|
100 | BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null)
|
---|
101 | Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc)
|
---|
102 | RightToLeft bool // true if right to left
|
---|
103 | }
|
---|
104 |
|
---|
105 | func opcodeBacktracks(op InstOp) bool {
|
---|
106 | op &= Mask
|
---|
107 |
|
---|
108 | switch op {
|
---|
109 | case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
|
---|
110 | Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
|
---|
111 | Forejump, Goto:
|
---|
112 | return true
|
---|
113 |
|
---|
114 | default:
|
---|
115 | return false
|
---|
116 | }
|
---|
117 | }
|
---|
118 |
|
---|
119 | func opcodeSize(op InstOp) int {
|
---|
120 | op &= Mask
|
---|
121 |
|
---|
122 | switch op {
|
---|
123 | case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
|
---|
124 | End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
|
---|
125 | return 1
|
---|
126 |
|
---|
127 | case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
|
---|
128 | Prune, Set:
|
---|
129 | return 2
|
---|
130 |
|
---|
131 | case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
|
---|
132 | Setlazy, Setrep, Setloop:
|
---|
133 | return 3
|
---|
134 |
|
---|
135 | default:
|
---|
136 | panic(fmt.Errorf("Unexpected op code: %v", op))
|
---|
137 | }
|
---|
138 | }
|
---|
139 |
|
---|
140 | var codeStr = []string{
|
---|
141 | "Onerep", "Notonerep", "Setrep",
|
---|
142 | "Oneloop", "Notoneloop", "Setloop",
|
---|
143 | "Onelazy", "Notonelazy", "Setlazy",
|
---|
144 | "One", "Notone", "Set",
|
---|
145 | "Multi", "Ref",
|
---|
146 | "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
|
---|
147 | "Nothing",
|
---|
148 | "Lazybranch", "Branchmark", "Lazybranchmark",
|
---|
149 | "Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
|
---|
150 | "Nullmark", "Setmark", "Capturemark", "Getmark",
|
---|
151 | "Setjump", "Backjump", "Forejump", "Testref", "Goto",
|
---|
152 | "Prune", "Stop",
|
---|
153 | "ECMABoundary", "NonECMABoundary",
|
---|
154 | }
|
---|
155 |
|
---|
156 | func operatorDescription(op InstOp) string {
|
---|
157 | desc := codeStr[op&Mask]
|
---|
158 | if (op & Ci) != 0 {
|
---|
159 | desc += "-Ci"
|
---|
160 | }
|
---|
161 | if (op & Rtl) != 0 {
|
---|
162 | desc += "-Rtl"
|
---|
163 | }
|
---|
164 | if (op & Back) != 0 {
|
---|
165 | desc += "-Back"
|
---|
166 | }
|
---|
167 | if (op & Back2) != 0 {
|
---|
168 | desc += "-Back2"
|
---|
169 | }
|
---|
170 |
|
---|
171 | return desc
|
---|
172 | }
|
---|
173 |
|
---|
174 | // OpcodeDescription is a humman readable string of the specific offset
|
---|
175 | func (c *Code) OpcodeDescription(offset int) string {
|
---|
176 | buf := &bytes.Buffer{}
|
---|
177 |
|
---|
178 | op := InstOp(c.Codes[offset])
|
---|
179 | fmt.Fprintf(buf, "%06d ", offset)
|
---|
180 |
|
---|
181 | if opcodeBacktracks(op & Mask) {
|
---|
182 | buf.WriteString("*")
|
---|
183 | } else {
|
---|
184 | buf.WriteString(" ")
|
---|
185 | }
|
---|
186 | buf.WriteString(operatorDescription(op))
|
---|
187 | buf.WriteString("(")
|
---|
188 | op &= Mask
|
---|
189 |
|
---|
190 | switch op {
|
---|
191 | case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
|
---|
192 | buf.WriteString("Ch = ")
|
---|
193 | buf.WriteString(CharDescription(rune(c.Codes[offset+1])))
|
---|
194 |
|
---|
195 | case Set, Setrep, Setloop, Setlazy:
|
---|
196 | buf.WriteString("Set = ")
|
---|
197 | buf.WriteString(c.Sets[c.Codes[offset+1]].String())
|
---|
198 |
|
---|
199 | case Multi:
|
---|
200 | fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))
|
---|
201 |
|
---|
202 | case Ref, Testref:
|
---|
203 | fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
|
---|
204 |
|
---|
205 | case Capturemark:
|
---|
206 | fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
|
---|
207 | if c.Codes[offset+2] != -1 {
|
---|
208 | fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
|
---|
209 | }
|
---|
210 |
|
---|
211 | case Nullcount, Setcount:
|
---|
212 | fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])
|
---|
213 |
|
---|
214 | case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
|
---|
215 | fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
|
---|
216 | }
|
---|
217 |
|
---|
218 | switch op {
|
---|
219 | case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
|
---|
220 | buf.WriteString(", Rep = ")
|
---|
221 | if c.Codes[offset+2] == math.MaxInt32 {
|
---|
222 | buf.WriteString("inf")
|
---|
223 | } else {
|
---|
224 | fmt.Fprintf(buf, "%d", c.Codes[offset+2])
|
---|
225 | }
|
---|
226 |
|
---|
227 | case Branchcount, Lazybranchcount:
|
---|
228 | buf.WriteString(", Limit = ")
|
---|
229 | if c.Codes[offset+2] == math.MaxInt32 {
|
---|
230 | buf.WriteString("inf")
|
---|
231 | } else {
|
---|
232 | fmt.Fprintf(buf, "%d", c.Codes[offset+2])
|
---|
233 | }
|
---|
234 |
|
---|
235 | }
|
---|
236 |
|
---|
237 | buf.WriteString(")")
|
---|
238 |
|
---|
239 | return buf.String()
|
---|
240 | }
|
---|
241 |
|
---|
242 | func (c *Code) Dump() string {
|
---|
243 | buf := &bytes.Buffer{}
|
---|
244 |
|
---|
245 | if c.RightToLeft {
|
---|
246 | fmt.Fprintln(buf, "Direction: right-to-left")
|
---|
247 | } else {
|
---|
248 | fmt.Fprintln(buf, "Direction: left-to-right")
|
---|
249 | }
|
---|
250 | if c.FcPrefix == nil {
|
---|
251 | fmt.Fprintln(buf, "Firstchars: n/a")
|
---|
252 | } else {
|
---|
253 | fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
|
---|
254 | }
|
---|
255 |
|
---|
256 | if c.BmPrefix == nil {
|
---|
257 | fmt.Fprintln(buf, "Prefix: n/a")
|
---|
258 | } else {
|
---|
259 | fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String()))
|
---|
260 | }
|
---|
261 |
|
---|
262 | fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors)
|
---|
263 | fmt.Fprintln(buf)
|
---|
264 |
|
---|
265 | if c.BmPrefix != nil {
|
---|
266 | fmt.Fprintln(buf, "BoyerMoore:")
|
---|
267 | fmt.Fprintln(buf, c.BmPrefix.Dump(" "))
|
---|
268 | }
|
---|
269 | for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
|
---|
270 | fmt.Fprintln(buf, c.OpcodeDescription(i))
|
---|
271 | }
|
---|
272 |
|
---|
273 | return buf.String()
|
---|
274 | }
|
---|