1 | /* |
2 | * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #pragma once |
27 | |
28 | #include "ConcurrentJSLock.h" |
29 | #include "YarrFlags.h" |
30 | #include "YarrPattern.h" |
31 | |
32 | namespace WTF { |
33 | class BumpPointerAllocator; |
34 | } |
35 | using WTF::BumpPointerAllocator; |
36 | |
37 | namespace JSC { namespace Yarr { |
38 | |
39 | class ByteDisjunction; |
40 | |
41 | struct ByteTerm { |
42 | union { |
43 | struct { |
44 | union { |
45 | UChar32 patternCharacter; |
46 | struct { |
47 | UChar32 lo; |
48 | UChar32 hi; |
49 | } casedCharacter; |
50 | CharacterClass* characterClass; |
51 | unsigned subpatternId; |
52 | }; |
53 | union { |
54 | ByteDisjunction* parenthesesDisjunction; |
55 | unsigned parenthesesWidth; |
56 | }; |
57 | QuantifierType quantityType; |
58 | unsigned quantityMinCount; |
59 | unsigned quantityMaxCount; |
60 | } atom; |
61 | struct { |
62 | int next; |
63 | int end; |
64 | bool onceThrough; |
65 | } alternative; |
66 | struct { |
67 | bool m_bol : 1; |
68 | bool m_eol : 1; |
69 | } anchors; |
70 | unsigned checkInputCount; |
71 | }; |
72 | unsigned frameLocation; |
73 | enum Type : uint8_t { |
74 | TypeBodyAlternativeBegin, |
75 | TypeBodyAlternativeDisjunction, |
76 | TypeBodyAlternativeEnd, |
77 | TypeAlternativeBegin, |
78 | TypeAlternativeDisjunction, |
79 | TypeAlternativeEnd, |
80 | TypeSubpatternBegin, |
81 | TypeSubpatternEnd, |
82 | TypeAssertionBOL, |
83 | TypeAssertionEOL, |
84 | TypeAssertionWordBoundary, |
85 | TypePatternCharacterOnce, |
86 | TypePatternCharacterFixed, |
87 | TypePatternCharacterGreedy, |
88 | TypePatternCharacterNonGreedy, |
89 | TypePatternCasedCharacterOnce, |
90 | TypePatternCasedCharacterFixed, |
91 | TypePatternCasedCharacterGreedy, |
92 | TypePatternCasedCharacterNonGreedy, |
93 | TypeCharacterClass, |
94 | TypeBackReference, |
95 | TypeParenthesesSubpattern, |
96 | TypeParenthesesSubpatternOnceBegin, |
97 | TypeParenthesesSubpatternOnceEnd, |
98 | TypeParenthesesSubpatternTerminalBegin, |
99 | TypeParenthesesSubpatternTerminalEnd, |
100 | TypeParentheticalAssertionBegin, |
101 | TypeParentheticalAssertionEnd, |
102 | TypeCheckInput, |
103 | TypeUncheckInput, |
104 | TypeDotStarEnclosure, |
105 | } type; |
106 | bool m_capture : 1; |
107 | bool m_invert : 1; |
108 | unsigned inputPosition; |
109 | |
110 | ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
111 | : frameLocation(frameLocation) |
112 | , m_capture(false) |
113 | , m_invert(false) |
114 | { |
115 | atom.patternCharacter = ch; |
116 | atom.quantityType = quantityType; |
117 | atom.quantityMinCount = quantityCount.unsafeGet(); |
118 | atom.quantityMaxCount = quantityCount.unsafeGet(); |
119 | inputPosition = inputPos; |
120 | |
121 | switch (quantityType) { |
122 | case QuantifierFixedCount: |
123 | type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; |
124 | break; |
125 | case QuantifierGreedy: |
126 | type = ByteTerm::TypePatternCharacterGreedy; |
127 | break; |
128 | case QuantifierNonGreedy: |
129 | type = ByteTerm::TypePatternCharacterNonGreedy; |
130 | break; |
131 | } |
132 | } |
133 | |
134 | ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
135 | : frameLocation(frameLocation) |
136 | , m_capture(false) |
137 | , m_invert(false) |
138 | { |
139 | switch (quantityType) { |
140 | case QuantifierFixedCount: |
141 | type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; |
142 | break; |
143 | case QuantifierGreedy: |
144 | type = ByteTerm::TypePatternCasedCharacterGreedy; |
145 | break; |
146 | case QuantifierNonGreedy: |
147 | type = ByteTerm::TypePatternCasedCharacterNonGreedy; |
148 | break; |
149 | } |
150 | |
151 | atom.casedCharacter.lo = lo; |
152 | atom.casedCharacter.hi = hi; |
153 | atom.quantityType = quantityType; |
154 | atom.quantityMinCount = quantityCount.unsafeGet(); |
155 | atom.quantityMaxCount = quantityCount.unsafeGet(); |
156 | inputPosition = inputPos; |
157 | } |
158 | |
159 | ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos) |
160 | : type(ByteTerm::TypeCharacterClass) |
161 | , m_capture(false) |
162 | , m_invert(invert) |
163 | { |
164 | atom.characterClass = characterClass; |
165 | atom.quantityType = QuantifierFixedCount; |
166 | atom.quantityMinCount = 1; |
167 | atom.quantityMaxCount = 1; |
168 | inputPosition = inputPos; |
169 | } |
170 | |
171 | ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos) |
172 | : type(type) |
173 | , m_capture(capture) |
174 | , m_invert(false) |
175 | { |
176 | atom.subpatternId = subpatternId; |
177 | atom.parenthesesDisjunction = parenthesesInfo; |
178 | atom.quantityType = QuantifierFixedCount; |
179 | atom.quantityMinCount = 1; |
180 | atom.quantityMaxCount = 1; |
181 | inputPosition = inputPos; |
182 | } |
183 | |
184 | ByteTerm(Type type, bool invert = false) |
185 | : type(type) |
186 | , m_capture(false) |
187 | , m_invert(invert) |
188 | { |
189 | atom.quantityType = QuantifierFixedCount; |
190 | atom.quantityMinCount = 1; |
191 | atom.quantityMaxCount = 1; |
192 | } |
193 | |
194 | ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos) |
195 | : type(type) |
196 | , m_capture(capture) |
197 | , m_invert(invert) |
198 | { |
199 | atom.subpatternId = subpatternId; |
200 | atom.quantityType = QuantifierFixedCount; |
201 | atom.quantityMinCount = 1; |
202 | atom.quantityMaxCount = 1; |
203 | inputPosition = inputPos; |
204 | } |
205 | |
206 | static ByteTerm BOL(unsigned inputPos) |
207 | { |
208 | ByteTerm term(TypeAssertionBOL); |
209 | term.inputPosition = inputPos; |
210 | return term; |
211 | } |
212 | |
213 | static ByteTerm CheckInput(Checked<unsigned> count) |
214 | { |
215 | ByteTerm term(TypeCheckInput); |
216 | term.checkInputCount = count.unsafeGet(); |
217 | return term; |
218 | } |
219 | |
220 | static ByteTerm UncheckInput(Checked<unsigned> count) |
221 | { |
222 | ByteTerm term(TypeUncheckInput); |
223 | term.checkInputCount = count.unsafeGet(); |
224 | return term; |
225 | } |
226 | |
227 | static ByteTerm EOL(unsigned inputPos) |
228 | { |
229 | ByteTerm term(TypeAssertionEOL); |
230 | term.inputPosition = inputPos; |
231 | return term; |
232 | } |
233 | |
234 | static ByteTerm WordBoundary(bool invert, unsigned inputPos) |
235 | { |
236 | ByteTerm term(TypeAssertionWordBoundary, invert); |
237 | term.inputPosition = inputPos; |
238 | return term; |
239 | } |
240 | |
241 | static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos) |
242 | { |
243 | return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); |
244 | } |
245 | |
246 | static ByteTerm BodyAlternativeBegin(bool onceThrough) |
247 | { |
248 | ByteTerm term(TypeBodyAlternativeBegin); |
249 | term.alternative.next = 0; |
250 | term.alternative.end = 0; |
251 | term.alternative.onceThrough = onceThrough; |
252 | return term; |
253 | } |
254 | |
255 | static ByteTerm BodyAlternativeDisjunction(bool onceThrough) |
256 | { |
257 | ByteTerm term(TypeBodyAlternativeDisjunction); |
258 | term.alternative.next = 0; |
259 | term.alternative.end = 0; |
260 | term.alternative.onceThrough = onceThrough; |
261 | return term; |
262 | } |
263 | |
264 | static ByteTerm BodyAlternativeEnd() |
265 | { |
266 | ByteTerm term(TypeBodyAlternativeEnd); |
267 | term.alternative.next = 0; |
268 | term.alternative.end = 0; |
269 | term.alternative.onceThrough = false; |
270 | return term; |
271 | } |
272 | |
273 | static ByteTerm AlternativeBegin() |
274 | { |
275 | ByteTerm term(TypeAlternativeBegin); |
276 | term.alternative.next = 0; |
277 | term.alternative.end = 0; |
278 | term.alternative.onceThrough = false; |
279 | return term; |
280 | } |
281 | |
282 | static ByteTerm AlternativeDisjunction() |
283 | { |
284 | ByteTerm term(TypeAlternativeDisjunction); |
285 | term.alternative.next = 0; |
286 | term.alternative.end = 0; |
287 | term.alternative.onceThrough = false; |
288 | return term; |
289 | } |
290 | |
291 | static ByteTerm AlternativeEnd() |
292 | { |
293 | ByteTerm term(TypeAlternativeEnd); |
294 | term.alternative.next = 0; |
295 | term.alternative.end = 0; |
296 | term.alternative.onceThrough = false; |
297 | return term; |
298 | } |
299 | |
300 | static ByteTerm SubpatternBegin() |
301 | { |
302 | return ByteTerm(TypeSubpatternBegin); |
303 | } |
304 | |
305 | static ByteTerm SubpatternEnd() |
306 | { |
307 | return ByteTerm(TypeSubpatternEnd); |
308 | } |
309 | |
310 | static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) |
311 | { |
312 | ByteTerm term(TypeDotStarEnclosure); |
313 | term.anchors.m_bol = bolAnchor; |
314 | term.anchors.m_eol = eolAnchor; |
315 | return term; |
316 | } |
317 | |
318 | bool invert() |
319 | { |
320 | return m_invert; |
321 | } |
322 | |
323 | bool capture() |
324 | { |
325 | return m_capture; |
326 | } |
327 | }; |
328 | |
329 | class ByteDisjunction { |
330 | WTF_MAKE_FAST_ALLOCATED; |
331 | public: |
332 | ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) |
333 | : m_numSubpatterns(numSubpatterns) |
334 | , m_frameSize(frameSize) |
335 | { |
336 | } |
337 | |
338 | size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); } |
339 | |
340 | Vector<ByteTerm> terms; |
341 | unsigned m_numSubpatterns; |
342 | unsigned m_frameSize; |
343 | }; |
344 | |
345 | struct BytecodePattern { |
346 | WTF_MAKE_FAST_ALLOCATED; |
347 | public: |
348 | BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) |
349 | : m_body(WTFMove(body)) |
350 | , m_flags(pattern.m_flags) |
351 | , m_allocator(allocator) |
352 | , m_lock(lock) |
353 | { |
354 | m_body->terms.shrinkToFit(); |
355 | |
356 | newlineCharacterClass = pattern.newlineCharacterClass(); |
357 | if (unicode() && ignoreCase()) |
358 | wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass(); |
359 | else |
360 | wordcharCharacterClass = pattern.wordcharCharacterClass(); |
361 | |
362 | m_allParenthesesInfo.swap(parenthesesInfoToAdopt); |
363 | m_allParenthesesInfo.shrinkToFit(); |
364 | |
365 | m_userCharacterClasses.swap(pattern.m_userCharacterClasses); |
366 | m_userCharacterClasses.shrinkToFit(); |
367 | } |
368 | |
369 | size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); } |
370 | |
371 | bool ignoreCase() const { return m_flags.contains(Flags::IgnoreCase); } |
372 | bool multiline() const { return m_flags.contains(Flags::Multiline); } |
373 | bool sticky() const { return m_flags.contains(Flags::Sticky); } |
374 | bool unicode() const { return m_flags.contains(Flags::Unicode); } |
375 | bool dotAll() const { return m_flags.contains(Flags::DotAll); } |
376 | |
377 | std::unique_ptr<ByteDisjunction> m_body; |
378 | OptionSet<Flags> m_flags; |
379 | // Each BytecodePattern is associated with a RegExp, each RegExp is associated |
380 | // with a VM. Cache a pointer to our VM's m_regExpAllocator. |
381 | BumpPointerAllocator* m_allocator; |
382 | ConcurrentJSLock* m_lock; |
383 | |
384 | CharacterClass* newlineCharacterClass; |
385 | CharacterClass* wordcharCharacterClass; |
386 | |
387 | private: |
388 | Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; |
389 | Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; |
390 | }; |
391 | |
392 | JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ConcurrentJSLock* = nullptr); |
393 | JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); |
394 | unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); |
395 | unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); |
396 | |
397 | } } // namespace JSC::Yarr |
398 | |