1/*
2 * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#pragma once
27
28#include "ConcurrentJSLock.h"
29#include "YarrFlags.h"
30#include "YarrPattern.h"
31
32namespace WTF {
33class BumpPointerAllocator;
34}
35using WTF::BumpPointerAllocator;
36
37namespace JSC { namespace Yarr {
38
39class ByteDisjunction;
40
41struct ByteTerm {
42 union {
43 struct {
44 union {
45 UChar32 patternCharacter;
46 struct {
47 UChar32 lo;
48 UChar32 hi;
49 } casedCharacter;
50 CharacterClass* characterClass;
51 unsigned subpatternId;
52 };
53 union {
54 ByteDisjunction* parenthesesDisjunction;
55 unsigned parenthesesWidth;
56 };
57 QuantifierType quantityType;
58 unsigned quantityMinCount;
59 unsigned quantityMaxCount;
60 } atom;
61 struct {
62 int next;
63 int end;
64 bool onceThrough;
65 } alternative;
66 struct {
67 bool m_bol : 1;
68 bool m_eol : 1;
69 } anchors;
70 unsigned checkInputCount;
71 };
72 unsigned frameLocation;
73 enum Type : uint8_t {
74 TypeBodyAlternativeBegin,
75 TypeBodyAlternativeDisjunction,
76 TypeBodyAlternativeEnd,
77 TypeAlternativeBegin,
78 TypeAlternativeDisjunction,
79 TypeAlternativeEnd,
80 TypeSubpatternBegin,
81 TypeSubpatternEnd,
82 TypeAssertionBOL,
83 TypeAssertionEOL,
84 TypeAssertionWordBoundary,
85 TypePatternCharacterOnce,
86 TypePatternCharacterFixed,
87 TypePatternCharacterGreedy,
88 TypePatternCharacterNonGreedy,
89 TypePatternCasedCharacterOnce,
90 TypePatternCasedCharacterFixed,
91 TypePatternCasedCharacterGreedy,
92 TypePatternCasedCharacterNonGreedy,
93 TypeCharacterClass,
94 TypeBackReference,
95 TypeParenthesesSubpattern,
96 TypeParenthesesSubpatternOnceBegin,
97 TypeParenthesesSubpatternOnceEnd,
98 TypeParenthesesSubpatternTerminalBegin,
99 TypeParenthesesSubpatternTerminalEnd,
100 TypeParentheticalAssertionBegin,
101 TypeParentheticalAssertionEnd,
102 TypeCheckInput,
103 TypeUncheckInput,
104 TypeDotStarEnclosure,
105 } type;
106 bool m_capture : 1;
107 bool m_invert : 1;
108 unsigned inputPosition;
109
110 ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
111 : frameLocation(frameLocation)
112 , m_capture(false)
113 , m_invert(false)
114 {
115 atom.patternCharacter = ch;
116 atom.quantityType = quantityType;
117 atom.quantityMinCount = quantityCount.unsafeGet();
118 atom.quantityMaxCount = quantityCount.unsafeGet();
119 inputPosition = inputPos;
120
121 switch (quantityType) {
122 case QuantifierFixedCount:
123 type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
124 break;
125 case QuantifierGreedy:
126 type = ByteTerm::TypePatternCharacterGreedy;
127 break;
128 case QuantifierNonGreedy:
129 type = ByteTerm::TypePatternCharacterNonGreedy;
130 break;
131 }
132 }
133
134 ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
135 : frameLocation(frameLocation)
136 , m_capture(false)
137 , m_invert(false)
138 {
139 switch (quantityType) {
140 case QuantifierFixedCount:
141 type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
142 break;
143 case QuantifierGreedy:
144 type = ByteTerm::TypePatternCasedCharacterGreedy;
145 break;
146 case QuantifierNonGreedy:
147 type = ByteTerm::TypePatternCasedCharacterNonGreedy;
148 break;
149 }
150
151 atom.casedCharacter.lo = lo;
152 atom.casedCharacter.hi = hi;
153 atom.quantityType = quantityType;
154 atom.quantityMinCount = quantityCount.unsafeGet();
155 atom.quantityMaxCount = quantityCount.unsafeGet();
156 inputPosition = inputPos;
157 }
158
159 ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos)
160 : type(ByteTerm::TypeCharacterClass)
161 , m_capture(false)
162 , m_invert(invert)
163 {
164 atom.characterClass = characterClass;
165 atom.quantityType = QuantifierFixedCount;
166 atom.quantityMinCount = 1;
167 atom.quantityMaxCount = 1;
168 inputPosition = inputPos;
169 }
170
171 ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos)
172 : type(type)
173 , m_capture(capture)
174 , m_invert(false)
175 {
176 atom.subpatternId = subpatternId;
177 atom.parenthesesDisjunction = parenthesesInfo;
178 atom.quantityType = QuantifierFixedCount;
179 atom.quantityMinCount = 1;
180 atom.quantityMaxCount = 1;
181 inputPosition = inputPos;
182 }
183
184 ByteTerm(Type type, bool invert = false)
185 : type(type)
186 , m_capture(false)
187 , m_invert(invert)
188 {
189 atom.quantityType = QuantifierFixedCount;
190 atom.quantityMinCount = 1;
191 atom.quantityMaxCount = 1;
192 }
193
194 ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos)
195 : type(type)
196 , m_capture(capture)
197 , m_invert(invert)
198 {
199 atom.subpatternId = subpatternId;
200 atom.quantityType = QuantifierFixedCount;
201 atom.quantityMinCount = 1;
202 atom.quantityMaxCount = 1;
203 inputPosition = inputPos;
204 }
205
206 static ByteTerm BOL(unsigned inputPos)
207 {
208 ByteTerm term(TypeAssertionBOL);
209 term.inputPosition = inputPos;
210 return term;
211 }
212
213 static ByteTerm CheckInput(Checked<unsigned> count)
214 {
215 ByteTerm term(TypeCheckInput);
216 term.checkInputCount = count.unsafeGet();
217 return term;
218 }
219
220 static ByteTerm UncheckInput(Checked<unsigned> count)
221 {
222 ByteTerm term(TypeUncheckInput);
223 term.checkInputCount = count.unsafeGet();
224 return term;
225 }
226
227 static ByteTerm EOL(unsigned inputPos)
228 {
229 ByteTerm term(TypeAssertionEOL);
230 term.inputPosition = inputPos;
231 return term;
232 }
233
234 static ByteTerm WordBoundary(bool invert, unsigned inputPos)
235 {
236 ByteTerm term(TypeAssertionWordBoundary, invert);
237 term.inputPosition = inputPos;
238 return term;
239 }
240
241 static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos)
242 {
243 return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
244 }
245
246 static ByteTerm BodyAlternativeBegin(bool onceThrough)
247 {
248 ByteTerm term(TypeBodyAlternativeBegin);
249 term.alternative.next = 0;
250 term.alternative.end = 0;
251 term.alternative.onceThrough = onceThrough;
252 return term;
253 }
254
255 static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
256 {
257 ByteTerm term(TypeBodyAlternativeDisjunction);
258 term.alternative.next = 0;
259 term.alternative.end = 0;
260 term.alternative.onceThrough = onceThrough;
261 return term;
262 }
263
264 static ByteTerm BodyAlternativeEnd()
265 {
266 ByteTerm term(TypeBodyAlternativeEnd);
267 term.alternative.next = 0;
268 term.alternative.end = 0;
269 term.alternative.onceThrough = false;
270 return term;
271 }
272
273 static ByteTerm AlternativeBegin()
274 {
275 ByteTerm term(TypeAlternativeBegin);
276 term.alternative.next = 0;
277 term.alternative.end = 0;
278 term.alternative.onceThrough = false;
279 return term;
280 }
281
282 static ByteTerm AlternativeDisjunction()
283 {
284 ByteTerm term(TypeAlternativeDisjunction);
285 term.alternative.next = 0;
286 term.alternative.end = 0;
287 term.alternative.onceThrough = false;
288 return term;
289 }
290
291 static ByteTerm AlternativeEnd()
292 {
293 ByteTerm term(TypeAlternativeEnd);
294 term.alternative.next = 0;
295 term.alternative.end = 0;
296 term.alternative.onceThrough = false;
297 return term;
298 }
299
300 static ByteTerm SubpatternBegin()
301 {
302 return ByteTerm(TypeSubpatternBegin);
303 }
304
305 static ByteTerm SubpatternEnd()
306 {
307 return ByteTerm(TypeSubpatternEnd);
308 }
309
310 static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
311 {
312 ByteTerm term(TypeDotStarEnclosure);
313 term.anchors.m_bol = bolAnchor;
314 term.anchors.m_eol = eolAnchor;
315 return term;
316 }
317
318 bool invert()
319 {
320 return m_invert;
321 }
322
323 bool capture()
324 {
325 return m_capture;
326 }
327};
328
329class ByteDisjunction {
330 WTF_MAKE_FAST_ALLOCATED;
331public:
332 ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
333 : m_numSubpatterns(numSubpatterns)
334 , m_frameSize(frameSize)
335 {
336 }
337
338 size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); }
339
340 Vector<ByteTerm> terms;
341 unsigned m_numSubpatterns;
342 unsigned m_frameSize;
343};
344
345struct BytecodePattern {
346 WTF_MAKE_FAST_ALLOCATED;
347public:
348 BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
349 : m_body(WTFMove(body))
350 , m_flags(pattern.m_flags)
351 , m_allocator(allocator)
352 , m_lock(lock)
353 {
354 m_body->terms.shrinkToFit();
355
356 newlineCharacterClass = pattern.newlineCharacterClass();
357 if (unicode() && ignoreCase())
358 wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
359 else
360 wordcharCharacterClass = pattern.wordcharCharacterClass();
361
362 m_allParenthesesInfo.swap(parenthesesInfoToAdopt);
363 m_allParenthesesInfo.shrinkToFit();
364
365 m_userCharacterClasses.swap(pattern.m_userCharacterClasses);
366 m_userCharacterClasses.shrinkToFit();
367 }
368
369 size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); }
370
371 bool ignoreCase() const { return m_flags.contains(Flags::IgnoreCase); }
372 bool multiline() const { return m_flags.contains(Flags::Multiline); }
373 bool sticky() const { return m_flags.contains(Flags::Sticky); }
374 bool unicode() const { return m_flags.contains(Flags::Unicode); }
375 bool dotAll() const { return m_flags.contains(Flags::DotAll); }
376
377 std::unique_ptr<ByteDisjunction> m_body;
378 OptionSet<Flags> m_flags;
379 // Each BytecodePattern is associated with a RegExp, each RegExp is associated
380 // with a VM. Cache a pointer to our VM's m_regExpAllocator.
381 BumpPointerAllocator* m_allocator;
382 ConcurrentJSLock* m_lock;
383
384 CharacterClass* newlineCharacterClass;
385 CharacterClass* wordcharCharacterClass;
386
387private:
388 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
389 Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses;
390};
391
392JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ConcurrentJSLock* = nullptr);
393JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output);
394unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
395unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
396
397} } // namespace JSC::Yarr
398