1/*
2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrInterpreter.h"
29
30#include "Options.h"
31#include "SuperSampler.h"
32#include "Yarr.h"
33#include "YarrCanonicalize.h"
34#include <wtf/BumpPointerAllocator.h>
35#include <wtf/CheckedArithmetic.h>
36#include <wtf/DataLog.h>
37#include <wtf/text/CString.h>
38#include <wtf/text/WTFString.h>
39
40namespace JSC { namespace Yarr {
41
42template<typename CharType>
43class Interpreter {
44public:
45 struct ParenthesesDisjunctionContext;
46
47 struct BackTrackInfoParentheses {
48 uintptr_t matchAmount;
49 ParenthesesDisjunctionContext* lastContext;
50 };
51
52 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
53 {
54 context->next = backTrack->lastContext;
55 backTrack->lastContext = context;
56 ++backTrack->matchAmount;
57 }
58
59 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
60 {
61 RELEASE_ASSERT(backTrack->matchAmount);
62 RELEASE_ASSERT(backTrack->lastContext);
63 backTrack->lastContext = backTrack->lastContext->next;
64 --backTrack->matchAmount;
65 }
66
67 struct DisjunctionContext
68 {
69 DisjunctionContext() = default;
70
71 void* operator new(size_t, void* where)
72 {
73 return where;
74 }
75
76 static size_t allocationSize(unsigned numberOfFrames)
77 {
78 static_assert(alignof(DisjunctionContext) <= sizeof(void*), "");
79 size_t rawSize = (sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t)).unsafeGet();
80 size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
81 RELEASE_ASSERT(roundedSize >= rawSize);
82 return roundedSize;
83 }
84
85 int term { 0 };
86 unsigned matchBegin;
87 unsigned matchEnd;
88 uintptr_t frame[1];
89 };
90
91 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
92 {
93 size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize);
94 allocatorPool = allocatorPool->ensureCapacity(size);
95 RELEASE_ASSERT(allocatorPool);
96 return new (allocatorPool->alloc(size)) DisjunctionContext();
97 }
98
99 void freeDisjunctionContext(DisjunctionContext* context)
100 {
101 allocatorPool = allocatorPool->dealloc(context);
102 }
103
104 struct ParenthesesDisjunctionContext
105 {
106 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
107 {
108 unsigned firstSubpatternId = term.atom.subpatternId;
109 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
110
111 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
112 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
113 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
114 }
115
116 new (getDisjunctionContext(term)) DisjunctionContext();
117 }
118
119 void* operator new(size_t, void* where)
120 {
121 return where;
122 }
123
124 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
125 {
126 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
127 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
128 }
129
130 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
131 {
132 return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns));
133 }
134
135 static size_t allocationSize(unsigned numberOfSubpatterns)
136 {
137 static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*), "");
138 size_t rawSize = (sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (Checked<size_t>(numberOfSubpatterns) * 2U) * sizeof(unsigned)).unsafeGet();
139 size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
140 RELEASE_ASSERT(roundedSize >= rawSize);
141 return roundedSize;
142 }
143
144 ParenthesesDisjunctionContext* next { nullptr };
145 unsigned subpatternBackup[1];
146 };
147
148 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
149 {
150 size_t size = (Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)) + DisjunctionContext::allocationSize(disjunction->m_frameSize)).unsafeGet();
151 allocatorPool = allocatorPool->ensureCapacity(size);
152 RELEASE_ASSERT(allocatorPool);
153 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
154 }
155
156 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
157 {
158 allocatorPool = allocatorPool->dealloc(context);
159 }
160
161 class InputStream {
162 public:
163 InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs)
164 : input(input)
165 , pos(start)
166 , length(length)
167 , decodeSurrogatePairs(decodeSurrogatePairs)
168 {
169 }
170
171 void next()
172 {
173 ++pos;
174 }
175
176 void rewind(unsigned amount)
177 {
178 ASSERT(pos >= amount);
179 pos -= amount;
180 }
181
182 int read()
183 {
184 ASSERT(pos < length);
185 if (pos < length)
186 return input[pos];
187 return -1;
188 }
189
190 int readPair()
191 {
192 ASSERT(pos + 1 < length);
193 return input[pos] | input[pos + 1] << 16;
194 }
195
196 int readChecked(unsigned negativePositionOffest)
197 {
198 RELEASE_ASSERT(pos >= negativePositionOffest);
199 unsigned p = pos - negativePositionOffest;
200 ASSERT(p < length);
201 int result = input[p];
202 if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) {
203 if (atEnd())
204 return -1;
205
206 result = U16_GET_SUPPLEMENTARY(result, input[p + 1]);
207 next();
208 }
209 return result;
210 }
211
212 int readSurrogatePairChecked(unsigned negativePositionOffset)
213 {
214 RELEASE_ASSERT(pos >= negativePositionOffset);
215 unsigned p = pos - negativePositionOffset;
216 ASSERT(p < length);
217 if (p + 1 >= length)
218 return -1;
219
220 int first = input[p];
221 int second = input[p + 1];
222 if (U16_IS_LEAD(first) && U16_IS_TRAIL(second))
223 return U16_GET_SUPPLEMENTARY(first, second);
224
225 return -1;
226 }
227
228 int reread(unsigned from)
229 {
230 ASSERT(from < length);
231 int result = input[from];
232 if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1]))
233 result = U16_GET_SUPPLEMENTARY(result, input[from + 1]);
234 return result;
235 }
236
237 int prev()
238 {
239 ASSERT(!(pos > length));
240 if (pos && length)
241 return input[pos - 1];
242 return -1;
243 }
244
245 unsigned getPos()
246 {
247 return pos;
248 }
249
250 void setPos(unsigned p)
251 {
252 pos = p;
253 }
254
255 bool atStart()
256 {
257 return pos == 0;
258 }
259
260 bool atEnd()
261 {
262 return pos == length;
263 }
264
265 unsigned end()
266 {
267 return length;
268 }
269
270 bool checkInput(unsigned count)
271 {
272 if (((pos + count) <= length) && ((pos + count) >= pos)) {
273 pos += count;
274 return true;
275 }
276 return false;
277 }
278
279 void uncheckInput(unsigned count)
280 {
281 RELEASE_ASSERT(pos >= count);
282 pos -= count;
283 }
284
285 bool atStart(unsigned negativePositionOffset)
286 {
287 return pos == negativePositionOffset;
288 }
289
290 bool atEnd(unsigned negativePositionOffest)
291 {
292 RELEASE_ASSERT(pos >= negativePositionOffest);
293 return (pos - negativePositionOffest) == length;
294 }
295
296 bool isAvailableInput(unsigned offset)
297 {
298 return (((pos + offset) <= length) && ((pos + offset) >= pos));
299 }
300
301 private:
302 const CharType* input;
303 unsigned pos;
304 unsigned length;
305 bool decodeSurrogatePairs;
306 };
307
308 bool testCharacterClass(CharacterClass* characterClass, int ch)
309 {
310 auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) {
311 for (unsigned i = 0; i < matches.size(); ++i) {
312 if (ch == matches[i])
313 return true;
314 }
315
316 return false;
317 };
318
319 auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) {
320 size_t low = 0;
321 size_t high = matches.size() - 1;
322
323 while (low <= high) {
324 size_t mid = low + (high - low) / 2;
325 int diff = ch - matches[mid];
326 if (!diff)
327 return true;
328
329 if (diff < 0) {
330 if (mid == low)
331 return false;
332 high = mid - 1;
333 } else
334 low = mid + 1;
335 }
336 return false;
337 };
338
339 auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
340 for (unsigned i = 0; i < ranges.size(); ++i) {
341 if ((ch >= ranges[i].begin) && (ch <= ranges[i].end))
342 return true;
343 }
344
345 return false;
346 };
347
348 auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
349 size_t low = 0;
350 size_t high = ranges.size() - 1;
351
352 while (low <= high) {
353 size_t mid = low + (high - low) / 2;
354 int rangeBeginDiff = ch - ranges[mid].begin;
355 if (rangeBeginDiff >= 0 && ch <= ranges[mid].end)
356 return true;
357
358 if (rangeBeginDiff < 0) {
359 if (mid == low)
360 return false;
361 high = mid - 1;
362 } else
363 low = mid + 1;
364 }
365 return false;
366 };
367
368 if (characterClass->m_anyCharacter)
369 return true;
370
371 const size_t thresholdForBinarySearch = 6;
372
373 if (!isASCII(ch)) {
374 if (characterClass->m_matchesUnicode.size()) {
375 if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) {
376 if (binarySearchMatches(characterClass->m_matchesUnicode))
377 return true;
378 } else if (linearSearchMatches(characterClass->m_matchesUnicode))
379 return true;
380 }
381
382 if (characterClass->m_rangesUnicode.size()) {
383 if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) {
384 if (binarySearchRanges(characterClass->m_rangesUnicode))
385 return true;
386 } else if (linearSearchRanges(characterClass->m_rangesUnicode))
387 return true;
388 }
389 } else {
390 if (characterClass->m_matches.size()) {
391 if (characterClass->m_matches.size() > thresholdForBinarySearch) {
392 if (binarySearchMatches(characterClass->m_matches))
393 return true;
394 } else if (linearSearchMatches(characterClass->m_matches))
395 return true;
396 }
397
398 if (characterClass->m_ranges.size()) {
399 if (characterClass->m_ranges.size() > thresholdForBinarySearch) {
400 if (binarySearchRanges(characterClass->m_ranges))
401 return true;
402 } else if (linearSearchRanges(characterClass->m_ranges))
403 return true;
404 }
405 }
406
407 return false;
408 }
409
410 bool checkCharacter(int testChar, unsigned negativeInputOffset)
411 {
412 return testChar == input.readChecked(negativeInputOffset);
413 }
414
415 bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset)
416 {
417 return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset);
418 }
419
420 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
421 {
422 int ch = input.readChecked(negativeInputOffset);
423 return (loChar == ch) || (hiChar == ch);
424 }
425
426 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
427 {
428 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
429 return invert ? !match : match;
430 }
431
432 bool checkCharacterClassDontAdvanceInputForNonBMP(CharacterClass* characterClass, unsigned negativeInputOffset)
433 {
434 int readCharacter = characterClass->hasOnlyNonBMPCharacters() ? input.readSurrogatePairChecked(negativeInputOffset) : input.readChecked(negativeInputOffset);
435 return testCharacterClass(characterClass, readCharacter);
436 }
437
438 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
439 {
440 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
441
442 if (!input.checkInput(matchSize))
443 return false;
444
445 for (unsigned i = 0; i < matchSize; ++i) {
446 int oldCh = input.reread(matchBegin + i);
447 int ch;
448 if (!U_IS_BMP(oldCh)) {
449 ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i);
450 ++i;
451 } else
452 ch = input.readChecked(negativeInputOffset + matchSize - i);
453
454 if (oldCh == ch)
455 continue;
456
457 if (pattern->ignoreCase()) {
458 // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode
459 // patterns, Unicode values are never allowed to match against ASCII ones.
460 // For Unicode, we need to check all canonical equivalents of a character.
461 if (!unicode && (isASCII(oldCh) || isASCII(ch))) {
462 if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
463 continue;
464 } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2))
465 continue;
466 }
467
468 input.uncheckInput(matchSize);
469 return false;
470 }
471
472 return true;
473 }
474
475 bool matchAssertionBOL(ByteTerm& term)
476 {
477 return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
478 }
479
480 bool matchAssertionEOL(ByteTerm& term)
481 {
482 if (term.inputPosition)
483 return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
484
485 return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read()));
486 }
487
488 bool matchAssertionWordBoundary(ByteTerm& term)
489 {
490 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
491 bool readIsWordchar;
492 if (term.inputPosition)
493 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
494 else
495 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
496
497 bool wordBoundary = prevIsWordchar != readIsWordchar;
498 return term.invert() ? !wordBoundary : wordBoundary;
499 }
500
501 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
502 {
503 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
504
505 switch (term.atom.quantityType) {
506 case QuantifierFixedCount:
507 break;
508
509 case QuantifierGreedy:
510 if (backTrack->matchAmount) {
511 --backTrack->matchAmount;
512 input.uncheckInput(U16_LENGTH(term.atom.patternCharacter));
513 return true;
514 }
515 break;
516
517 case QuantifierNonGreedy:
518 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
519 ++backTrack->matchAmount;
520 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
521 return true;
522 }
523 input.setPos(backTrack->begin);
524 break;
525 }
526
527 return false;
528 }
529
530 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
531 {
532 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
533
534 switch (term.atom.quantityType) {
535 case QuantifierFixedCount:
536 break;
537
538 case QuantifierGreedy:
539 if (backTrack->matchAmount) {
540 --backTrack->matchAmount;
541 input.uncheckInput(1);
542 return true;
543 }
544 break;
545
546 case QuantifierNonGreedy:
547 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
548 ++backTrack->matchAmount;
549 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
550 return true;
551 }
552 input.uncheckInput(backTrack->matchAmount);
553 break;
554 }
555
556 return false;
557 }
558
559 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
560 {
561 ASSERT(term.type == ByteTerm::TypeCharacterClass);
562 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
563
564 switch (term.atom.quantityType) {
565 case QuantifierFixedCount: {
566 if (unicode) {
567 CharacterClass* charClass = term.atom.characterClass;
568 backTrack->begin = input.getPos();
569 unsigned matchAmount = 0;
570 for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
571 if (term.invert()) {
572 if (!checkCharacterClass(charClass, term.invert(), term.inputPosition - matchAmount)) {
573 input.setPos(backTrack->begin);
574 return false;
575 }
576 } else {
577 unsigned matchOffset = matchAmount * (charClass->hasOnlyNonBMPCharacters() ? 2 : 1);
578 if (!checkCharacterClassDontAdvanceInputForNonBMP(charClass, term.inputPosition - matchOffset)) {
579 input.setPos(backTrack->begin);
580 return false;
581 }
582 }
583 }
584
585 return true;
586 }
587
588 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
589 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
590 return false;
591 }
592 return true;
593 }
594
595 case QuantifierGreedy: {
596 unsigned position = input.getPos();
597 backTrack->begin = position;
598 unsigned matchAmount = 0;
599 while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
600 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
601 input.setPos(position);
602 break;
603 }
604 ++matchAmount;
605 position = input.getPos();
606 }
607 backTrack->matchAmount = matchAmount;
608
609 return true;
610 }
611
612 case QuantifierNonGreedy:
613 backTrack->begin = input.getPos();
614 backTrack->matchAmount = 0;
615 return true;
616 }
617
618 RELEASE_ASSERT_NOT_REACHED();
619 return false;
620 }
621
622 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
623 {
624 ASSERT(term.type == ByteTerm::TypeCharacterClass);
625 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
626
627 switch (term.atom.quantityType) {
628 case QuantifierFixedCount:
629 if (unicode)
630 input.setPos(backTrack->begin);
631 break;
632
633 case QuantifierGreedy:
634 if (backTrack->matchAmount) {
635 if (unicode) {
636 // Rematch one less match
637 input.setPos(backTrack->begin);
638 --backTrack->matchAmount;
639 for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) {
640 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
641 input.uncheckInput(1);
642 break;
643 }
644 }
645 return true;
646 }
647 --backTrack->matchAmount;
648 input.uncheckInput(1);
649 return true;
650 }
651 break;
652
653 case QuantifierNonGreedy:
654 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
655 ++backTrack->matchAmount;
656 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
657 return true;
658 }
659 input.setPos(backTrack->begin);
660 break;
661 }
662
663 return false;
664 }
665
666 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
667 {
668 ASSERT(term.type == ByteTerm::TypeBackReference);
669 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
670
671 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
672 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
673
674 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
675 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
676 // Eg.: /(a\1)/
677 if (matchEnd == offsetNoMatch)
678 return true;
679
680 if (matchBegin == offsetNoMatch)
681 return true;
682
683 ASSERT(matchBegin <= matchEnd);
684
685 if (matchBegin == matchEnd)
686 return true;
687
688 switch (term.atom.quantityType) {
689 case QuantifierFixedCount: {
690 backTrack->begin = input.getPos();
691 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
692 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
693 input.setPos(backTrack->begin);
694 return false;
695 }
696 }
697 return true;
698 }
699
700 case QuantifierGreedy: {
701 unsigned matchAmount = 0;
702 while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
703 ++matchAmount;
704 backTrack->matchAmount = matchAmount;
705 return true;
706 }
707
708 case QuantifierNonGreedy:
709 backTrack->begin = input.getPos();
710 backTrack->matchAmount = 0;
711 return true;
712 }
713
714 RELEASE_ASSERT_NOT_REACHED();
715 return false;
716 }
717
718 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
719 {
720 ASSERT(term.type == ByteTerm::TypeBackReference);
721 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
722
723 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
724 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
725
726 if (matchBegin == offsetNoMatch)
727 return false;
728
729 ASSERT(matchBegin <= matchEnd);
730
731 if (matchBegin == matchEnd)
732 return false;
733
734 switch (term.atom.quantityType) {
735 case QuantifierFixedCount:
736 // for quantityMaxCount == 1, could rewind.
737 input.setPos(backTrack->begin);
738 break;
739
740 case QuantifierGreedy:
741 if (backTrack->matchAmount) {
742 --backTrack->matchAmount;
743 input.rewind(matchEnd - matchBegin);
744 return true;
745 }
746 break;
747
748 case QuantifierNonGreedy:
749 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
750 ++backTrack->matchAmount;
751 return true;
752 }
753 input.setPos(backTrack->begin);
754 break;
755 }
756
757 return false;
758 }
759
760 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
761 {
762 if (term.capture()) {
763 unsigned subpatternId = term.atom.subpatternId;
764 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition;
765 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition;
766 }
767 }
768 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
769 {
770 unsigned firstSubpatternId = term.atom.subpatternId;
771 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
772 context->restoreOutput(output, firstSubpatternId, count);
773 }
774 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
775 {
776 while (backTrack->matchAmount) {
777 ParenthesesDisjunctionContext* context = backTrack->lastContext;
778
779 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
780 if (result == JSRegExpMatch)
781 return JSRegExpMatch;
782
783 resetMatches(term, context);
784 popParenthesesDisjunctionContext(backTrack);
785 freeParenthesesDisjunctionContext(context);
786
787 if (result != JSRegExpNoMatch)
788 return result;
789 }
790
791 return JSRegExpNoMatch;
792 }
793
794 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
795 {
796 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
797 ASSERT(term.atom.quantityMaxCount == 1);
798
799 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
800
801 switch (term.atom.quantityType) {
802 case QuantifierGreedy: {
803 // set this speculatively; if we get to the parens end this will be true.
804 backTrack->begin = input.getPos();
805 break;
806 }
807 case QuantifierNonGreedy: {
808 backTrack->begin = notFound;
809 context->term += term.atom.parenthesesWidth;
810 return true;
811 }
812 case QuantifierFixedCount:
813 break;
814 }
815
816 if (term.capture()) {
817 unsigned subpatternId = term.atom.subpatternId;
818 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
819 }
820
821 return true;
822 }
823
824 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
825 {
826 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
827 ASSERT(term.atom.quantityMaxCount == 1);
828
829 if (term.capture()) {
830 unsigned subpatternId = term.atom.subpatternId;
831 output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition;
832 }
833
834 if (term.atom.quantityType == QuantifierFixedCount)
835 return true;
836
837 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
838 return backTrack->begin != input.getPos();
839 }
840
841 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
842 {
843 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
844 ASSERT(term.atom.quantityMaxCount == 1);
845
846 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
847
848 if (term.capture()) {
849 unsigned subpatternId = term.atom.subpatternId;
850 output[(subpatternId << 1)] = offsetNoMatch;
851 output[(subpatternId << 1) + 1] = offsetNoMatch;
852 }
853
854 switch (term.atom.quantityType) {
855 case QuantifierGreedy:
856 // if we backtrack to this point, there is another chance - try matching nothing.
857 ASSERT(backTrack->begin != notFound);
858 backTrack->begin = notFound;
859 context->term += term.atom.parenthesesWidth;
860 return true;
861 case QuantifierNonGreedy:
862 ASSERT(backTrack->begin != notFound);
863 FALLTHROUGH;
864 case QuantifierFixedCount:
865 break;
866 }
867
868 return false;
869 }
870
871 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
872 {
873 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
874 ASSERT(term.atom.quantityMaxCount == 1);
875
876 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
877
878 switch (term.atom.quantityType) {
879 case QuantifierGreedy:
880 if (backTrack->begin == notFound) {
881 context->term -= term.atom.parenthesesWidth;
882 return false;
883 }
884 FALLTHROUGH;
885 case QuantifierNonGreedy:
886 if (backTrack->begin == notFound) {
887 backTrack->begin = input.getPos();
888 if (term.capture()) {
889 // Technically this access to inputPosition should be accessing the begin term's
890 // inputPosition, but for repeats other than fixed these values should be
891 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
892 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
893 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
894 unsigned subpatternId = term.atom.subpatternId;
895 output[subpatternId << 1] = input.getPos() - term.inputPosition;
896 }
897 context->term -= term.atom.parenthesesWidth;
898 return true;
899 }
900 FALLTHROUGH;
901 case QuantifierFixedCount:
902 break;
903 }
904
905 return false;
906 }
907
908 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
909 {
910 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
911 ASSERT(term.atom.quantityType == QuantifierGreedy);
912 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
913 ASSERT(!term.capture());
914
915 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
916 backTrack->begin = input.getPos();
917 return true;
918 }
919
920 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
921 {
922 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
923
924 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
925 // Empty match is a failed match.
926 if (backTrack->begin == input.getPos())
927 return false;
928
929 // Successful match! Okay, what's next? - loop around and try to match more!
930 context->term -= (term.atom.parenthesesWidth + 1);
931 return true;
932 }
933
934 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
935 {
936 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
937 ASSERT(term.atom.quantityType == QuantifierGreedy);
938 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
939 ASSERT(!term.capture());
940
941 // If we backtrack to this point, we have failed to match this iteration of the parens.
942 // Since this is greedy / zero minimum a failed is also accepted as a match!
943 context->term += term.atom.parenthesesWidth;
944 return true;
945 }
946
947 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
948 {
949 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
950 // should always be returned as a successful match - we should never backtrack to here.
951 RELEASE_ASSERT_NOT_REACHED();
952 return false;
953 }
954
955 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
956 {
957 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
958 ASSERT(term.atom.quantityMaxCount == 1);
959
960 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
961
962 backTrack->begin = input.getPos();
963 return true;
964 }
965
966 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
967 {
968 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
969 ASSERT(term.atom.quantityMaxCount == 1);
970
971 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
972
973 input.setPos(backTrack->begin);
974
975 // We've reached the end of the parens; if they are inverted, this is failure.
976 if (term.invert()) {
977 context->term -= term.atom.parenthesesWidth;
978 return false;
979 }
980
981 return true;
982 }
983
984 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
985 {
986 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
987 ASSERT(term.atom.quantityMaxCount == 1);
988
989 // We've failed to match parens; if they are inverted, this is win!
990 if (term.invert()) {
991 context->term += term.atom.parenthesesWidth;
992 return true;
993 }
994
995 return false;
996 }
997
998 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
999 {
1000 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
1001 ASSERT(term.atom.quantityMaxCount == 1);
1002
1003 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
1004
1005 input.setPos(backTrack->begin);
1006
1007 context->term -= term.atom.parenthesesWidth;
1008 return false;
1009 }
1010
1011 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
1012 {
1013 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1014
1015 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1016 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1017
1018 backTrack->matchAmount = 0;
1019 backTrack->lastContext = 0;
1020
1021 ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount);
1022
1023 unsigned minimumMatchCount = term.atom.quantityMinCount;
1024 JSRegExpResult fixedMatchResult;
1025
1026 // Handle fixed matches and the minimum part of a variable length match.
1027 if (minimumMatchCount) {
1028 // While we haven't yet reached our fixed limit,
1029 while (backTrack->matchAmount < minimumMatchCount) {
1030 // Try to do a match, and it it succeeds, add it to the list.
1031 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1032 fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1033 if (fixedMatchResult == JSRegExpMatch)
1034 appendParenthesesDisjunctionContext(backTrack, context);
1035 else {
1036 // The match failed; try to find an alternate point to carry on from.
1037 resetMatches(term, context);
1038 freeParenthesesDisjunctionContext(context);
1039
1040 if (fixedMatchResult != JSRegExpNoMatch)
1041 return fixedMatchResult;
1042 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1043 if (backtrackResult != JSRegExpMatch)
1044 return backtrackResult;
1045 }
1046 }
1047
1048 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1049 recordParenthesesMatch(term, context);
1050 }
1051
1052 switch (term.atom.quantityType) {
1053 case QuantifierFixedCount: {
1054 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1055 return JSRegExpMatch;
1056 }
1057
1058 case QuantifierGreedy: {
1059 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1060 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1061 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1062 if (result == JSRegExpMatch)
1063 appendParenthesesDisjunctionContext(backTrack, context);
1064 else {
1065 resetMatches(term, context);
1066 freeParenthesesDisjunctionContext(context);
1067
1068 if (result != JSRegExpNoMatch)
1069 return result;
1070
1071 break;
1072 }
1073 }
1074
1075 if (backTrack->matchAmount) {
1076 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1077 recordParenthesesMatch(term, context);
1078 }
1079 return JSRegExpMatch;
1080 }
1081
1082 case QuantifierNonGreedy:
1083 return JSRegExpMatch;
1084 }
1085
1086 RELEASE_ASSERT_NOT_REACHED();
1087 return JSRegExpErrorNoMatch;
1088 }
1089
1090 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
1091 //
1092 // Greedy matches never should try just adding more - you should already have done
1093 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
1094 // you backtrack an item off the list needs checking, since we'll never have matched
1095 // the one less case. Tracking forwards, still add as much as possible.
1096 //
1097 // Non-greedy, we've already done the one less case, so don't match on popping.
1098 // We haven't done the one more case, so always try to add that.
1099 //
1100 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
1101 {
1102 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1103
1104 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1105 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1106
1107 switch (term.atom.quantityType) {
1108 case QuantifierFixedCount: {
1109 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1110
1111 ParenthesesDisjunctionContext* context = 0;
1112 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
1113
1114 if (result != JSRegExpMatch)
1115 return result;
1116
1117 // While we haven't yet reached our fixed limit,
1118 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1119 // Try to do a match, and it it succeeds, add it to the list.
1120 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1121 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1122
1123 if (result == JSRegExpMatch)
1124 appendParenthesesDisjunctionContext(backTrack, context);
1125 else {
1126 // The match failed; try to find an alternate point to carry on from.
1127 resetMatches(term, context);
1128 freeParenthesesDisjunctionContext(context);
1129
1130 if (result != JSRegExpNoMatch)
1131 return result;
1132 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1133 if (backtrackResult != JSRegExpMatch)
1134 return backtrackResult;
1135 }
1136 }
1137
1138 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1139 context = backTrack->lastContext;
1140 recordParenthesesMatch(term, context);
1141 return JSRegExpMatch;
1142 }
1143
1144 case QuantifierGreedy: {
1145 if (!backTrack->matchAmount)
1146 return JSRegExpNoMatch;
1147
1148 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1149 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1150 if (result == JSRegExpMatch) {
1151 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1152 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1153 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1154 if (parenthesesResult == JSRegExpMatch)
1155 appendParenthesesDisjunctionContext(backTrack, context);
1156 else {
1157 resetMatches(term, context);
1158 freeParenthesesDisjunctionContext(context);
1159
1160 if (parenthesesResult != JSRegExpNoMatch)
1161 return parenthesesResult;
1162
1163 break;
1164 }
1165 }
1166 } else {
1167 resetMatches(term, context);
1168 popParenthesesDisjunctionContext(backTrack);
1169 freeParenthesesDisjunctionContext(context);
1170
1171 if (result != JSRegExpNoMatch || backTrack->matchAmount < term.atom.quantityMinCount)
1172 return result;
1173 }
1174
1175 if (backTrack->matchAmount) {
1176 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1177 recordParenthesesMatch(term, context);
1178 }
1179 return JSRegExpMatch;
1180 }
1181
1182 case QuantifierNonGreedy: {
1183 // If we've not reached the limit, try to add one more match.
1184 if (backTrack->matchAmount < term.atom.quantityMaxCount) {
1185 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1186 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1187 if (result == JSRegExpMatch) {
1188 appendParenthesesDisjunctionContext(backTrack, context);
1189 recordParenthesesMatch(term, context);
1190 return JSRegExpMatch;
1191 }
1192
1193 resetMatches(term, context);
1194 freeParenthesesDisjunctionContext(context);
1195
1196 if (result != JSRegExpNoMatch)
1197 return result;
1198 }
1199
1200 // Nope - okay backtrack looking for an alternative.
1201 while (backTrack->matchAmount) {
1202 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1203 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1204 if (result == JSRegExpMatch) {
1205 // successful backtrack! we're back in the game!
1206 if (backTrack->matchAmount) {
1207 context = backTrack->lastContext;
1208 recordParenthesesMatch(term, context);
1209 }
1210 return JSRegExpMatch;
1211 }
1212
1213 // pop a match off the stack
1214 resetMatches(term, context);
1215 popParenthesesDisjunctionContext(backTrack);
1216 freeParenthesesDisjunctionContext(context);
1217
1218 if (result != JSRegExpNoMatch)
1219 return result;
1220 }
1221
1222 return JSRegExpNoMatch;
1223 }
1224 }
1225
1226 RELEASE_ASSERT_NOT_REACHED();
1227 return JSRegExpErrorNoMatch;
1228 }
1229
1230 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1231 {
1232 UNUSED_PARAM(term);
1233
1234 if (pattern->dotAll()) {
1235 context->matchBegin = startOffset;
1236 context->matchEnd = input.end();
1237 return true;
1238 }
1239
1240 unsigned matchBegin = context->matchBegin;
1241
1242 if (matchBegin > startOffset) {
1243 for (matchBegin--; true; matchBegin--) {
1244 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1245 ++matchBegin;
1246 break;
1247 }
1248
1249 if (matchBegin == startOffset)
1250 break;
1251 }
1252 }
1253
1254 unsigned matchEnd = input.getPos();
1255
1256 for (; (matchEnd != input.end())
1257 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1258
1259 if (((matchBegin && term.anchors.m_bol)
1260 || ((matchEnd != input.end()) && term.anchors.m_eol))
1261 && !pattern->multiline())
1262 return false;
1263
1264 context->matchBegin = matchBegin;
1265 context->matchEnd = matchEnd;
1266 return true;
1267 }
1268
1269#define MATCH_NEXT() { ++context->term; goto matchAgain; }
1270#define BACKTRACK() { --context->term; goto backtrack; }
1271#define currentTerm() (disjunction->terms[context->term])
1272 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1273 {
1274 if (!--remainingMatchCount)
1275 return JSRegExpErrorHitLimit;
1276
1277 if (btrack)
1278 BACKTRACK();
1279
1280 context->matchBegin = input.getPos();
1281 context->term = 0;
1282
1283 matchAgain:
1284 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1285
1286 switch (currentTerm().type) {
1287 case ByteTerm::TypeSubpatternBegin:
1288 MATCH_NEXT();
1289 case ByteTerm::TypeSubpatternEnd:
1290 context->matchEnd = input.getPos();
1291 return JSRegExpMatch;
1292
1293 case ByteTerm::TypeBodyAlternativeBegin:
1294 MATCH_NEXT();
1295 case ByteTerm::TypeBodyAlternativeDisjunction:
1296 case ByteTerm::TypeBodyAlternativeEnd:
1297 context->matchEnd = input.getPos();
1298 return JSRegExpMatch;
1299
1300 case ByteTerm::TypeAlternativeBegin:
1301 MATCH_NEXT();
1302 case ByteTerm::TypeAlternativeDisjunction:
1303 case ByteTerm::TypeAlternativeEnd: {
1304 int offset = currentTerm().alternative.end;
1305 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1306 backTrack->offset = offset;
1307 context->term += offset;
1308 MATCH_NEXT();
1309 }
1310
1311 case ByteTerm::TypeAssertionBOL:
1312 if (matchAssertionBOL(currentTerm()))
1313 MATCH_NEXT();
1314 BACKTRACK();
1315 case ByteTerm::TypeAssertionEOL:
1316 if (matchAssertionEOL(currentTerm()))
1317 MATCH_NEXT();
1318 BACKTRACK();
1319 case ByteTerm::TypeAssertionWordBoundary:
1320 if (matchAssertionWordBoundary(currentTerm()))
1321 MATCH_NEXT();
1322 BACKTRACK();
1323
1324 case ByteTerm::TypePatternCharacterOnce:
1325 case ByteTerm::TypePatternCharacterFixed: {
1326 if (unicode) {
1327 if (!U_IS_BMP(currentTerm().atom.patternCharacter)) {
1328 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1329 if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) {
1330 BACKTRACK();
1331 }
1332 }
1333 MATCH_NEXT();
1334 }
1335 }
1336 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1337
1338 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1339 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) {
1340 input.setPos(position);
1341 BACKTRACK();
1342 }
1343 }
1344 MATCH_NEXT();
1345 }
1346 case ByteTerm::TypePatternCharacterGreedy: {
1347 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1348 unsigned matchAmount = 0;
1349 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1350 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1351 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1352 input.setPos(position);
1353 break;
1354 }
1355 ++matchAmount;
1356 position = input.getPos();
1357 }
1358 backTrack->matchAmount = matchAmount;
1359
1360 MATCH_NEXT();
1361 }
1362 case ByteTerm::TypePatternCharacterNonGreedy: {
1363 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1364 backTrack->begin = input.getPos();
1365 backTrack->matchAmount = 0;
1366 MATCH_NEXT();
1367 }
1368
1369 case ByteTerm::TypePatternCasedCharacterOnce:
1370 case ByteTerm::TypePatternCasedCharacterFixed: {
1371 if (unicode) {
1372 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1373 ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter));
1374
1375 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1376
1377 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1378 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) {
1379 input.setPos(position);
1380 BACKTRACK();
1381 }
1382 }
1383 MATCH_NEXT();
1384 }
1385
1386 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1387 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1388 BACKTRACK();
1389 }
1390 MATCH_NEXT();
1391 }
1392 case ByteTerm::TypePatternCasedCharacterGreedy: {
1393 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1394
1395 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1396 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1397
1398 unsigned matchAmount = 0;
1399 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1400 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1401 input.uncheckInput(1);
1402 break;
1403 }
1404 ++matchAmount;
1405 }
1406 backTrack->matchAmount = matchAmount;
1407
1408 MATCH_NEXT();
1409 }
1410 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1411 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1412
1413 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1414 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1415
1416 backTrack->matchAmount = 0;
1417 MATCH_NEXT();
1418 }
1419
1420 case ByteTerm::TypeCharacterClass:
1421 if (matchCharacterClass(currentTerm(), context))
1422 MATCH_NEXT();
1423 BACKTRACK();
1424 case ByteTerm::TypeBackReference:
1425 if (matchBackReference(currentTerm(), context))
1426 MATCH_NEXT();
1427 BACKTRACK();
1428 case ByteTerm::TypeParenthesesSubpattern: {
1429 JSRegExpResult result = matchParentheses(currentTerm(), context);
1430
1431 if (result == JSRegExpMatch) {
1432 MATCH_NEXT();
1433 } else if (result != JSRegExpNoMatch)
1434 return result;
1435
1436 BACKTRACK();
1437 }
1438 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1439 if (matchParenthesesOnceBegin(currentTerm(), context))
1440 MATCH_NEXT();
1441 BACKTRACK();
1442 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1443 if (matchParenthesesOnceEnd(currentTerm(), context))
1444 MATCH_NEXT();
1445 BACKTRACK();
1446 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1447 if (matchParenthesesTerminalBegin(currentTerm(), context))
1448 MATCH_NEXT();
1449 BACKTRACK();
1450 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1451 if (matchParenthesesTerminalEnd(currentTerm(), context))
1452 MATCH_NEXT();
1453 BACKTRACK();
1454 case ByteTerm::TypeParentheticalAssertionBegin:
1455 if (matchParentheticalAssertionBegin(currentTerm(), context))
1456 MATCH_NEXT();
1457 BACKTRACK();
1458 case ByteTerm::TypeParentheticalAssertionEnd:
1459 if (matchParentheticalAssertionEnd(currentTerm(), context))
1460 MATCH_NEXT();
1461 BACKTRACK();
1462
1463 case ByteTerm::TypeCheckInput:
1464 if (input.checkInput(currentTerm().checkInputCount))
1465 MATCH_NEXT();
1466 BACKTRACK();
1467
1468 case ByteTerm::TypeUncheckInput:
1469 input.uncheckInput(currentTerm().checkInputCount);
1470 MATCH_NEXT();
1471
1472 case ByteTerm::TypeDotStarEnclosure:
1473 if (matchDotStarEnclosure(currentTerm(), context))
1474 return JSRegExpMatch;
1475 BACKTRACK();
1476 }
1477
1478 // We should never fall-through to here.
1479 RELEASE_ASSERT_NOT_REACHED();
1480
1481 backtrack:
1482 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1483
1484 switch (currentTerm().type) {
1485 case ByteTerm::TypeSubpatternBegin:
1486 return JSRegExpNoMatch;
1487 case ByteTerm::TypeSubpatternEnd:
1488 RELEASE_ASSERT_NOT_REACHED();
1489
1490 case ByteTerm::TypeBodyAlternativeBegin:
1491 case ByteTerm::TypeBodyAlternativeDisjunction: {
1492 int offset = currentTerm().alternative.next;
1493 context->term += offset;
1494 if (offset > 0)
1495 MATCH_NEXT();
1496
1497 if (input.atEnd() || pattern->sticky())
1498 return JSRegExpNoMatch;
1499
1500 input.next();
1501
1502 context->matchBegin = input.getPos();
1503
1504 if (currentTerm().alternative.onceThrough)
1505 context->term += currentTerm().alternative.next;
1506
1507 MATCH_NEXT();
1508 }
1509 case ByteTerm::TypeBodyAlternativeEnd:
1510 RELEASE_ASSERT_NOT_REACHED();
1511
1512 case ByteTerm::TypeAlternativeBegin:
1513 case ByteTerm::TypeAlternativeDisjunction: {
1514 int offset = currentTerm().alternative.next;
1515 context->term += offset;
1516 if (offset > 0)
1517 MATCH_NEXT();
1518 BACKTRACK();
1519 }
1520 case ByteTerm::TypeAlternativeEnd: {
1521 // We should never backtrack back into an alternative of the main body of the regex.
1522 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1523 unsigned offset = backTrack->offset;
1524 context->term -= offset;
1525 BACKTRACK();
1526 }
1527
1528 case ByteTerm::TypeAssertionBOL:
1529 case ByteTerm::TypeAssertionEOL:
1530 case ByteTerm::TypeAssertionWordBoundary:
1531 BACKTRACK();
1532
1533 case ByteTerm::TypePatternCharacterOnce:
1534 case ByteTerm::TypePatternCharacterFixed:
1535 case ByteTerm::TypePatternCharacterGreedy:
1536 case ByteTerm::TypePatternCharacterNonGreedy:
1537 if (backtrackPatternCharacter(currentTerm(), context))
1538 MATCH_NEXT();
1539 BACKTRACK();
1540 case ByteTerm::TypePatternCasedCharacterOnce:
1541 case ByteTerm::TypePatternCasedCharacterFixed:
1542 case ByteTerm::TypePatternCasedCharacterGreedy:
1543 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1544 if (backtrackPatternCasedCharacter(currentTerm(), context))
1545 MATCH_NEXT();
1546 BACKTRACK();
1547 case ByteTerm::TypeCharacterClass:
1548 if (backtrackCharacterClass(currentTerm(), context))
1549 MATCH_NEXT();
1550 BACKTRACK();
1551 case ByteTerm::TypeBackReference:
1552 if (backtrackBackReference(currentTerm(), context))
1553 MATCH_NEXT();
1554 BACKTRACK();
1555 case ByteTerm::TypeParenthesesSubpattern: {
1556 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1557
1558 if (result == JSRegExpMatch) {
1559 MATCH_NEXT();
1560 } else if (result != JSRegExpNoMatch)
1561 return result;
1562
1563 BACKTRACK();
1564 }
1565 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1566 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1567 MATCH_NEXT();
1568 BACKTRACK();
1569 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1570 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1571 MATCH_NEXT();
1572 BACKTRACK();
1573 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1574 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1575 MATCH_NEXT();
1576 BACKTRACK();
1577 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1578 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1579 MATCH_NEXT();
1580 BACKTRACK();
1581 case ByteTerm::TypeParentheticalAssertionBegin:
1582 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1583 MATCH_NEXT();
1584 BACKTRACK();
1585 case ByteTerm::TypeParentheticalAssertionEnd:
1586 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1587 MATCH_NEXT();
1588 BACKTRACK();
1589
1590 case ByteTerm::TypeCheckInput:
1591 input.uncheckInput(currentTerm().checkInputCount);
1592 BACKTRACK();
1593
1594 case ByteTerm::TypeUncheckInput:
1595 input.checkInput(currentTerm().checkInputCount);
1596 BACKTRACK();
1597
1598 case ByteTerm::TypeDotStarEnclosure:
1599 RELEASE_ASSERT_NOT_REACHED();
1600 }
1601
1602 RELEASE_ASSERT_NOT_REACHED();
1603 return JSRegExpErrorNoMatch;
1604 }
1605
1606 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1607 {
1608 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1609
1610 if (result == JSRegExpMatch) {
1611 while (context->matchBegin == context->matchEnd) {
1612 result = matchDisjunction(disjunction, context, true);
1613 if (result != JSRegExpMatch)
1614 return result;
1615 }
1616 return JSRegExpMatch;
1617 }
1618
1619 return result;
1620 }
1621
1622 unsigned interpret()
1623 {
1624 // FIXME: https://bugs.webkit.org/show_bug.cgi?id=195970
1625 // [Yarr Interpreter] The interpreter doesn't have checks for stack overflow due to deep recursion
1626 if (!input.isAvailableInput(0))
1627 return offsetNoMatch;
1628
1629 if (pattern->m_lock)
1630 pattern->m_lock->lock();
1631
1632 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1633 output[i << 1] = offsetNoMatch;
1634
1635 allocatorPool = pattern->m_allocator->startAllocator();
1636 RELEASE_ASSERT(allocatorPool);
1637
1638 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1639
1640 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1641 if (result == JSRegExpMatch) {
1642 output[0] = context->matchBegin;
1643 output[1] = context->matchEnd;
1644 }
1645
1646 freeDisjunctionContext(context);
1647
1648 pattern->m_allocator->stopAllocator();
1649
1650 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1651
1652 if (pattern->m_lock)
1653 pattern->m_lock->unlock();
1654
1655 return output[0];
1656 }
1657
1658 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1659 : pattern(pattern)
1660 , unicode(pattern->unicode())
1661 , output(output)
1662 , input(input, start, length, pattern->unicode())
1663 , startOffset(start)
1664 , remainingMatchCount(matchLimit)
1665 {
1666 }
1667
1668private:
1669 BytecodePattern* pattern;
1670 bool unicode;
1671 unsigned* output;
1672 InputStream input;
1673 WTF::BumpPointerPool* allocatorPool { nullptr };
1674 unsigned startOffset;
1675 unsigned remainingMatchCount;
1676};
1677
1678class ByteCompiler {
1679 struct ParenthesesStackEntry {
1680 unsigned beginTerm;
1681 unsigned savedAlternativeIndex;
1682 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1683 : beginTerm(beginTerm)
1684 , savedAlternativeIndex(savedAlternativeIndex)
1685 {
1686 }
1687 };
1688
1689public:
1690 ByteCompiler(YarrPattern& pattern)
1691 : m_pattern(pattern)
1692 {
1693 m_currentAlternativeIndex = 0;
1694 }
1695
1696 std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
1697 {
1698 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1699 emitDisjunction(m_pattern.m_body);
1700 regexEnd();
1701
1702#ifndef NDEBUG
1703 if (Options::dumpCompiledRegExpPatterns())
1704 dumpDisjunction(m_bodyDisjunction.get());
1705#endif
1706
1707 return std::make_unique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock);
1708 }
1709
1710 void checkInput(unsigned count)
1711 {
1712 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1713 }
1714
1715 void uncheckInput(unsigned count)
1716 {
1717 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1718 }
1719
1720 void assertionBOL(unsigned inputPosition)
1721 {
1722 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1723 }
1724
1725 void assertionEOL(unsigned inputPosition)
1726 {
1727 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1728 }
1729
1730 void assertionWordBoundary(bool invert, unsigned inputPosition)
1731 {
1732 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1733 }
1734
1735 void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1736 {
1737 if (m_pattern.ignoreCase()) {
1738 UChar32 lo = u_tolower(ch);
1739 UChar32 hi = u_toupper(ch);
1740
1741 if (lo != hi) {
1742 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType));
1743 return;
1744 }
1745 }
1746
1747 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType));
1748 }
1749
1750 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1751 {
1752 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1753
1754 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1755 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1756 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1757 }
1758
1759 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1760 {
1761 ASSERT(subpatternId);
1762
1763 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1764
1765 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1766 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1767 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1768 }
1769
1770 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1771 {
1772 int beginTerm = m_bodyDisjunction->terms.size();
1773
1774 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1775 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1776 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1777 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1778
1779 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1780 m_currentAlternativeIndex = beginTerm + 1;
1781 }
1782
1783 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1784 {
1785 int beginTerm = m_bodyDisjunction->terms.size();
1786
1787 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1788 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1789 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1790 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1791
1792 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1793 m_currentAlternativeIndex = beginTerm + 1;
1794 }
1795
1796 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1797 {
1798 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1799 // then fix this up at the end! - simplifying this should make it much clearer.
1800 // https://bugs.webkit.org/show_bug.cgi?id=50136
1801
1802 int beginTerm = m_bodyDisjunction->terms.size();
1803
1804 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1805 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1806 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1807 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1808
1809 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1810 m_currentAlternativeIndex = beginTerm + 1;
1811 }
1812
1813 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1814 {
1815 int beginTerm = m_bodyDisjunction->terms.size();
1816
1817 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1818 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1819 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1820 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1821
1822 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1823 m_currentAlternativeIndex = beginTerm + 1;
1824 }
1825
1826 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1827 {
1828 unsigned beginTerm = popParenthesesStack();
1829 closeAlternative(beginTerm + 1);
1830 unsigned endTerm = m_bodyDisjunction->terms.size();
1831
1832 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1833
1834 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1835 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1836
1837 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1838 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1839 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1840 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1841
1842 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1843 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1844 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1845 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1846 }
1847
1848 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1849 {
1850 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1851 }
1852
1853 unsigned popParenthesesStack()
1854 {
1855 ASSERT(m_parenthesesStack.size());
1856 int stackEnd = m_parenthesesStack.size() - 1;
1857 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1858 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1859 m_parenthesesStack.shrink(stackEnd);
1860
1861 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1862 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1863
1864 return beginTerm;
1865 }
1866
1867 void closeAlternative(int beginTerm)
1868 {
1869 int origBeginTerm = beginTerm;
1870 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1871 int endIndex = m_bodyDisjunction->terms.size();
1872
1873 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1874
1875 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1876 m_bodyDisjunction->terms.remove(beginTerm);
1877 else {
1878 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1879 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1880 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1881 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1882 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1883 }
1884
1885 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1886
1887 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1888 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1889 }
1890 }
1891
1892 void closeBodyAlternative()
1893 {
1894 int beginTerm = 0;
1895 int origBeginTerm = 0;
1896 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1897 int endIndex = m_bodyDisjunction->terms.size();
1898
1899 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1900
1901 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1902 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1903 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1904 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1905 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1906 }
1907
1908 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1909
1910 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1911 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1912 }
1913
1914 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1915 {
1916 unsigned beginTerm = popParenthesesStack();
1917 closeAlternative(beginTerm + 1);
1918 unsigned endTerm = m_bodyDisjunction->terms.size();
1919
1920 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1921
1922 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1923
1924 bool capture = parenthesesBegin.capture();
1925 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1926
1927 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1928 auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
1929
1930 unsigned firstTermInParentheses = beginTerm + 1;
1931 parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
1932
1933 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1934 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
1935 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1936 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1937
1938 m_bodyDisjunction->terms.shrink(beginTerm);
1939
1940 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
1941 m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction));
1942
1943 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1944 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1945 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1946 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1947 }
1948
1949 void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1950 {
1951 unsigned beginTerm = popParenthesesStack();
1952 closeAlternative(beginTerm + 1);
1953 unsigned endTerm = m_bodyDisjunction->terms.size();
1954
1955 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1956
1957 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1958 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1959
1960 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1961 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1962 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1963 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1964
1965 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1966 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1967 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1968 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1969 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1970 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1971 }
1972
1973 void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1974 {
1975 unsigned beginTerm = popParenthesesStack();
1976 closeAlternative(beginTerm + 1);
1977 unsigned endTerm = m_bodyDisjunction->terms.size();
1978
1979 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1980
1981 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1982 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1983
1984 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1985 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1986 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1987 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1988
1989 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1990 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1991 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1992 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1993 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1994 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1995 }
1996
1997 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1998 {
1999 m_bodyDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
2000 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
2001 m_bodyDisjunction->terms[0].frameLocation = 0;
2002 m_currentAlternativeIndex = 0;
2003 }
2004
2005 void regexEnd()
2006 {
2007 closeBodyAlternative();
2008 }
2009
2010 void alternativeBodyDisjunction(bool onceThrough)
2011 {
2012 int newAlternativeIndex = m_bodyDisjunction->terms.size();
2013 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
2014 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
2015
2016 m_currentAlternativeIndex = newAlternativeIndex;
2017 }
2018
2019 void alternativeDisjunction()
2020 {
2021 int newAlternativeIndex = m_bodyDisjunction->terms.size();
2022 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
2023 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
2024
2025 m_currentAlternativeIndex = newAlternativeIndex;
2026 }
2027
2028 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
2029 {
2030 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
2031 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
2032
2033 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
2034
2035 if (alt) {
2036 if (disjunction == m_pattern.m_body)
2037 alternativeBodyDisjunction(alternative->onceThrough());
2038 else
2039 alternativeDisjunction();
2040 }
2041
2042 unsigned minimumSize = alternative->m_minimumSize;
2043 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
2044 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
2045
2046 if (countToCheck) {
2047 checkInput(countToCheck);
2048 currentCountAlreadyChecked += countToCheck;
2049 }
2050
2051 for (auto& term : alternative->m_terms) {
2052 switch (term.type) {
2053 case PatternTerm::TypeAssertionBOL:
2054 assertionBOL(currentCountAlreadyChecked - term.inputPosition);
2055 break;
2056
2057 case PatternTerm::TypeAssertionEOL:
2058 assertionEOL(currentCountAlreadyChecked - term.inputPosition);
2059 break;
2060
2061 case PatternTerm::TypeAssertionWordBoundary:
2062 assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
2063 break;
2064
2065 case PatternTerm::TypePatternCharacter:
2066 atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType);
2067 break;
2068
2069 case PatternTerm::TypeCharacterClass:
2070 atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType);
2071 break;
2072
2073 case PatternTerm::TypeBackReference:
2074 atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType);
2075 break;
2076
2077 case PatternTerm::TypeForwardReference:
2078 break;
2079
2080 case PatternTerm::TypeParenthesesSubpattern: {
2081 unsigned disjunctionAlreadyCheckedCount = 0;
2082 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
2083 unsigned alternativeFrameLocation = term.frameLocation;
2084 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
2085 if (term.quantityType == QuantifierFixedCount)
2086 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
2087 else
2088 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
2089 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2090 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2091 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
2092 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
2093 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2094 } else if (term.parentheses.isTerminal) {
2095 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2096 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2097 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal);
2098 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
2099 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2100 } else {
2101 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2102 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2103 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0);
2104 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
2105 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
2106 }
2107 break;
2108 }
2109
2110 case PatternTerm::TypeParentheticalAssertion: {
2111 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
2112
2113 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2114 unsigned positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
2115 unsigned uncheckAmount = 0;
2116 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
2117 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
2118 uncheckInput(uncheckAmount);
2119 currentCountAlreadyChecked -= uncheckAmount;
2120 }
2121
2122 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
2123 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
2124 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType);
2125 if (uncheckAmount) {
2126 checkInput(uncheckAmount);
2127 currentCountAlreadyChecked += uncheckAmount;
2128 }
2129 break;
2130 }
2131
2132 case PatternTerm::TypeDotStarEnclosure:
2133 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
2134 break;
2135 }
2136 }
2137 }
2138 }
2139#ifndef NDEBUG
2140 void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0)
2141 {
2142 PrintStream& out = WTF::dataFile();
2143
2144 unsigned termIndexNest = 0;
2145
2146 if (!nesting) {
2147 out.printf("ByteDisjunction(%p):\n", disjunction);
2148 nesting = 1;
2149 } else {
2150 termIndexNest = nesting - 1;
2151 nesting = 2;
2152 }
2153
2154 auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) {
2155 for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++)
2156 out.print(" ");
2157 out.printf("%4zu", index);
2158 for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++)
2159 out.print(" ");
2160 };
2161
2162 auto dumpQuantity = [&](ByteTerm& term) {
2163 if (term.atom.quantityType == QuantifierFixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1)
2164 return;
2165
2166 out.print(" {", term.atom.quantityMinCount);
2167 if (term.atom.quantityMinCount != term.atom.quantityMaxCount) {
2168 if (term.atom.quantityMaxCount == UINT_MAX)
2169 out.print(",inf");
2170 else
2171 out.print(",", term.atom.quantityMaxCount);
2172 }
2173 out.print("}");
2174 if (term.atom.quantityType == QuantifierGreedy)
2175 out.print(" greedy");
2176 else if (term.atom.quantityType == QuantifierNonGreedy)
2177 out.print(" non-greedy");
2178 };
2179
2180 auto dumpCaptured = [&](ByteTerm& term) {
2181 if (term.capture())
2182 out.print(" captured (#", term.atom.subpatternId, ")");
2183 };
2184
2185 auto dumpInverted = [&](ByteTerm& term) {
2186 if (term.invert())
2187 out.print(" inverted");
2188 };
2189
2190 auto dumpInputPosition = [&](ByteTerm& term) {
2191 out.printf(" inputPosition %u", term.inputPosition);
2192 };
2193
2194 auto dumpFrameLocation = [&](ByteTerm& term) {
2195 out.printf(" frameLocation %u", term.frameLocation);
2196 };
2197
2198 auto dumpCharacter = [&](ByteTerm& term) {
2199 out.print(" ");
2200 dumpUChar32(out, term.atom.patternCharacter);
2201 };
2202
2203 auto dumpCharClass = [&](ByteTerm& term) {
2204 out.print(" ");
2205 dumpCharacterClass(out, &m_pattern, term.atom.characterClass);
2206 };
2207
2208 for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) {
2209 ByteTerm term = disjunction->terms[idx];
2210
2211 bool outputNewline = true;
2212
2213 switch (term.type) {
2214 case ByteTerm::TypeBodyAlternativeBegin:
2215 outputTermIndexAndNest(idx, nesting++);
2216 out.print("BodyAlternativeBegin");
2217 if (term.alternative.onceThrough)
2218 out.print(" onceThrough");
2219 dumpFrameLocation(term);
2220 break;
2221 case ByteTerm::TypeBodyAlternativeDisjunction:
2222 outputTermIndexAndNest(idx, nesting - 1);
2223 out.print("BodyAlternativeDisjunction");
2224 dumpFrameLocation(term);
2225 break;
2226 case ByteTerm::TypeBodyAlternativeEnd:
2227 outputTermIndexAndNest(idx, --nesting);
2228 out.print("BodyAlternativeEnd");
2229 dumpFrameLocation(term);
2230 break;
2231 case ByteTerm::TypeAlternativeBegin:
2232 outputTermIndexAndNest(idx, nesting++);
2233 out.print("AlternativeBegin");
2234 dumpFrameLocation(term);
2235 break;
2236 case ByteTerm::TypeAlternativeDisjunction:
2237 outputTermIndexAndNest(idx, nesting - 1);
2238 out.print("AlternativeDisjunction");
2239 dumpFrameLocation(term);
2240 break;
2241 case ByteTerm::TypeAlternativeEnd:
2242 outputTermIndexAndNest(idx, --nesting);
2243 out.print("AlternativeEnd");
2244 dumpFrameLocation(term);
2245 break;
2246 case ByteTerm::TypeSubpatternBegin:
2247 outputTermIndexAndNest(idx, nesting++);
2248 out.print("SubpatternBegin");
2249 break;
2250 case ByteTerm::TypeSubpatternEnd:
2251 outputTermIndexAndNest(idx, --nesting);
2252 out.print("SubpatternEnd");
2253 break;
2254 case ByteTerm::TypeAssertionBOL:
2255 outputTermIndexAndNest(idx, nesting);
2256 out.print("AssertionBOL");
2257 break;
2258 case ByteTerm::TypeAssertionEOL:
2259 outputTermIndexAndNest(idx, nesting);
2260 out.print("AssertionEOL");
2261 break;
2262 case ByteTerm::TypeAssertionWordBoundary:
2263 outputTermIndexAndNest(idx, nesting);
2264 out.print("AssertionWordBoundary");
2265 break;
2266 case ByteTerm::TypePatternCharacterOnce:
2267 outputTermIndexAndNest(idx, nesting);
2268 out.print("PatternCharacterOnce");
2269 dumpInverted(term);
2270 dumpInputPosition(term);
2271 dumpFrameLocation(term);
2272 dumpCharacter(term);
2273 dumpQuantity(term);
2274 break;
2275 case ByteTerm::TypePatternCharacterFixed:
2276 outputTermIndexAndNest(idx, nesting);
2277 out.print("PatternCharacterFixed");
2278 dumpInverted(term);
2279 dumpInputPosition(term);
2280 dumpFrameLocation(term);
2281 dumpCharacter(term);
2282 out.print(" {", term.atom.quantityMinCount, "}");
2283 break;
2284 case ByteTerm::TypePatternCharacterGreedy:
2285 outputTermIndexAndNest(idx, nesting);
2286 out.print("PatternCharacterGreedy");
2287 dumpInverted(term);
2288 dumpInputPosition(term);
2289 dumpFrameLocation(term);
2290 dumpCharacter(term);
2291 dumpQuantity(term);
2292 break;
2293 case ByteTerm::TypePatternCharacterNonGreedy:
2294 outputTermIndexAndNest(idx, nesting);
2295 out.print("PatternCharacterNonGreedy");
2296 dumpInverted(term);
2297 dumpInputPosition(term);
2298 dumpFrameLocation(term);
2299 dumpCharacter(term);
2300 dumpQuantity(term);
2301 break;
2302 case ByteTerm::TypePatternCasedCharacterOnce:
2303 outputTermIndexAndNest(idx, nesting);
2304 out.print("PatternCasedCharacterOnce");
2305 break;
2306 case ByteTerm::TypePatternCasedCharacterFixed:
2307 outputTermIndexAndNest(idx, nesting);
2308 out.print("PatternCasedCharacterFixed");
2309 break;
2310 case ByteTerm::TypePatternCasedCharacterGreedy:
2311 outputTermIndexAndNest(idx, nesting);
2312 out.print("PatternCasedCharacterGreedy");
2313 break;
2314 case ByteTerm::TypePatternCasedCharacterNonGreedy:
2315 outputTermIndexAndNest(idx, nesting);
2316 out.print("PatternCasedCharacterNonGreedy");
2317 break;
2318 case ByteTerm::TypeCharacterClass:
2319 outputTermIndexAndNest(idx, nesting);
2320 out.print("CharacterClass");
2321 dumpInverted(term);
2322 dumpInputPosition(term);
2323 dumpFrameLocation(term);
2324 dumpCharClass(term);
2325 dumpQuantity(term);
2326 break;
2327 case ByteTerm::TypeBackReference:
2328 outputTermIndexAndNest(idx, nesting);
2329 out.print("BackReference #", term.atom.subpatternId);
2330 dumpQuantity(term);
2331 break;
2332 case ByteTerm::TypeParenthesesSubpattern:
2333 outputTermIndexAndNest(idx, nesting);
2334 out.print("ParenthesesSubpattern");
2335 dumpCaptured(term);
2336 dumpInverted(term);
2337 dumpInputPosition(term);
2338 dumpFrameLocation(term);
2339 dumpQuantity(term);
2340 out.print("\n");
2341 outputNewline = false;
2342 dumpDisjunction(term.atom.parenthesesDisjunction, nesting);
2343 break;
2344 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
2345 outputTermIndexAndNest(idx, nesting++);
2346 out.print("ParenthesesSubpatternOnceBegin");
2347 dumpCaptured(term);
2348 dumpInverted(term);
2349 dumpInputPosition(term);
2350 dumpFrameLocation(term);
2351 break;
2352 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
2353 outputTermIndexAndNest(idx, --nesting);
2354 out.print("ParenthesesSubpatternOnceEnd");
2355 dumpFrameLocation(term);
2356 break;
2357 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
2358 outputTermIndexAndNest(idx, nesting++);
2359 out.print("ParenthesesSubpatternTerminalBegin");
2360 dumpInverted(term);
2361 dumpInputPosition(term);
2362 dumpFrameLocation(term);
2363 break;
2364 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
2365 outputTermIndexAndNest(idx, --nesting);
2366 out.print("ParenthesesSubpatternTerminalEnd");
2367 dumpFrameLocation(term);
2368 break;
2369 case ByteTerm::TypeParentheticalAssertionBegin:
2370 outputTermIndexAndNest(idx, nesting++);
2371 out.print("ParentheticalAssertionBegin");
2372 dumpInverted(term);
2373 dumpInputPosition(term);
2374 dumpFrameLocation(term);
2375 break;
2376 case ByteTerm::TypeParentheticalAssertionEnd:
2377 outputTermIndexAndNest(idx, --nesting);
2378 out.print("ParentheticalAssertionEnd");
2379 dumpFrameLocation(term);
2380 break;
2381 case ByteTerm::TypeCheckInput:
2382 outputTermIndexAndNest(idx, nesting);
2383 out.print("CheckInput ", term.checkInputCount);
2384 break;
2385 case ByteTerm::TypeUncheckInput:
2386 outputTermIndexAndNest(idx, nesting);
2387 out.print("UncheckInput ", term.checkInputCount);
2388 break;
2389 case ByteTerm::TypeDotStarEnclosure:
2390 outputTermIndexAndNest(idx, nesting);
2391 out.print("DotStarEnclosure");
2392 break;
2393 }
2394 if (outputNewline)
2395 out.print("\n");
2396 }
2397 }
2398#endif
2399
2400private:
2401 YarrPattern& m_pattern;
2402 std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
2403 unsigned m_currentAlternativeIndex;
2404 Vector<ParenthesesStackEntry> m_parenthesesStack;
2405 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
2406};
2407
2408std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
2409{
2410 return ByteCompiler(pattern).compile(allocator, lock);
2411}
2412
2413unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
2414{
2415 SuperSamplerScope superSamplerScope(false);
2416 if (input.is8Bit())
2417 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
2418 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
2419}
2420
2421unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
2422{
2423 SuperSamplerScope superSamplerScope(false);
2424 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
2425}
2426
2427unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
2428{
2429 SuperSamplerScope superSamplerScope(false);
2430 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
2431}
2432
2433// These should be the same for both UChar & LChar.
2434COMPILE_ASSERT(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
2435COMPILE_ASSERT(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
2436COMPILE_ASSERT(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
2437COMPILE_ASSERT(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
2438COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
2439COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
2440COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
2441
2442
2443} }
2444