1/*
2 * Copyright (C) 2014-2015 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#pragma once
27
28#if ENABLE(CONTENT_EXTENSIONS)
29
30#include "ContentExtensionsDebugging.h"
31#include "NFA.h"
32#include <unicode/utypes.h>
33#include <wtf/ASCIICType.h>
34#include <wtf/HashMap.h>
35#include <wtf/Vector.h>
36#include <wtf/text/StringBuilder.h>
37#include <wtf/text/WTFString.h>
38
39namespace WebCore {
40
41namespace ContentExtensions {
42
43enum class AtomQuantifier : uint8_t {
44 One,
45 ZeroOrOne,
46 ZeroOrMore,
47 OneOrMore
48};
49
50class Term {
51public:
52 Term();
53 Term(char character, bool isCaseSensitive);
54
55 enum UniversalTransitionTag { UniversalTransition };
56 explicit Term(UniversalTransitionTag);
57
58 enum CharacterSetTermTag { CharacterSetTerm };
59 explicit Term(CharacterSetTermTag, bool isInverted);
60
61 enum GroupTermTag { GroupTerm };
62 explicit Term(GroupTermTag);
63
64 enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm };
65 explicit Term(EndOfLineAssertionTermTag);
66
67 Term(const Term& other);
68 Term(Term&& other);
69
70 enum EmptyValueTag { EmptyValue };
71 Term(EmptyValueTag);
72
73 ~Term();
74
75 bool isValid() const;
76
77 // CharacterSet terms only.
78 void addCharacter(UChar character, bool isCaseSensitive);
79
80 // Group terms only.
81 void extendGroupSubpattern(const Term&);
82
83 void quantify(const AtomQuantifier&);
84
85 ImmutableCharNFANodeBuilder generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const;
86 void generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
87
88 bool isEndOfLineAssertion() const;
89
90 bool matchesAtLeastOneCharacter() const;
91
92 // Matches any string, the empty string included.
93 // This is very conservative. Patterns matching any string can return false here.
94 bool isKnownToMatchAnyString() const;
95
96 // Return true if the term can only match input of a single fixed length.
97 bool hasFixedLength() const;
98
99 Term& operator=(const Term& other);
100 Term& operator=(Term&& other);
101
102 bool operator==(const Term& other) const;
103
104 unsigned hash() const;
105
106 bool isEmptyValue() const;
107
108#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
109 String toString() const;
110#endif
111
112#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
113 size_t memoryUsed() const;
114#endif
115
116private:
117 // This is exact for character sets but conservative for groups.
118 // The return value can be false for a group equivalent to a universal transition.
119 bool isUniversalTransition() const;
120
121 void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const;
122 void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
123
124 void destroy();
125
126 enum class TermType : uint8_t {
127 Empty,
128 CharacterSet,
129 Group
130 };
131
132 TermType m_termType { TermType::Empty };
133 AtomQuantifier m_quantifier { AtomQuantifier::One };
134
135 class CharacterSet {
136 public:
137 void set(UChar character)
138 {
139 RELEASE_ASSERT(character < 128);
140 m_characters[character / 64] |= (uint64_t(1) << (character % 64));
141 }
142
143 bool get(UChar character) const
144 {
145 RELEASE_ASSERT(character < 128);
146 return m_characters[character / 64] & (uint64_t(1) << (character % 64));
147 }
148
149 void invert()
150 {
151 ASSERT(!m_inverted);
152 m_inverted = true;
153 }
154
155 bool inverted() const { return m_inverted; }
156
157 unsigned bitCount() const
158 {
159 return WTF::bitCount(m_characters[0]) + WTF::bitCount(m_characters[1]);
160 }
161
162 bool operator==(const CharacterSet& other) const
163 {
164 return other.m_inverted == m_inverted
165 && other.m_characters[0] == m_characters[0]
166 && other.m_characters[1] == m_characters[1];
167 }
168
169 unsigned hash() const
170 {
171 return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[0]), WTF::intHash(m_characters[1])), m_inverted);
172 }
173 private:
174 bool m_inverted { false };
175 uint64_t m_characters[2] { 0, 0 };
176 };
177
178 struct Group {
179 Vector<Term> terms;
180
181 bool operator==(const Group& other) const
182 {
183 return other.terms == terms;
184 }
185
186 unsigned hash() const
187 {
188 unsigned hash = 6421749;
189 for (const Term& term : terms) {
190 unsigned termHash = term.hash();
191 hash = (hash << 16) ^ ((termHash << 11) ^ hash);
192 hash += hash >> 11;
193 }
194 return hash;
195 }
196 };
197
198 union AtomData {
199 AtomData()
200 : invalidTerm(0)
201 {
202 }
203 ~AtomData()
204 {
205 }
206
207 char invalidTerm;
208 CharacterSet characterSet;
209 Group group;
210 } m_atomData;
211};
212
213#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
214inline String quantifierToString(AtomQuantifier quantifier)
215{
216 switch (quantifier) {
217 case AtomQuantifier::One:
218 return "";
219 case AtomQuantifier::ZeroOrOne:
220 return "?";
221 case AtomQuantifier::ZeroOrMore:
222 return "*";
223 case AtomQuantifier::OneOrMore:
224 return "+";
225 }
226}
227
228inline String Term::toString() const
229{
230 switch (m_termType) {
231 case TermType::Empty:
232 return "(Empty)";
233 case TermType::CharacterSet: {
234 StringBuilder builder;
235 builder.append('[');
236 for (UChar c = 0; c < 128; c++) {
237 if (m_atomData.characterSet.get(c)) {
238 if (isASCIIPrintable(c) && !isASCIISpace(c))
239 builder.append(c);
240 else {
241 builder.append('\\');
242 builder.append('u');
243 builder.appendNumber(c);
244 }
245 }
246 }
247 builder.append(']');
248 builder.append(quantifierToString(m_quantifier));
249 return builder.toString();
250 }
251 case TermType::Group: {
252 StringBuilder builder;
253 builder.append('(');
254 for (const Term& term : m_atomData.group.terms)
255 builder.append(term.toString());
256 builder.append(')');
257 builder.append(quantifierToString(m_quantifier));
258 return builder.toString();
259 }
260 }
261}
262#endif
263
264inline Term::Term()
265{
266}
267
268inline Term::Term(char character, bool isCaseSensitive)
269 : m_termType(TermType::CharacterSet)
270{
271 new (NotNull, &m_atomData.characterSet) CharacterSet();
272 addCharacter(character, isCaseSensitive);
273}
274
275inline Term::Term(UniversalTransitionTag)
276 : m_termType(TermType::CharacterSet)
277{
278 new (NotNull, &m_atomData.characterSet) CharacterSet();
279 for (UChar i = 1; i < 128; ++i)
280 m_atomData.characterSet.set(i);
281}
282
283inline Term::Term(CharacterSetTermTag, bool isInverted)
284 : m_termType(TermType::CharacterSet)
285{
286 new (NotNull, &m_atomData.characterSet) CharacterSet();
287 if (isInverted)
288 m_atomData.characterSet.invert();
289}
290
291inline Term::Term(GroupTermTag)
292 : m_termType(TermType::Group)
293{
294 new (NotNull, &m_atomData.group) Group();
295}
296
297inline Term::Term(EndOfLineAssertionTermTag)
298 : Term(CharacterSetTerm, false)
299{
300 m_atomData.characterSet.set(0);
301}
302
303inline Term::Term(const Term& other)
304 : m_termType(other.m_termType)
305 , m_quantifier(other.m_quantifier)
306{
307 switch (m_termType) {
308 case TermType::Empty:
309 break;
310 case TermType::CharacterSet:
311 new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
312 break;
313 case TermType::Group:
314 new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
315 break;
316 }
317}
318
319inline Term::Term(Term&& other)
320 : m_termType(WTFMove(other.m_termType))
321 , m_quantifier(WTFMove(other.m_quantifier))
322{
323 switch (m_termType) {
324 case TermType::Empty:
325 break;
326 case TermType::CharacterSet:
327 new (NotNull, &m_atomData.characterSet) CharacterSet(WTFMove(other.m_atomData.characterSet));
328 break;
329 case TermType::Group:
330 new (NotNull, &m_atomData.group) Group(WTFMove(other.m_atomData.group));
331 break;
332 }
333 other.destroy();
334}
335
336inline Term::Term(EmptyValueTag)
337 : m_termType(TermType::Empty)
338{
339}
340
341inline Term::~Term()
342{
343 destroy();
344}
345
346inline bool Term::isValid() const
347{
348 return m_termType != TermType::Empty;
349}
350
351inline void Term::addCharacter(UChar character, bool isCaseSensitive)
352{
353 ASSERT(isASCII(character));
354
355 ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
356 if (m_termType != TermType::CharacterSet)
357 return;
358
359 if (isCaseSensitive || !isASCIIAlpha(character))
360 m_atomData.characterSet.set(character);
361 else {
362 m_atomData.characterSet.set(toASCIIUpper(character));
363 m_atomData.characterSet.set(toASCIILower(character));
364 }
365}
366
367inline void Term::extendGroupSubpattern(const Term& term)
368{
369 ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
370 if (m_termType != TermType::Group)
371 return;
372 m_atomData.group.terms.append(term);
373}
374
375inline void Term::quantify(const AtomQuantifier& quantifier)
376{
377 ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
378 m_quantifier = quantifier;
379}
380
381inline ImmutableCharNFANodeBuilder Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const
382{
383 ImmutableCharNFANodeBuilder newEnd(nfa);
384 generateGraph(nfa, source, newEnd.nodeId());
385 newEnd.setActions(finalActions.begin(), finalActions.end());
386 return newEnd;
387}
388
389inline void Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
390{
391 ASSERT(isValid());
392
393 switch (m_quantifier) {
394 case AtomQuantifier::One: {
395 generateSubgraphForAtom(nfa, source, destination);
396 break;
397 }
398 case AtomQuantifier::ZeroOrOne: {
399 generateSubgraphForAtom(nfa, source, destination);
400 source.addEpsilonTransition(destination);
401 break;
402 }
403 case AtomQuantifier::ZeroOrMore: {
404 ImmutableCharNFANodeBuilder repeatStart(nfa);
405 source.addEpsilonTransition(repeatStart);
406
407 ImmutableCharNFANodeBuilder repeatEnd(nfa);
408 generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
409 repeatEnd.addEpsilonTransition(repeatStart);
410
411 repeatEnd.addEpsilonTransition(destination);
412 source.addEpsilonTransition(destination);
413 break;
414 }
415 case AtomQuantifier::OneOrMore: {
416 ImmutableCharNFANodeBuilder repeatStart(nfa);
417 source.addEpsilonTransition(repeatStart);
418
419 ImmutableCharNFANodeBuilder repeatEnd(nfa);
420 generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
421 repeatEnd.addEpsilonTransition(repeatStart);
422
423 repeatEnd.addEpsilonTransition(destination);
424 break;
425 }
426 }
427}
428
429inline bool Term::isEndOfLineAssertion() const
430{
431 return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == 1 && m_atomData.characterSet.get(0);
432}
433
434inline bool Term::matchesAtLeastOneCharacter() const
435{
436 ASSERT(isValid());
437
438 if (m_quantifier == AtomQuantifier::ZeroOrOne || m_quantifier == AtomQuantifier::ZeroOrMore)
439 return false;
440 if (isEndOfLineAssertion())
441 return false;
442
443 if (m_termType == TermType::Group) {
444 for (const Term& term : m_atomData.group.terms) {
445 if (term.matchesAtLeastOneCharacter())
446 return true;
447 }
448 return false;
449 }
450 return true;
451}
452
453inline bool Term::isKnownToMatchAnyString() const
454{
455 ASSERT(isValid());
456
457 switch (m_termType) {
458 case TermType::Empty:
459 ASSERT_NOT_REACHED();
460 break;
461 case TermType::CharacterSet:
462 // ".*" is the only simple term matching any string.
463 return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore;
464 break;
465 case TermType::Group: {
466 // There are infinitely many ways to match anything with groups, we just handle simple cases
467 if (m_atomData.group.terms.size() != 1)
468 return false;
469
470 const Term& firstTermInGroup = m_atomData.group.terms.first();
471 // -(.*) with any quantifier.
472 if (firstTermInGroup.isKnownToMatchAnyString())
473 return true;
474
475 if (firstTermInGroup.isUniversalTransition()) {
476 // -(.)*, (.+)*, (.?)* etc.
477 if (m_quantifier == AtomQuantifier::ZeroOrMore)
478 return true;
479
480 // -(.+)?.
481 if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore)
482 return true;
483
484 // -(.?)+.
485 if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne)
486 return true;
487 }
488 break;
489 }
490 }
491 return false;
492}
493
494inline bool Term::hasFixedLength() const
495{
496 ASSERT(isValid());
497
498 switch (m_termType) {
499 case TermType::Empty:
500 ASSERT_NOT_REACHED();
501 break;
502 case TermType::CharacterSet:
503 return m_quantifier == AtomQuantifier::One;
504 case TermType::Group: {
505 if (m_quantifier != AtomQuantifier::One)
506 return false;
507 for (const Term& term : m_atomData.group.terms) {
508 if (!term.hasFixedLength())
509 return false;
510 }
511 return true;
512 }
513 }
514 return false;
515}
516
517inline Term& Term::operator=(const Term& other)
518{
519 destroy();
520 new (NotNull, this) Term(other);
521 return *this;
522}
523
524inline Term& Term::operator=(Term&& other)
525{
526 destroy();
527 new (NotNull, this) Term(WTFMove(other));
528 return *this;
529}
530
531inline bool Term::operator==(const Term& other) const
532{
533 if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
534 return false;
535
536 switch (m_termType) {
537 case TermType::Empty:
538 return true;
539 case TermType::CharacterSet:
540 return m_atomData.characterSet == other.m_atomData.characterSet;
541 case TermType::Group:
542 return m_atomData.group == other.m_atomData.group;
543 }
544 ASSERT_NOT_REACHED();
545 return false;
546}
547
548inline unsigned Term::hash() const
549{
550 unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
551 unsigned secondary = 0;
552 switch (m_termType) {
553 case TermType::Empty:
554 secondary = 52184393;
555 break;
556 case TermType::CharacterSet:
557 secondary = m_atomData.characterSet.hash();
558 break;
559 case TermType::Group:
560 secondary = m_atomData.group.hash();
561 break;
562 }
563 return WTF::pairIntHash(primary, secondary);
564}
565
566inline bool Term::isEmptyValue() const
567{
568 return m_termType == TermType::Empty;
569}
570
571inline bool Term::isUniversalTransition() const
572{
573 ASSERT(isValid());
574
575 switch (m_termType) {
576 case TermType::Empty:
577 ASSERT_NOT_REACHED();
578 break;
579 case TermType::CharacterSet:
580 return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount())
581 || (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == 127 && !m_atomData.characterSet.get(0));
582 case TermType::Group:
583 return m_atomData.group.terms.size() == 1 && m_atomData.group.terms.first().isUniversalTransition();
584 }
585 return false;
586}
587
588inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const
589{
590 generateSubgraphForAtom(nfa, source, destination.nodeId());
591}
592
593inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
594{
595 switch (m_termType) {
596 case TermType::Empty:
597 ASSERT_NOT_REACHED();
598 source.addEpsilonTransition(destination);
599 break;
600 case TermType::CharacterSet: {
601 if (!m_atomData.characterSet.inverted()) {
602 UChar i = 0;
603 while (true) {
604 while (i < 128 && !m_atomData.characterSet.get(i))
605 ++i;
606 if (i == 128)
607 break;
608
609 UChar start = i;
610 ++i;
611 while (i < 128 && m_atomData.characterSet.get(i))
612 ++i;
613 source.addTransition(start, i - 1, destination);
614 }
615 } else {
616 UChar i = 1;
617 while (true) {
618 while (i < 128 && m_atomData.characterSet.get(i))
619 ++i;
620 if (i == 128)
621 break;
622
623 UChar start = i;
624 ++i;
625 while (i < 128 && !m_atomData.characterSet.get(i))
626 ++i;
627 source.addTransition(start, i - 1, destination);
628 }
629 }
630 break;
631 }
632 case TermType::Group: {
633 if (m_atomData.group.terms.isEmpty()) {
634 // FIXME: any kind of empty term could be avoided in the parser. This case should turned into an assertion.
635 source.addEpsilonTransition(destination);
636 return;
637 }
638
639 if (m_atomData.group.terms.size() == 1) {
640 m_atomData.group.terms.first().generateGraph(nfa, source, destination);
641 return;
642 }
643
644 ImmutableCharNFANodeBuilder lastTarget = m_atomData.group.terms.first().generateGraph(nfa, source, ActionList());
645 for (unsigned i = 1; i < m_atomData.group.terms.size() - 1; ++i) {
646 const Term& currentTerm = m_atomData.group.terms[i];
647 ImmutableCharNFANodeBuilder newNode = currentTerm.generateGraph(nfa, lastTarget, ActionList());
648 lastTarget = WTFMove(newNode);
649 }
650 const Term& lastTerm = m_atomData.group.terms.last();
651 lastTerm.generateGraph(nfa, lastTarget, destination);
652 break;
653 }
654 }
655}
656
657inline void Term::destroy()
658{
659 switch (m_termType) {
660 case TermType::Empty:
661 break;
662 case TermType::CharacterSet:
663 m_atomData.characterSet.~CharacterSet();
664 break;
665 case TermType::Group:
666 m_atomData.group.~Group();
667 break;
668 }
669 m_termType = TermType::Empty;
670}
671
672#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
673inline size_t Term::memoryUsed() const
674{
675 size_t extraMemory = 0;
676 if (m_termType == TermType::Group) {
677 for (const Term& term : m_atomData.group.terms)
678 extraMemory += term.memoryUsed();
679 }
680 return sizeof(Term) + extraMemory;
681}
682#endif
683
684} // namespace ContentExtensions
685} // namespace WebCore
686
687#endif // ENABLE(CONTENT_EXTENSIONS)
688