Term.h source code [webkit/Source/WebCore/contentextensions/Term.h]

1	/*
2	* Copyright (C) 2014-2015 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15	* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23	* THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#pragma once
27
28	#if ENABLE(CONTENT_EXTENSIONS)
29
30	#include "ContentExtensionsDebugging.h"
31	#include "NFA.h"
32	#include <unicode/utypes.h>
33	#include <wtf/ASCIICType.h>
34	#include <wtf/HashMap.h>
35	#include <wtf/Vector.h>
36	#include <wtf/text/StringBuilder.h>
37	#include <wtf/text/WTFString.h>
38
39	namespace WebCore {
40
41	namespace ContentExtensions {
42
43	enum class AtomQuantifier : uint8_t {
44	One,
45	ZeroOrOne,
46	ZeroOrMore,
47	OneOrMore
48	};
49
50	class Term {
51	public:
52	Term();
53	Term(char character, bool isCaseSensitive);
54
55	enum UniversalTransitionTag { UniversalTransition };
56	explicit Term(UniversalTransitionTag);
57
58	enum CharacterSetTermTag { CharacterSetTerm };
59	explicit Term(CharacterSetTermTag, bool isInverted);
60
61	enum GroupTermTag { GroupTerm };
62	explicit Term(GroupTermTag);
63
64	enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm };
65	explicit Term(EndOfLineAssertionTermTag);
66
67	Term(const Term& other);
68	Term(Term&& other);
69
70	enum EmptyValueTag { EmptyValue };
71	Term(EmptyValueTag);
72
73	~Term();
74
75	bool isValid() const;
76
77	// CharacterSet terms only.
78	void addCharacter(UChar character, bool isCaseSensitive);
79
80	// Group terms only.
81	void extendGroupSubpattern(const Term&);
82
83	void quantify(const AtomQuantifier&);
84
85	ImmutableCharNFANodeBuilder generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const;
86	void generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
87
88	bool isEndOfLineAssertion() const;
89
90	bool matchesAtLeastOneCharacter() const;
91
92	// Matches any string, the empty string included.
93	// This is very conservative. Patterns matching any string can return false here.
94	bool isKnownToMatchAnyString() const;
95
96	// Return true if the term can only match input of a single fixed length.
97	bool hasFixedLength() const;
98
99	Term& operator=(const Term& other);
100	Term& operator=(Term&& other);
101
102	bool operator==(const Term& other) const;
103
104	unsigned hash() const;
105
106	bool isEmptyValue() const;
107
108	#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
109	String toString() const;
110	#endif
111
112	#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
113	size_t memoryUsed() const;
114	#endif
115
116	private:
117	// This is exact for character sets but conservative for groups.
118	// The return value can be false for a group equivalent to a universal transition.
119	bool isUniversalTransition() const;
120
121	void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const;
122	void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
123
124	void destroy();
125
126	enum class TermType : uint8_t {
127	Empty,
128	CharacterSet,
129	Group
130	};
131
132	TermType m_termType { TermType::Empty };
133	AtomQuantifier m_quantifier { AtomQuantifier::One };
134
135	class CharacterSet {
136	public:
137	void set(UChar character)
138	{
139	RELEASE_ASSERT(character < `128`);
140	m_characters[character / `64`] \|= (uint64_t(`1`) << (character % `64`));
141	}
142
143	bool get(UChar character) const
144	{
145	RELEASE_ASSERT(character < `128`);
146	return m_characters[character / `64`] & (uint64_t(`1`) << (character % `64`));
147	}
148
149	void invert()
150	{
151	ASSERT(!m_inverted);
152	m_inverted = true;
153	}
154
155	bool inverted() const { return m_inverted; }
156
157	unsigned bitCount() const
158	{
159	return WTF::bitCount(m_characters[`0`]) + WTF::bitCount(m_characters[`1`]);
160	}
161
162	bool operator==(const CharacterSet& other) const
163	{
164	return other.m_inverted == m_inverted
165	&& other.m_characters[`0`] == m_characters[`0`]
166	&& other.m_characters[`1`] == m_characters[`1`];
167	}
168
169	unsigned hash() const
170	{
171	return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[`0`]), WTF::intHash(m_characters[`1`])), m_inverted);
172	}
173	private:
174	bool m_inverted { false };
175	uint64_t m_characters[`2`] { `0`, `0` };
176	};
177
178	struct Group {
179	Vector<Term> terms;
180
181	bool operator==(const Group& other) const
182	{
183	return other.terms == terms;
184	}
185
186	unsigned hash() const
187	{
188	unsigned hash = `6421749`;
189	for (const Term& term : terms) {
190	unsigned termHash = term.hash();
191	hash = (hash << `16`) ^ ((termHash << `11`) ^ hash);
192	hash += hash >> `11`;
193	}
194	return hash;
195	}
196	};
197
198	union AtomData {
199	AtomData()
200	: invalidTerm(`0`)
201	{
202	}
203	~AtomData()
204	{
205	}
206
207	char invalidTerm;
208	CharacterSet characterSet;
209	Group group;
210	} m_atomData;
211	};
212
213	#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
214	inline String quantifierToString(AtomQuantifier quantifier)
215	{
216	switch (quantifier) {
217	case AtomQuantifier::One:
218	return "";
219	case AtomQuantifier::ZeroOrOne:
220	return "?";
221	case AtomQuantifier::ZeroOrMore:
222	return "*";
223	case AtomQuantifier::OneOrMore:
224	return "+";
225	}
226	}
227
228	inline String Term::toString() const
229	{
230	switch (m_termType) {
231	case TermType::Empty:
232	return "(Empty)";
233	case TermType::CharacterSet: {
234	StringBuilder builder;
235	builder.append(`'['`);
236	for (UChar c = `0`; c < `128`; c++) {
237	if (m_atomData.characterSet.get(c)) {
238	if (isASCIIPrintable(c) && !isASCIISpace(c))
239	builder.append(c);
240	else {
241	builder.append(`'\\'`);
242	builder.append(`'u'`);
243	builder.appendNumber(c);
244	}
245	}
246	}
247	builder.append(`']'`);
248	builder.append(quantifierToString(m_quantifier));
249	return builder.toString();
250	}
251	case TermType::Group: {
252	StringBuilder builder;
253	builder.append(`'('`);
254	for (const Term& term : m_atomData.group.terms)
255	builder.append(term.toString());
256	builder.append(`')'`);
257	builder.append(quantifierToString(m_quantifier));
258	return builder.toString();
259	}
260	}
261	}
262	#endif
263
264	inline Term::Term()
265	{
266	}
267
268	inline Term::Term(char character, bool isCaseSensitive)
269	: m_termType(TermType::CharacterSet)
270	{
271	new (NotNull, &m_atomData.characterSet) CharacterSet ();
272	addCharacter(character, isCaseSensitive);
273	}
274
275	inline Term::Term(UniversalTransitionTag)
276	: m_termType(TermType::CharacterSet)
277	{
278	new (NotNull, &m_atomData.characterSet) CharacterSet ();
279	for (UChar i = `1`; i < `128`; ++i)
280	m_atomData.characterSet.set(i);
281	}
282
283	inline Term::Term(CharacterSetTermTag, bool isInverted)
284	: m_termType(TermType::CharacterSet)
285	{
286	new (NotNull, &m_atomData.characterSet) CharacterSet ();
287	if (isInverted)
288	m_atomData.characterSet.invert();
289	}
290
291	inline Term::Term(GroupTermTag)
292	: m_termType(TermType::Group)
293	{
294	new (NotNull, &m_atomData.group) Group ();
295	}
296
297	inline Term::Term(EndOfLineAssertionTermTag)
298	: Term (CharacterSetTerm, false)
299	{
300	m_atomData.characterSet.set(`0`);
301	}
302
303	inline Term::Term(const Term& other)
304	: m_termType(other.m_termType)
305	, m_quantifier(other.m_quantifier)
306	{
307	switch (m_termType) {
308	case TermType::Empty:
309	break;
310	case TermType::CharacterSet:
311	new (NotNull, &m_atomData.characterSet) CharacterSet (other.m_atomData.characterSet);
312	break;
313	case TermType::Group:
314	new (NotNull, &m_atomData.group) Group (other.m_atomData.group);
315	break;
316	}
317	}
318
319	inline Term::Term(Term&& other)
320	: m_termType(WTFMove(other.m_termType))
321	, m_quantifier(WTFMove(other.m_quantifier))
322	{
323	switch (m_termType) {
324	case TermType::Empty:
325	break;
326	case TermType::CharacterSet:
327	new (NotNull, &m_atomData.characterSet) CharacterSet (WTFMove(other.m_atomData.characterSet));
328	break;
329	case TermType::Group:
330	new (NotNull, &m_atomData.group) Group (WTFMove(other.m_atomData.group));
331	break;
332	}
333	other.destroy();
334	}
335
336	inline Term::Term(EmptyValueTag)
337	: m_termType(TermType::Empty)
338	{
339	}
340
341	inline Term::~Term()
342	{
343	destroy();
344	}
345
346	inline bool Term::isValid() const
347	{
348	return m_termType != TermType::Empty;
349	}
350
351	inline void Term::addCharacter(UChar character, bool isCaseSensitive)
352	{
353	ASSERT(isASCII(character));
354
355	ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
356	if (m_termType != TermType::CharacterSet)
357	return;
358
359	if (isCaseSensitive \|\| !isASCIIAlpha(character))
360	m_atomData.characterSet.set(character);
361	else {
362	m_atomData.characterSet.set(toASCIIUpper(character));
363	m_atomData.characterSet.set(toASCIILower(character));
364	}
365	}
366
367	inline void Term::extendGroupSubpattern(const Term& term)
368	{
369	ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
370	if (m_termType != TermType::Group)
371	return;
372	m_atomData.group.terms.append(term);
373	}
374
375	inline void Term::quantify(const AtomQuantifier& quantifier)
376	{
377	ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
378	m_quantifier = quantifier;
379	}
380
381	inline ImmutableCharNFANodeBuilder Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const
382	{
383	ImmutableCharNFANodeBuilder newEnd(nfa);
384	generateGraph(nfa, source, newEnd.nodeId());
385	newEnd.setActions(finalActions.begin(), finalActions.end());
386	return newEnd;
387	}
388
389	inline void Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
390	{
391	ASSERT(isValid());
392
393	switch (m_quantifier) {
394	case AtomQuantifier::One: {
395	generateSubgraphForAtom(nfa, source, destination);
396	break;
397	}
398	case AtomQuantifier::ZeroOrOne: {
399	generateSubgraphForAtom(nfa, source, destination);
400	source.addEpsilonTransition(destination);
401	break;
402	}
403	case AtomQuantifier::ZeroOrMore: {
404	ImmutableCharNFANodeBuilder repeatStart(nfa);
405	source.addEpsilonTransition(repeatStart);
406
407	ImmutableCharNFANodeBuilder repeatEnd(nfa);
408	generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
409	repeatEnd.addEpsilonTransition(repeatStart);
410
411	repeatEnd.addEpsilonTransition(destination);
412	source.addEpsilonTransition(destination);
413	break;
414	}
415	case AtomQuantifier::OneOrMore: {
416	ImmutableCharNFANodeBuilder repeatStart(nfa);
417	source.addEpsilonTransition(repeatStart);
418
419	ImmutableCharNFANodeBuilder repeatEnd(nfa);
420	generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
421	repeatEnd.addEpsilonTransition(repeatStart);
422
423	repeatEnd.addEpsilonTransition(destination);
424	break;
425	}
426	}
427	}
428
429	inline bool Term::isEndOfLineAssertion() const
430	{
431	return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == `1` && m_atomData.characterSet.get(`0`);
432	}
433
434	inline bool Term::matchesAtLeastOneCharacter() const
435	{
436	ASSERT(isValid());
437
438	if (m_quantifier == AtomQuantifier::ZeroOrOne \|\| m_quantifier == AtomQuantifier::ZeroOrMore)
439	return false;
440	if (isEndOfLineAssertion())
441	return false;
442
443	if (m_termType == TermType::Group) {
444	for (const Term& term : m_atomData.group.terms) {
445	if (term.matchesAtLeastOneCharacter())
446	return true;
447	}
448	return false;
449	}
450	return true;
451	}
452
453	inline bool Term::isKnownToMatchAnyString() const
454	{
455	ASSERT(isValid());
456
457	switch (m_termType) {
458	case TermType::Empty:
459	ASSERT_NOT_REACHED();
460	break;
461	case TermType::CharacterSet:
462	// "." is the only simple term matching any string.*
463	return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore;
464	break;
465	case TermType::Group: {
466	// There are infinitely many ways to match anything with groups, we just handle simple cases
467	if (m_atomData.group.terms.size() != `1`)
468	return false;
469
470	const Term& firstTermInGroup = m_atomData.group.terms.first();
471	// -(.) with any quantifier.*
472	if (firstTermInGroup.isKnownToMatchAnyString())
473	return true;
474
475	if (firstTermInGroup.isUniversalTransition()) {
476	// -(.), (.+), (.?) etc.*
477	if (m_quantifier == AtomQuantifier::ZeroOrMore)
478	return true;
479
480	// -(.+)?.
481	if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore)
482	return true;
483
484	// -(.?)+.
485	if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne)
486	return true;
487	}
488	break;
489	}
490	}
491	return false;
492	}
493
494	inline bool Term::hasFixedLength() const
495	{
496	ASSERT(isValid());
497
498	switch (m_termType) {
499	case TermType::Empty:
500	ASSERT_NOT_REACHED();
501	break;
502	case TermType::CharacterSet:
503	return m_quantifier == AtomQuantifier::One;
504	case TermType::Group: {
505	if (m_quantifier != AtomQuantifier::One)
506	return false;
507	for (const Term& term : m_atomData.group.terms) {
508	if (!term.hasFixedLength())
509	return false;
510	}
511	return true;
512	}
513	}
514	return false;
515	}
516
517	inline Term& Term::operator=(const Term& other)
518	{
519	destroy();
520	new (NotNull, this) Term (other);
521	return *this;
522	}
523
524	inline Term& Term::operator=(Term&& other)
525	{
526	destroy();
527	new (NotNull, this) Term (WTFMove(other));
528	return *this;
529	}
530
531	inline bool Term::operator==(const Term& other) const
532	{
533	if (other.m_termType != m_termType \|\| other.m_quantifier != m_quantifier)
534	return false;
535
536	switch (m_termType) {
537	case TermType::Empty:
538	return true;
539	case TermType::CharacterSet:
540	return m_atomData.characterSet == other.m_atomData.characterSet;
541	case TermType::Group:
542	return m_atomData.group == other.m_atomData.group;
543	}
544	ASSERT_NOT_REACHED();
545	return false;
546	}
547
548	inline unsigned Term::hash() const
549	{
550	unsigned primary = static_cast<unsigned>(m_termType) << `16` \| static_cast<unsigned>(m_quantifier);
551	unsigned secondary = `0`;
552	switch (m_termType) {
553	case TermType::Empty:
554	secondary = `52184393`;
555	break;
556	case TermType::CharacterSet:
557	secondary = m_atomData.characterSet.hash();
558	break;
559	case TermType::Group:
560	secondary = m_atomData.group.hash();
561	break;
562	}
563	return WTF::pairIntHash(primary, secondary);
564	}
565
566	inline bool Term::isEmptyValue() const
567	{
568	return m_termType == TermType::Empty;
569	}
570
571	inline bool Term::isUniversalTransition() const
572	{
573	ASSERT(isValid());
574
575	switch (m_termType) {
576	case TermType::Empty:
577	ASSERT_NOT_REACHED();
578	break;
579	case TermType::CharacterSet:
580	return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount())
581	\|\| (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == `127` && !m_atomData.characterSet.get(`0`));
582	case TermType::Group:
583	return m_atomData.group.terms.size() == `1` && m_atomData.group.terms.first().isUniversalTransition();
584	}
585	return false;
586	}
587
588	inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const
589	{
590	generateSubgraphForAtom(nfa, source, destination.nodeId());
591	}
592
593	inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
594	{
595	switch (m_termType) {
596	case TermType::Empty:
597	ASSERT_NOT_REACHED();
598	source.addEpsilonTransition(destination);
599	break;
600	case TermType::CharacterSet: {
601	if (!m_atomData.characterSet.inverted()) {
602	UChar i = `0`;
603	while (true) {
604	while (i < `128` && !m_atomData.characterSet.get(i))
605	++i;
606	if (i == `128`)
607	break;
608
609	UChar start = i;
610	++i;
611	while (i < `128` && m_atomData.characterSet.get(i))
612	++i;
613	source.addTransition(start, i - `1`, destination);
614	}
615	} else {
616	UChar i = `1`;
617	while (true) {
618	while (i < `128` && m_atomData.characterSet.get(i))
619	++i;
620	if (i == `128`)
621	break;
622
623	UChar start = i;
624	++i;
625	while (i < `128` && !m_atomData.characterSet.get(i))
626	++i;
627	source.addTransition(start, i - `1`, destination);
628	}
629	}
630	break;
631	}
632	case TermType::Group: {
633	if (m_atomData.group.terms.isEmpty()) {
634	// FIXME: any kind of empty term could be avoided in the parser. This case should turned into an assertion.
635	source.addEpsilonTransition(destination);
636	return;
637	}
638
639	if (m_atomData.group.terms.size() == `1`) {
640	m_atomData.group.terms.first().generateGraph(nfa, source, destination);
641	return;
642	}
643
644	ImmutableCharNFANodeBuilder lastTarget = m_atomData.group.terms.first().generateGraph(nfa, source, ActionList ());
645	for (unsigned i = `1`; i < m_atomData.group.terms.size() - `1`; ++i) {
646	const Term& currentTerm = m_atomData.group.terms [i];
647	ImmutableCharNFANodeBuilder newNode = currentTerm.generateGraph(nfa, lastTarget, ActionList ());
648	lastTarget = WTFMove(newNode);
649	}
650	const Term& lastTerm = m_atomData.group.terms.last();
651	lastTerm.generateGraph(nfa, lastTarget, destination);
652	break;
653	}
654	}
655	}
656
657	inline void Term::destroy()
658	{
659	switch (m_termType) {
660	case TermType::Empty:
661	break;
662	case TermType::CharacterSet:
663	m_atomData.characterSet.~CharacterSet();
664	break;
665	case TermType::Group:
666	m_atomData.group.~Group();
667	break;
668	}
669	m_termType = TermType::Empty;
670	}
671
672	#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
673	inline size_t Term::memoryUsed() const
674	{
675	size_t extraMemory = `0`;
676	if (m_termType == TermType::Group) {
677	for (const Term& term : m_atomData.group.terms)
678	extraMemory += term.memoryUsed();
679	}
680	return sizeof(Term) + extraMemory;
681	}
682	#endif
683
684	} // namespace ContentExtensions
685	} // namespace WebCore
686
687	#endif // ENABLE(CONTENT_EXTENSIONS)
688

Browse the source code of webkit/Source/WebCore/contentextensions/Term.h