1/*
2 * Copyright (C) 2014, 2015 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "URLFilterParser.h"
28
29#if ENABLE(CONTENT_EXTENSIONS)
30
31#include "CombinedURLFilters.h"
32#include "Term.h"
33#include <JavaScriptCore/YarrParser.h>
34#include <wtf/Deque.h>
35#include <wtf/text/CString.h>
36
37namespace WebCore {
38
39namespace ContentExtensions {
40
41class PatternParser {
42public:
43 PatternParser(bool patternIsCaseSensitive)
44 : m_patternIsCaseSensitive(patternIsCaseSensitive)
45 , m_parseStatus(URLFilterParser::Ok)
46 {
47 }
48
49 void finalize(uint64_t patternId, CombinedURLFilters& combinedURLFilters)
50 {
51 if (hasError())
52 return;
53
54 sinkFloatingTermIfNecessary();
55
56 simplifySunkTerms();
57
58 // Check to see if there are any terms without ? or *.
59 bool matchesEverything = true;
60 for (const auto& term : m_sunkTerms) {
61 if (term.matchesAtLeastOneCharacter()) {
62 matchesEverything = false;
63 break;
64 }
65 }
66 if (matchesEverything) {
67 fail(URLFilterParser::MatchesEverything);
68 return;
69 }
70
71 combinedURLFilters.addPattern(patternId, m_sunkTerms);
72 }
73
74 URLFilterParser::ParseStatus parseStatus() const
75 {
76 return m_parseStatus;
77 }
78
79 void atomPatternCharacter(UChar character)
80 {
81 if (hasError())
82 return;
83
84 if (!isASCII(character)) {
85 fail(URLFilterParser::NonASCII);
86 return;
87 }
88
89 sinkFloatingTermIfNecessary();
90 ASSERT(!m_floatingTerm.isValid());
91
92 char asciiChararacter = static_cast<char>(character);
93 m_floatingTerm = Term(asciiChararacter, m_patternIsCaseSensitive);
94 }
95
96 void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
97 {
98 if (hasError())
99 return;
100
101 sinkFloatingTermIfNecessary();
102 ASSERT(!m_floatingTerm.isValid());
103
104 if (builtInCharacterClassID == JSC::Yarr::BuiltInCharacterClassID::DotClassID && !inverted)
105 m_floatingTerm = Term(Term::UniversalTransition);
106 else
107 fail(URLFilterParser::UnsupportedCharacterClass);
108 }
109
110 void quantifyAtom(unsigned minimum, unsigned maximum, bool)
111 {
112 if (hasError())
113 return;
114
115 ASSERT(m_floatingTerm.isValid());
116
117 if (!minimum && maximum == 1)
118 m_floatingTerm.quantify(AtomQuantifier::ZeroOrOne);
119 else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
120 m_floatingTerm.quantify(AtomQuantifier::ZeroOrMore);
121 else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
122 m_floatingTerm.quantify(AtomQuantifier::OneOrMore);
123 else
124 fail(URLFilterParser::InvalidQuantifier);
125 }
126
127 void atomBackReference(unsigned)
128 {
129 fail(URLFilterParser::BackReference);
130 }
131
132 void atomNamedBackReference(const String&)
133 {
134 fail(URLFilterParser::BackReference);
135 }
136
137 bool isValidNamedForwardReference(const String&)
138 {
139 return false;
140 }
141
142 void atomNamedForwardReference(const String&)
143 {
144 fail(URLFilterParser::ForwardReference);
145 }
146
147 void assertionBOL()
148 {
149 if (hasError())
150 return;
151
152 if (m_floatingTerm.isValid() || !m_sunkTerms.isEmpty() || !m_openGroups.isEmpty()) {
153 fail(URLFilterParser::MisplacedStartOfLine);
154 return;
155 }
156
157 m_hasBeginningOfLineAssertion = true;
158 }
159
160 void assertionEOL()
161 {
162 if (hasError())
163 return;
164
165 sinkFloatingTermIfNecessary();
166 ASSERT(!m_floatingTerm.isValid());
167
168 m_floatingTerm = Term(Term::EndOfLineAssertionTerm);
169 }
170
171 void assertionWordBoundary(bool)
172 {
173 fail(URLFilterParser::WordBoundary);
174 }
175
176 void atomCharacterClassBegin(bool inverted = false)
177 {
178 if (hasError())
179 return;
180
181 sinkFloatingTermIfNecessary();
182 ASSERT(!m_floatingTerm.isValid());
183
184 m_floatingTerm = Term(Term::CharacterSetTerm, inverted);
185 }
186
187 void atomCharacterClassAtom(UChar character)
188 {
189 if (hasError())
190 return;
191
192 ASSERT(isASCII(character));
193
194 m_floatingTerm.addCharacter(character, m_patternIsCaseSensitive);
195 }
196
197 void atomCharacterClassRange(UChar a, UChar b)
198 {
199 if (hasError())
200 return;
201
202 ASSERT(a);
203 ASSERT(b);
204 ASSERT(isASCII(a));
205 ASSERT(isASCII(b));
206
207 for (unsigned i = a; i <= b; ++i)
208 m_floatingTerm.addCharacter(static_cast<UChar>(i), m_patternIsCaseSensitive);
209 }
210
211 void atomCharacterClassEnd()
212 {
213 // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily.
214 }
215
216 void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
217 {
218 fail(URLFilterParser::AtomCharacter);
219 }
220
221 void atomParenthesesSubpatternBegin(bool = true, Optional<String> = WTF::nullopt)
222 {
223 if (hasError())
224 return;
225
226 sinkFloatingTermIfNecessary();
227
228 m_openGroups.append(Term(Term::GroupTerm));
229 }
230
231 void atomParentheticalAssertionBegin(bool = false)
232 {
233 fail(URLFilterParser::Group);
234 }
235
236 void atomParenthesesEnd()
237 {
238 if (hasError())
239 return;
240
241 sinkFloatingTermIfNecessary();
242 ASSERT(!m_floatingTerm.isValid());
243
244 m_floatingTerm = m_openGroups.takeLast();
245 }
246
247 void disjunction()
248 {
249 fail(URLFilterParser::Disjunction);
250 }
251
252private:
253 bool hasError() const
254 {
255 return m_parseStatus != URLFilterParser::Ok;
256 }
257
258 void fail(URLFilterParser::ParseStatus reason)
259 {
260 if (hasError())
261 return;
262
263 m_parseStatus = reason;
264 }
265
266 void sinkFloatingTermIfNecessary()
267 {
268 if (!m_floatingTerm.isValid())
269 return;
270
271 if (m_hasProcessedEndOfLineAssertion) {
272 fail(URLFilterParser::MisplacedEndOfLine);
273 m_floatingTerm = Term();
274 return;
275 }
276
277 if (m_floatingTerm.isEndOfLineAssertion())
278 m_hasProcessedEndOfLineAssertion = true;
279
280 if (!m_openGroups.isEmpty()) {
281 m_openGroups.last().extendGroupSubpattern(m_floatingTerm);
282 m_floatingTerm = Term();
283 return;
284 }
285
286 m_sunkTerms.append(m_floatingTerm);
287 m_floatingTerm = Term();
288 }
289
290 void simplifySunkTerms()
291 {
292 ASSERT(!m_floatingTerm.isValid());
293
294 if (m_sunkTerms.isEmpty())
295 return;
296
297 Term canonicalDotStar(Term::UniversalTransition);
298 canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore);
299
300 // Replace every ".*"-like terms by our canonical version. Remove any duplicate ".*".
301 {
302 unsigned termIndex = 0;
303 bool isAfterDotStar = false;
304 while (termIndex < m_sunkTerms.size()) {
305 if (isAfterDotStar && m_sunkTerms[termIndex].isKnownToMatchAnyString()) {
306 m_sunkTerms.remove(termIndex);
307 continue;
308 }
309 isAfterDotStar = false;
310
311 if (m_sunkTerms[termIndex].isKnownToMatchAnyString()) {
312 m_sunkTerms[termIndex] = canonicalDotStar;
313 isAfterDotStar = true;
314 }
315 ++termIndex;
316 }
317 }
318
319 // Add our ".*" in front if needed.
320 if (!m_hasBeginningOfLineAssertion && !m_sunkTerms.first().isKnownToMatchAnyString())
321 m_sunkTerms.insert(0, canonicalDotStar);
322
323 // Remove trailing ".*$".
324 if (m_sunkTerms.size() > 2 && m_sunkTerms.last().isEndOfLineAssertion() && m_sunkTerms[m_sunkTerms.size() - 2].isKnownToMatchAnyString())
325 m_sunkTerms.shrink(m_sunkTerms.size() - 2);
326
327 // Remove irrelevant terms that can match empty. For example in "foob?", matching "b" is irrelevant.
328 if (m_sunkTerms.last().isEndOfLineAssertion())
329 return;
330 while (!m_sunkTerms.isEmpty() && !m_sunkTerms.last().matchesAtLeastOneCharacter())
331 m_sunkTerms.removeLast();
332 }
333
334 bool m_patternIsCaseSensitive;
335
336 Deque<Term> m_openGroups;
337 Vector<Term> m_sunkTerms;
338 Term m_floatingTerm;
339 bool m_hasBeginningOfLineAssertion { false };
340 bool m_hasProcessedEndOfLineAssertion { false };
341
342 URLFilterParser::ParseStatus m_parseStatus;
343};
344
345URLFilterParser::URLFilterParser(CombinedURLFilters& combinedURLFilters)
346 : m_combinedURLFilters(combinedURLFilters)
347{
348}
349
350URLFilterParser::~URLFilterParser() = default;
351
352URLFilterParser::ParseStatus URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId)
353{
354 if (!pattern.isAllASCII())
355 return NonASCII;
356 if (pattern.isEmpty())
357 return EmptyPattern;
358
359 ParseStatus status = Ok;
360 PatternParser patternParser(patternIsCaseSensitive);
361 if (!JSC::Yarr::hasError(JSC::Yarr::parse(patternParser, pattern, false, 0)))
362 patternParser.finalize(patternId, m_combinedURLFilters);
363 else
364 status = YarrError;
365
366 if (status == Ok)
367 status = patternParser.parseStatus();
368
369 return status;
370}
371
372String URLFilterParser::statusString(ParseStatus status)
373{
374 switch (status) {
375 case Ok:
376 return "Ok";
377 case MatchesEverything:
378 return "Matches everything.";
379 case NonASCII:
380 return "Only ASCII characters are supported in pattern.";
381 case UnsupportedCharacterClass:
382 return "Character class is not supported.";
383 case BackReference:
384 return "Patterns cannot contain backreferences.";
385 case ForwardReference:
386 return "Patterns cannot contain forward references.";
387 case MisplacedStartOfLine:
388 return "Start of line assertion can only appear as the first term in a filter.";
389 case WordBoundary:
390 return "Word boundaries assertions are not supported yet.";
391 case AtomCharacter:
392 return "Builtins character class atoms are not supported yet.";
393 case Group:
394 return "Groups are not supported yet.";
395 case Disjunction:
396 return "Disjunctions are not supported yet.";
397 case MisplacedEndOfLine:
398 return "The end of line assertion must be the last term in an expression.";
399 case EmptyPattern:
400 return "Empty pattern.";
401 case YarrError:
402 return "Internal error in YARR.";
403 case InvalidQuantifier:
404 return "Arbitrary atom repetitions are not supported.";
405 }
406
407 RELEASE_ASSERT_NOT_REACHED();
408}
409
410} // namespace ContentExtensions
411} // namespace WebCore
412
413#endif // ENABLE(CONTENT_EXTENSIONS)
414