1 | /* |
2 | * Copyright (C) 2014, 2015 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
15 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
17 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
18 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
19 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
20 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
21 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
22 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
23 | * THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "URLFilterParser.h" |
28 | |
29 | #if ENABLE(CONTENT_EXTENSIONS) |
30 | |
31 | #include "CombinedURLFilters.h" |
32 | #include "Term.h" |
33 | #include <JavaScriptCore/YarrParser.h> |
34 | #include <wtf/Deque.h> |
35 | #include <wtf/text/CString.h> |
36 | |
37 | namespace WebCore { |
38 | |
39 | namespace ContentExtensions { |
40 | |
41 | class PatternParser { |
42 | public: |
43 | PatternParser(bool patternIsCaseSensitive) |
44 | : m_patternIsCaseSensitive(patternIsCaseSensitive) |
45 | , m_parseStatus(URLFilterParser::Ok) |
46 | { |
47 | } |
48 | |
49 | void finalize(uint64_t patternId, CombinedURLFilters& combinedURLFilters) |
50 | { |
51 | if (hasError()) |
52 | return; |
53 | |
54 | sinkFloatingTermIfNecessary(); |
55 | |
56 | simplifySunkTerms(); |
57 | |
58 | // Check to see if there are any terms without ? or *. |
59 | bool matchesEverything = true; |
60 | for (const auto& term : m_sunkTerms) { |
61 | if (term.matchesAtLeastOneCharacter()) { |
62 | matchesEverything = false; |
63 | break; |
64 | } |
65 | } |
66 | if (matchesEverything) { |
67 | fail(URLFilterParser::MatchesEverything); |
68 | return; |
69 | } |
70 | |
71 | combinedURLFilters.addPattern(patternId, m_sunkTerms); |
72 | } |
73 | |
74 | URLFilterParser::ParseStatus parseStatus() const |
75 | { |
76 | return m_parseStatus; |
77 | } |
78 | |
79 | void atomPatternCharacter(UChar character) |
80 | { |
81 | if (hasError()) |
82 | return; |
83 | |
84 | if (!isASCII(character)) { |
85 | fail(URLFilterParser::NonASCII); |
86 | return; |
87 | } |
88 | |
89 | sinkFloatingTermIfNecessary(); |
90 | ASSERT(!m_floatingTerm.isValid()); |
91 | |
92 | char asciiChararacter = static_cast<char>(character); |
93 | m_floatingTerm = Term(asciiChararacter, m_patternIsCaseSensitive); |
94 | } |
95 | |
96 | void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted) |
97 | { |
98 | if (hasError()) |
99 | return; |
100 | |
101 | sinkFloatingTermIfNecessary(); |
102 | ASSERT(!m_floatingTerm.isValid()); |
103 | |
104 | if (builtInCharacterClassID == JSC::Yarr::BuiltInCharacterClassID::DotClassID && !inverted) |
105 | m_floatingTerm = Term(Term::UniversalTransition); |
106 | else |
107 | fail(URLFilterParser::UnsupportedCharacterClass); |
108 | } |
109 | |
110 | void quantifyAtom(unsigned minimum, unsigned maximum, bool) |
111 | { |
112 | if (hasError()) |
113 | return; |
114 | |
115 | ASSERT(m_floatingTerm.isValid()); |
116 | |
117 | if (!minimum && maximum == 1) |
118 | m_floatingTerm.quantify(AtomQuantifier::ZeroOrOne); |
119 | else if (!minimum && maximum == JSC::Yarr::quantifyInfinite) |
120 | m_floatingTerm.quantify(AtomQuantifier::ZeroOrMore); |
121 | else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite) |
122 | m_floatingTerm.quantify(AtomQuantifier::OneOrMore); |
123 | else |
124 | fail(URLFilterParser::InvalidQuantifier); |
125 | } |
126 | |
127 | void atomBackReference(unsigned) |
128 | { |
129 | fail(URLFilterParser::BackReference); |
130 | } |
131 | |
132 | void atomNamedBackReference(const String&) |
133 | { |
134 | fail(URLFilterParser::BackReference); |
135 | } |
136 | |
137 | bool isValidNamedForwardReference(const String&) |
138 | { |
139 | return false; |
140 | } |
141 | |
142 | void atomNamedForwardReference(const String&) |
143 | { |
144 | fail(URLFilterParser::ForwardReference); |
145 | } |
146 | |
147 | void assertionBOL() |
148 | { |
149 | if (hasError()) |
150 | return; |
151 | |
152 | if (m_floatingTerm.isValid() || !m_sunkTerms.isEmpty() || !m_openGroups.isEmpty()) { |
153 | fail(URLFilterParser::MisplacedStartOfLine); |
154 | return; |
155 | } |
156 | |
157 | m_hasBeginningOfLineAssertion = true; |
158 | } |
159 | |
160 | void assertionEOL() |
161 | { |
162 | if (hasError()) |
163 | return; |
164 | |
165 | sinkFloatingTermIfNecessary(); |
166 | ASSERT(!m_floatingTerm.isValid()); |
167 | |
168 | m_floatingTerm = Term(Term::EndOfLineAssertionTerm); |
169 | } |
170 | |
171 | void assertionWordBoundary(bool) |
172 | { |
173 | fail(URLFilterParser::WordBoundary); |
174 | } |
175 | |
176 | void atomCharacterClassBegin(bool inverted = false) |
177 | { |
178 | if (hasError()) |
179 | return; |
180 | |
181 | sinkFloatingTermIfNecessary(); |
182 | ASSERT(!m_floatingTerm.isValid()); |
183 | |
184 | m_floatingTerm = Term(Term::CharacterSetTerm, inverted); |
185 | } |
186 | |
187 | void atomCharacterClassAtom(UChar character) |
188 | { |
189 | if (hasError()) |
190 | return; |
191 | |
192 | ASSERT(isASCII(character)); |
193 | |
194 | m_floatingTerm.addCharacter(character, m_patternIsCaseSensitive); |
195 | } |
196 | |
197 | void atomCharacterClassRange(UChar a, UChar b) |
198 | { |
199 | if (hasError()) |
200 | return; |
201 | |
202 | ASSERT(a); |
203 | ASSERT(b); |
204 | ASSERT(isASCII(a)); |
205 | ASSERT(isASCII(b)); |
206 | |
207 | for (unsigned i = a; i <= b; ++i) |
208 | m_floatingTerm.addCharacter(static_cast<UChar>(i), m_patternIsCaseSensitive); |
209 | } |
210 | |
211 | void atomCharacterClassEnd() |
212 | { |
213 | // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily. |
214 | } |
215 | |
216 | void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool) |
217 | { |
218 | fail(URLFilterParser::AtomCharacter); |
219 | } |
220 | |
221 | void atomParenthesesSubpatternBegin(bool = true, Optional<String> = WTF::nullopt) |
222 | { |
223 | if (hasError()) |
224 | return; |
225 | |
226 | sinkFloatingTermIfNecessary(); |
227 | |
228 | m_openGroups.append(Term(Term::GroupTerm)); |
229 | } |
230 | |
231 | void atomParentheticalAssertionBegin(bool = false) |
232 | { |
233 | fail(URLFilterParser::Group); |
234 | } |
235 | |
236 | void atomParenthesesEnd() |
237 | { |
238 | if (hasError()) |
239 | return; |
240 | |
241 | sinkFloatingTermIfNecessary(); |
242 | ASSERT(!m_floatingTerm.isValid()); |
243 | |
244 | m_floatingTerm = m_openGroups.takeLast(); |
245 | } |
246 | |
247 | void disjunction() |
248 | { |
249 | fail(URLFilterParser::Disjunction); |
250 | } |
251 | |
252 | private: |
253 | bool hasError() const |
254 | { |
255 | return m_parseStatus != URLFilterParser::Ok; |
256 | } |
257 | |
258 | void fail(URLFilterParser::ParseStatus reason) |
259 | { |
260 | if (hasError()) |
261 | return; |
262 | |
263 | m_parseStatus = reason; |
264 | } |
265 | |
266 | void sinkFloatingTermIfNecessary() |
267 | { |
268 | if (!m_floatingTerm.isValid()) |
269 | return; |
270 | |
271 | if (m_hasProcessedEndOfLineAssertion) { |
272 | fail(URLFilterParser::MisplacedEndOfLine); |
273 | m_floatingTerm = Term(); |
274 | return; |
275 | } |
276 | |
277 | if (m_floatingTerm.isEndOfLineAssertion()) |
278 | m_hasProcessedEndOfLineAssertion = true; |
279 | |
280 | if (!m_openGroups.isEmpty()) { |
281 | m_openGroups.last().extendGroupSubpattern(m_floatingTerm); |
282 | m_floatingTerm = Term(); |
283 | return; |
284 | } |
285 | |
286 | m_sunkTerms.append(m_floatingTerm); |
287 | m_floatingTerm = Term(); |
288 | } |
289 | |
290 | void simplifySunkTerms() |
291 | { |
292 | ASSERT(!m_floatingTerm.isValid()); |
293 | |
294 | if (m_sunkTerms.isEmpty()) |
295 | return; |
296 | |
297 | Term canonicalDotStar(Term::UniversalTransition); |
298 | canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore); |
299 | |
300 | // Replace every ".*"-like terms by our canonical version. Remove any duplicate ".*". |
301 | { |
302 | unsigned termIndex = 0; |
303 | bool isAfterDotStar = false; |
304 | while (termIndex < m_sunkTerms.size()) { |
305 | if (isAfterDotStar && m_sunkTerms[termIndex].isKnownToMatchAnyString()) { |
306 | m_sunkTerms.remove(termIndex); |
307 | continue; |
308 | } |
309 | isAfterDotStar = false; |
310 | |
311 | if (m_sunkTerms[termIndex].isKnownToMatchAnyString()) { |
312 | m_sunkTerms[termIndex] = canonicalDotStar; |
313 | isAfterDotStar = true; |
314 | } |
315 | ++termIndex; |
316 | } |
317 | } |
318 | |
319 | // Add our ".*" in front if needed. |
320 | if (!m_hasBeginningOfLineAssertion && !m_sunkTerms.first().isKnownToMatchAnyString()) |
321 | m_sunkTerms.insert(0, canonicalDotStar); |
322 | |
323 | // Remove trailing ".*$". |
324 | if (m_sunkTerms.size() > 2 && m_sunkTerms.last().isEndOfLineAssertion() && m_sunkTerms[m_sunkTerms.size() - 2].isKnownToMatchAnyString()) |
325 | m_sunkTerms.shrink(m_sunkTerms.size() - 2); |
326 | |
327 | // Remove irrelevant terms that can match empty. For example in "foob?", matching "b" is irrelevant. |
328 | if (m_sunkTerms.last().isEndOfLineAssertion()) |
329 | return; |
330 | while (!m_sunkTerms.isEmpty() && !m_sunkTerms.last().matchesAtLeastOneCharacter()) |
331 | m_sunkTerms.removeLast(); |
332 | } |
333 | |
334 | bool m_patternIsCaseSensitive; |
335 | |
336 | Deque<Term> m_openGroups; |
337 | Vector<Term> m_sunkTerms; |
338 | Term m_floatingTerm; |
339 | bool m_hasBeginningOfLineAssertion { false }; |
340 | bool m_hasProcessedEndOfLineAssertion { false }; |
341 | |
342 | URLFilterParser::ParseStatus m_parseStatus; |
343 | }; |
344 | |
345 | URLFilterParser::URLFilterParser(CombinedURLFilters& combinedURLFilters) |
346 | : m_combinedURLFilters(combinedURLFilters) |
347 | { |
348 | } |
349 | |
350 | URLFilterParser::~URLFilterParser() = default; |
351 | |
352 | URLFilterParser::ParseStatus URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId) |
353 | { |
354 | if (!pattern.isAllASCII()) |
355 | return NonASCII; |
356 | if (pattern.isEmpty()) |
357 | return EmptyPattern; |
358 | |
359 | ParseStatus status = Ok; |
360 | PatternParser patternParser(patternIsCaseSensitive); |
361 | if (!JSC::Yarr::hasError(JSC::Yarr::parse(patternParser, pattern, false, 0))) |
362 | patternParser.finalize(patternId, m_combinedURLFilters); |
363 | else |
364 | status = YarrError; |
365 | |
366 | if (status == Ok) |
367 | status = patternParser.parseStatus(); |
368 | |
369 | return status; |
370 | } |
371 | |
372 | String URLFilterParser::statusString(ParseStatus status) |
373 | { |
374 | switch (status) { |
375 | case Ok: |
376 | return "Ok" ; |
377 | case MatchesEverything: |
378 | return "Matches everything." ; |
379 | case NonASCII: |
380 | return "Only ASCII characters are supported in pattern." ; |
381 | case UnsupportedCharacterClass: |
382 | return "Character class is not supported." ; |
383 | case BackReference: |
384 | return "Patterns cannot contain backreferences." ; |
385 | case ForwardReference: |
386 | return "Patterns cannot contain forward references." ; |
387 | case MisplacedStartOfLine: |
388 | return "Start of line assertion can only appear as the first term in a filter." ; |
389 | case WordBoundary: |
390 | return "Word boundaries assertions are not supported yet." ; |
391 | case AtomCharacter: |
392 | return "Builtins character class atoms are not supported yet." ; |
393 | case Group: |
394 | return "Groups are not supported yet." ; |
395 | case Disjunction: |
396 | return "Disjunctions are not supported yet." ; |
397 | case MisplacedEndOfLine: |
398 | return "The end of line assertion must be the last term in an expression." ; |
399 | case EmptyPattern: |
400 | return "Empty pattern." ; |
401 | case YarrError: |
402 | return "Internal error in YARR." ; |
403 | case InvalidQuantifier: |
404 | return "Arbitrary atom repetitions are not supported." ; |
405 | } |
406 | |
407 | RELEASE_ASSERT_NOT_REACHED(); |
408 | } |
409 | |
410 | } // namespace ContentExtensions |
411 | } // namespace WebCore |
412 | |
413 | #endif // ENABLE(CONTENT_EXTENSIONS) |
414 | |