1/*
2 * Copyright (C) 2015 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "CombinedURLFilters.h"
28
29#if ENABLE(CONTENT_EXTENSIONS)
30
31#include "HashableActionList.h"
32#include "Term.h"
33#include <wtf/DataLog.h>
34#include <wtf/Vector.h>
35#include <wtf/text/CString.h>
36
37namespace WebCore {
38
39namespace ContentExtensions {
40
41struct PrefixTreeEdge {
42 const Term* term;
43 std::unique_ptr<PrefixTreeVertex> child;
44};
45
46typedef Vector<PrefixTreeEdge, 0, WTF::CrashOnOverflow, 1> PrefixTreeEdges;
47
48struct PrefixTreeVertex {
49 WTF_MAKE_STRUCT_FAST_ALLOCATED;
50
51 PrefixTreeEdges edges;
52};
53
54struct ReverseSuffixTreeVertex;
55struct ReverseSuffixTreeEdge {
56 const Term* term;
57 std::unique_ptr<ReverseSuffixTreeVertex> child;
58};
59typedef Vector<ReverseSuffixTreeEdge, 0, WTF::CrashOnOverflow, 1> ReverseSuffixTreeEdges;
60
61struct ReverseSuffixTreeVertex {
62 ReverseSuffixTreeEdges edges;
63 uint32_t nodeId;
64};
65typedef HashMap<HashableActionList, ReverseSuffixTreeVertex, HashableActionListHash, HashableActionListHashTraits> ReverseSuffixTreeRoots;
66
67#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
68static size_t recursiveMemoryUsed(const PrefixTreeVertex& vertex)
69{
70 size_t size = sizeof(PrefixTreeVertex)
71 + vertex.edges.capacity() * sizeof(PrefixTreeEdge);
72 for (const auto& edge : vertex.edges) {
73 ASSERT(edge.child);
74 size += recursiveMemoryUsed(*edge.child.get());
75 }
76 return size;
77}
78
79size_t CombinedURLFilters::memoryUsed() const
80{
81 ASSERT(m_prefixTreeRoot);
82
83 size_t actionsSize = 0;
84 for (const auto& slot : m_actions)
85 actionsSize += slot.value.capacity() * sizeof(uint64_t);
86
87 return sizeof(CombinedURLFilters)
88 + m_alphabet.memoryUsed()
89 + recursiveMemoryUsed(*m_prefixTreeRoot.get())
90 + sizeof(HashMap<PrefixTreeVertex*, ActionList>)
91 + m_actions.capacity() * (sizeof(PrefixTreeVertex*) + sizeof(ActionList))
92 + actionsSize;
93}
94#endif
95
96#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
97static String prefixTreeVertexToString(const PrefixTreeVertex& vertex, const HashMap<const PrefixTreeVertex*, ActionList>& actions, unsigned depth)
98{
99 StringBuilder builder;
100 while (depth--)
101 builder.appendLiteral(" ");
102 builder.appendLiteral("vertex actions: ");
103
104 auto actionsSlot = actions.find(&vertex);
105 if (actionsSlot != actions.end()) {
106 for (uint64_t action : actionsSlot->value) {
107 builder.appendNumber(action);
108 builder.append(',');
109 }
110 }
111 builder.append('\n');
112 return builder.toString();
113}
114
115static void recursivePrint(const PrefixTreeVertex& vertex, const HashMap<const PrefixTreeVertex*, ActionList>& actions, unsigned depth)
116{
117 dataLogF("%s", prefixTreeVertexToString(vertex, actions, depth).utf8().data());
118 for (const auto& edge : vertex.edges) {
119 StringBuilder builder;
120 for (unsigned i = 0; i < depth * 2; ++i)
121 builder.append(' ');
122 builder.appendLiteral("vertex edge: ");
123 builder.append(edge.term->toString());
124 builder.append('\n');
125 dataLogF("%s", builder.toString().utf8().data());
126 ASSERT(edge.child);
127 recursivePrint(*edge.child.get(), actions, depth + 1);
128 }
129}
130
131void CombinedURLFilters::print() const
132{
133 recursivePrint(*m_prefixTreeRoot.get(), m_actions, 0);
134}
135#endif
136
137CombinedURLFilters::CombinedURLFilters()
138 : m_prefixTreeRoot(std::make_unique<PrefixTreeVertex>())
139{
140}
141
142CombinedURLFilters::~CombinedURLFilters() = default;
143
144bool CombinedURLFilters::isEmpty() const
145{
146 return m_prefixTreeRoot->edges.isEmpty();
147}
148
149void CombinedURLFilters::addDomain(uint64_t actionId, const String& domain)
150{
151 unsigned domainLength = domain.length();
152 if (domainLength && domain[0] == '*') {
153 // If domain starts with a '*' then it means match domain and its subdomains, like (^|.)domain$
154 // This way a domain of "*webkit.org" will match "bugs.webkit.org" and "webkit.org".
155 Vector<Term> prependDot;
156 Vector<Term> prependBeginningOfLine;
157 prependDot.reserveInitialCapacity(domainLength + 2);
158 prependBeginningOfLine.reserveInitialCapacity(domainLength); // This is just no .* at the beginning.
159
160 Term canonicalDotStar(Term::UniversalTransition);
161 canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore);
162 prependDot.uncheckedAppend(canonicalDotStar);
163 prependDot.uncheckedAppend(Term('.', true));
164
165 for (unsigned i = 1; i < domainLength; i++) {
166 ASSERT(isASCII(domain[i]));
167 ASSERT(!isASCIIUpper(domain[i]));
168 prependDot.uncheckedAppend(Term(domain[i], true));
169 prependBeginningOfLine.uncheckedAppend(Term(domain[i], true));
170 }
171 prependDot.uncheckedAppend(Term::EndOfLineAssertionTerm);
172 prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm);
173
174 addPattern(actionId, prependDot);
175 addPattern(actionId, prependBeginningOfLine);
176 } else {
177 // This is like adding ^domain$, but interpreting domain as a series of characters, not a regular expression.
178 // "webkit.org" will match "webkit.org" but not "bugs.webkit.org".
179 Vector<Term> prependBeginningOfLine;
180 prependBeginningOfLine.reserveInitialCapacity(domainLength + 1); // This is just no .* at the beginning.
181 for (unsigned i = 0; i < domainLength; i++) {
182 ASSERT(isASCII(domain[i]));
183 ASSERT(!isASCIIUpper(domain[i]));
184 prependBeginningOfLine.uncheckedAppend(Term(domain[i], true));
185 }
186 prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm);
187 addPattern(actionId, prependBeginningOfLine);
188 }
189}
190
191void CombinedURLFilters::addPattern(uint64_t actionId, const Vector<Term>& pattern)
192{
193 ASSERT_WITH_MESSAGE(!pattern.isEmpty(), "The parser should have excluded empty patterns before reaching CombinedURLFilters.");
194
195 if (pattern.isEmpty())
196 return;
197
198 // Extend the prefix tree with the new pattern.
199 PrefixTreeVertex* lastPrefixTree = m_prefixTreeRoot.get();
200
201 for (const Term& term : pattern) {
202 size_t nextEntryIndex = WTF::notFound;
203 for (size_t i = 0; i < lastPrefixTree->edges.size(); ++i) {
204 if (*lastPrefixTree->edges[i].term == term) {
205 nextEntryIndex = i;
206 break;
207 }
208 }
209 if (nextEntryIndex != WTF::notFound)
210 lastPrefixTree = lastPrefixTree->edges[nextEntryIndex].child.get();
211 else {
212 lastPrefixTree->edges.append(PrefixTreeEdge({m_alphabet.interned(term), std::make_unique<PrefixTreeVertex>()}));
213 lastPrefixTree = lastPrefixTree->edges.last().child.get();
214 }
215 }
216
217 auto addResult = m_actions.add(lastPrefixTree, ActionList());
218 ActionList& actions = addResult.iterator->value;
219 if (actions.find(actionId) == WTF::notFound)
220 actions.append(actionId);
221}
222
223struct ActiveSubtree {
224 ActiveSubtree(PrefixTreeVertex& vertex, ImmutableCharNFANodeBuilder&& nfaNode, unsigned edgeIndex)
225 : vertex(vertex)
226 , nfaNode(WTFMove(nfaNode))
227 , edgeIndex(edgeIndex)
228 {
229 }
230 PrefixTreeVertex& vertex;
231 ImmutableCharNFANodeBuilder nfaNode;
232 unsigned edgeIndex;
233};
234
235static void generateInfixUnsuitableForReverseSuffixTree(NFA& nfa, Vector<ActiveSubtree>& stack, const HashMap<const PrefixTreeVertex*, ActionList>& actions)
236{
237 // To avoid conflicts, we use the reverse suffix tree for subtrees that do not merge
238 // in the prefix tree.
239 //
240 // We only unify the suffixes to the actions on the leaf.
241 // If there are actions inside the tree, we generate the part of the subtree up to the action.
242 //
243 // If we accidentally insert a node with action inside the reverse-suffix-tree, we would create
244 // new actions on unrelated pattern when unifying their suffixes.
245 for (unsigned i = stack.size() - 1; i--;) {
246 ActiveSubtree& activeSubtree = stack[i];
247 if (activeSubtree.nfaNode.isValid())
248 return;
249
250 RELEASE_ASSERT_WITH_MESSAGE(i > 0, "The bottom of the stack must be the root of our fixed-length subtree. It should have it the isValid() case above.");
251
252 auto actionsIterator = actions.find(&activeSubtree.vertex);
253 bool hasActionInsideTree = actionsIterator != actions.end();
254
255 // Stricto sensu, we should count the number of exit edges with fixed length.
256 // That is costly and unlikely to matter in practice.
257 bool hasSingleOutcome = activeSubtree.vertex.edges.size() == 1;
258
259 if (hasActionInsideTree || !hasSingleOutcome) {
260 // Go back to the end of the subtree that has already been generated.
261 // From there, generate everything up to the vertex we found.
262 unsigned end = i;
263 unsigned beginning = end;
264
265 ActiveSubtree* sourceActiveSubtree = nullptr;
266 while (beginning--) {
267 ActiveSubtree& activeSubtree = stack[beginning];
268 if (activeSubtree.nfaNode.isValid()) {
269 sourceActiveSubtree = &activeSubtree;
270 break;
271 }
272 }
273 ASSERT_WITH_MESSAGE(sourceActiveSubtree, "The root should always have a valid generator.");
274
275 for (unsigned stackIndex = beginning + 1; stackIndex <= end; ++stackIndex) {
276 ImmutableCharNFANodeBuilder& sourceNode = sourceActiveSubtree->nfaNode;
277 ASSERT(sourceNode.isValid());
278 auto& edge = sourceActiveSubtree->vertex.edges[sourceActiveSubtree->edgeIndex];
279
280 ActiveSubtree& destinationActiveSubtree = stack[stackIndex];
281 destinationActiveSubtree.nfaNode = edge.term->generateGraph(nfa, sourceNode, actions.get(&destinationActiveSubtree.vertex));
282
283 sourceActiveSubtree = &destinationActiveSubtree;
284 }
285
286 return;
287 }
288 }
289}
290
291static void generateSuffixWithReverseSuffixTree(NFA& nfa, Vector<ActiveSubtree>& stack, const HashMap<const PrefixTreeVertex*, ActionList>& actions, ReverseSuffixTreeRoots& reverseSuffixTreeRoots)
292{
293 ActiveSubtree& leafSubtree = stack.last();
294 ASSERT_WITH_MESSAGE(!leafSubtree.nfaNode.isValid(), "The leaf should never be generated by the code above, it should always be inserted into the prefix tree.");
295
296 ActionList actionList = actions.get(&leafSubtree.vertex);
297 ASSERT_WITH_MESSAGE(!actionList.isEmpty(), "Our prefix tree should always have actions on the leaves by construction.");
298
299 HashableActionList hashableActionList(actionList);
300 auto rootAddResult = reverseSuffixTreeRoots.add(hashableActionList, ReverseSuffixTreeVertex());
301 if (rootAddResult.isNewEntry) {
302 ImmutableCharNFANodeBuilder newNode(nfa);
303 newNode.setActions(actionList.begin(), actionList.end());
304 rootAddResult.iterator->value.nodeId = newNode.nodeId();
305 }
306
307 ReverseSuffixTreeVertex* activeReverseSuffixTreeVertex = &rootAddResult.iterator->value;
308 uint32_t destinationNodeId = rootAddResult.iterator->value.nodeId;
309
310 unsigned stackPosition = stack.size() - 2;
311 while (true) {
312 ActiveSubtree& source = stack[stackPosition];
313 auto& edge = source.vertex.edges[source.edgeIndex];
314
315 // This is the end condition: when we meet a node that has already been generated,
316 // we just need to connect our backward tree to the forward tree.
317 //
318 // We *must not* add this last node to the reverse-suffix tree. That node can have
319 // transitions back to earlier part of the prefix tree. If the prefix tree "caches"
320 // such node, it would create new transitions that did not exist in the source language.
321 if (source.nfaNode.isValid()) {
322 stack.shrink(stackPosition + 1);
323 edge.term->generateGraph(nfa, source.nfaNode, destinationNodeId);
324 return;
325 }
326 --stackPosition;
327
328 ASSERT_WITH_MESSAGE(!actions.contains(&source.vertex), "Any node with final actions should have been created before hitting the reverse suffix-tree.");
329
330 ReverseSuffixTreeEdge* existingEdge = nullptr;
331 for (ReverseSuffixTreeEdge& potentialExistingEdge : activeReverseSuffixTreeVertex->edges) {
332 if (edge.term == potentialExistingEdge.term) {
333 existingEdge = &potentialExistingEdge;
334 break;
335 }
336 }
337
338 if (existingEdge)
339 activeReverseSuffixTreeVertex = existingEdge->child.get();
340 else {
341 ImmutableCharNFANodeBuilder newNode(nfa);
342 edge.term->generateGraph(nfa, newNode, destinationNodeId);
343 std::unique_ptr<ReverseSuffixTreeVertex> newVertex(new ReverseSuffixTreeVertex());
344 newVertex->nodeId = newNode.nodeId();
345
346 ReverseSuffixTreeVertex* newVertexAddress = newVertex.get();
347 activeReverseSuffixTreeVertex->edges.append(ReverseSuffixTreeEdge({ edge.term, WTFMove(newVertex) }));
348 activeReverseSuffixTreeVertex = newVertexAddress;
349 }
350 destinationNodeId = activeReverseSuffixTreeVertex->nodeId;
351
352 ASSERT(source.vertex.edges.size() == 1);
353 source.vertex.edges.clear();
354 }
355
356 RELEASE_ASSERT_NOT_REACHED();
357}
358
359static void clearReverseSuffixTree(ReverseSuffixTreeRoots& reverseSuffixTreeRoots)
360{
361 // We cannot rely on the destructor being called in order from top to bottom as we may overflow
362 // the stack. Instead, we go depth first in the reverse-suffix-tree.
363
364 for (auto& slot : reverseSuffixTreeRoots) {
365 Vector<ReverseSuffixTreeVertex*, 128> stack;
366 stack.append(&slot.value);
367
368 while (true) {
369 ReverseSuffixTreeVertex* top = stack.last();
370 if (top->edges.isEmpty()) {
371 stack.removeLast();
372 if (stack.isEmpty())
373 break;
374 stack.last()->edges.removeLast();
375 } else
376 stack.append(top->edges.last().child.get());
377 }
378 }
379 reverseSuffixTreeRoots.clear();
380}
381
382static void generateNFAForSubtree(NFA& nfa, ImmutableCharNFANodeBuilder&& subtreeRoot, PrefixTreeVertex& root, const HashMap<const PrefixTreeVertex*, ActionList>& actions, size_t maxNFASize)
383{
384 // This recurses the subtree of the prefix tree.
385 // For each edge that has fixed length (no quantifiers like ?, *, or +) it generates the nfa graph,
386 // recurses into children, and deletes any processed leaf nodes.
387
388 ReverseSuffixTreeRoots reverseSuffixTreeRoots;
389 Vector<ActiveSubtree> stack;
390 if (!root.edges.isEmpty())
391 stack.append(ActiveSubtree(root, WTFMove(subtreeRoot), 0));
392
393 bool nfaTooBig = false;
394
395 // Generate graphs for each subtree that does not contain any quantifiers.
396 while (!stack.isEmpty()) {
397 PrefixTreeVertex& vertex = stack.last().vertex;
398 const unsigned edgeIndex = stack.last().edgeIndex;
399
400 if (edgeIndex < vertex.edges.size()) {
401 auto& edge = vertex.edges[edgeIndex];
402
403 // Clean up any processed leaves and return early if we are past the maxNFASize.
404 if (nfaTooBig) {
405 stack.last().edgeIndex = stack.last().vertex.edges.size();
406 continue;
407 }
408
409 // Quantified edges in the subtree will be a part of another NFA.
410 if (!edge.term->hasFixedLength()) {
411 stack.last().edgeIndex++;
412 continue;
413 }
414
415 ASSERT(edge.child.get());
416 ImmutableCharNFANodeBuilder emptyBuilder;
417 stack.append(ActiveSubtree(*edge.child.get(), WTFMove(emptyBuilder), 0));
418 } else {
419 bool isLeaf = vertex.edges.isEmpty();
420
421 ASSERT(edgeIndex == vertex.edges.size());
422 vertex.edges.removeAllMatching([](PrefixTreeEdge& edge)
423 {
424 return !edge.term;
425 });
426
427 if (isLeaf) {
428 generateInfixUnsuitableForReverseSuffixTree(nfa, stack, actions);
429 generateSuffixWithReverseSuffixTree(nfa, stack, actions, reverseSuffixTreeRoots);
430
431 // Only stop generating an NFA at a leaf to ensure we have a correct NFA. We could go slightly over the maxNFASize.
432 if (nfa.nodes.size() > maxNFASize)
433 nfaTooBig = true;
434 } else
435 stack.removeLast();
436
437 if (!stack.isEmpty()) {
438 auto& activeSubtree = stack.last();
439 auto& edge = activeSubtree.vertex.edges[stack.last().edgeIndex];
440 if (edge.child->edges.isEmpty())
441 edge.term = nullptr; // Mark this leaf for deleting.
442 activeSubtree.edgeIndex++;
443 }
444 }
445 }
446 clearReverseSuffixTree(reverseSuffixTreeRoots);
447}
448
449void CombinedURLFilters::processNFAs(size_t maxNFASize, const WTF::Function<void(NFA&&)>& handler)
450{
451#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
452 print();
453#endif
454 while (true) {
455 // Traverse out to a leaf.
456 Vector<PrefixTreeVertex*, 128> stack;
457 PrefixTreeVertex* vertex = m_prefixTreeRoot.get();
458 while (true) {
459 ASSERT(vertex);
460 stack.append(vertex);
461 if (vertex->edges.isEmpty())
462 break;
463 vertex = vertex->edges.last().child.get();
464 }
465 if (stack.size() == 1)
466 break; // We're done once we have processed and removed all the edges in the prefix tree.
467
468 // Find the prefix root for this NFA. This is the vertex after the last term with a quantifier if there is one,
469 // or the root if there are no quantifiers left.
470 while (stack.size() > 1) {
471 if (!stack[stack.size() - 2]->edges.last().term->hasFixedLength())
472 break;
473 stack.removeLast();
474 }
475 ASSERT_WITH_MESSAGE(!stack.isEmpty(), "At least the root should be in the stack");
476
477 // Make an NFA with the subtrees for whom this is also the last quantifier (or who also have no quantifier).
478 NFA nfa;
479 {
480 // Put the prefix into the NFA.
481 ImmutableCharNFANodeBuilder lastNode(nfa);
482 for (unsigned i = 0; i < stack.size() - 1; ++i) {
483 const PrefixTreeEdge& edge = stack[i]->edges.last();
484 ImmutableCharNFANodeBuilder newNode = edge.term->generateGraph(nfa, lastNode, m_actions.get(edge.child.get()));
485 lastNode = WTFMove(newNode);
486 }
487
488 // Put the non-quantified vertices in the subtree into the NFA and delete them.
489 ASSERT(stack.last());
490 generateNFAForSubtree(nfa, WTFMove(lastNode), *stack.last(), m_actions, maxNFASize);
491 }
492 nfa.finalize();
493
494 handler(WTFMove(nfa));
495
496 // Clean up any processed leaf nodes.
497 while (true) {
498 if (stack.size() > 1) {
499 if (stack[stack.size() - 1]->edges.isEmpty()) {
500 stack[stack.size() - 2]->edges.removeLast();
501 stack.removeLast();
502 } else
503 break; // Vertex is not a leaf.
504 } else
505 break; // Leave the empty root.
506 }
507 }
508}
509
510} // namespace ContentExtensions
511} // namespace WebCore
512
513#endif // ENABLE(CONTENT_EXTENSIONS)
514