1 | /* |
2 | * Copyright (C) 2015 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
15 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
17 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
18 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
19 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
20 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
21 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
22 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
23 | * THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "DFAMinimizer.h" |
28 | |
29 | #if ENABLE(CONTENT_EXTENSIONS) |
30 | |
31 | #include "DFA.h" |
32 | #include "DFANode.h" |
33 | #include "MutableRangeList.h" |
34 | #include <wtf/HashMap.h> |
35 | #include <wtf/Hasher.h> |
36 | #include <wtf/Vector.h> |
37 | |
38 | namespace WebCore { |
39 | namespace ContentExtensions { |
40 | |
41 | namespace { |
42 | |
// Invoke intersectionHandler(singularRangeIndex, transitionIterator) once for every
// singular range that overlaps a range of iterableTransitionList.
//
// singularTransitionsFirsts holds the sorted first-characters of a list of disjoint
// "singular" ranges. The ranges in iterableTransitionList must be aligned on those
// boundaries: each of their first characters appears verbatim in
// singularTransitionsFirsts. (This holds by construction in FullGraphPartition,
// since the singular range list is extended with these very transitions —
// TODO confirm for any new caller.)
template<typename VectorType, typename Iterable, typename Function>
static inline void iterateIntersections(const VectorType& singularTransitionsFirsts, const Iterable& iterableTransitionList, const Function& intersectionHandler)
{
    ASSERT(!singularTransitionsFirsts.isEmpty());
    auto otherIterator = iterableTransitionList.begin();
    auto otherEnd = iterableTransitionList.end();

    if (otherIterator == otherEnd)
        return;

    unsigned singularTransitionsLength = singularTransitionsFirsts.size();
    unsigned singularTransitionsFirstsIndex = 0;
    for (; otherIterator != otherEnd; ++otherIterator) {
        auto firstCharacter = otherIterator.first();
        // Skip the singular ranges that lie entirely before this transition.
        // The index never resets: both lists are sorted, so this is a single
        // forward pass over the singular ranges.
        while (singularTransitionsFirstsIndex < singularTransitionsLength
            && singularTransitionsFirsts[singularTransitionsFirstsIndex] != firstCharacter)
            ++singularTransitionsFirstsIndex;

        // The singular range starting exactly at firstCharacter always intersects.
        intersectionHandler(singularTransitionsFirstsIndex, otherIterator);
        ++singularTransitionsFirstsIndex;

        // Every subsequent singular range that still starts at or before this
        // transition's last character is also covered by it.
        auto lastCharacter = otherIterator.last();
        while (singularTransitionsFirstsIndex < singularTransitionsLength
            && singularTransitionsFirsts[singularTransitionsFirstsIndex] <= lastCharacter) {
            intersectionHandler(singularTransitionsFirstsIndex, otherIterator);
            ++singularTransitionsFirstsIndex;
        }
    }
}
72 | |
73 | class Partition { |
74 | public: |
75 | void initialize(unsigned size) |
76 | { |
77 | if (!size) |
78 | return; |
79 | |
80 | m_sets.reserveInitialCapacity(size); |
81 | m_partitionedElements.resize(size); |
82 | m_elementPositionInPartitionedNodes.resize(size); |
83 | m_elementToSetMap.resize(size); |
84 | |
85 | for (unsigned i = 0; i < size; ++i) { |
86 | m_partitionedElements[i] = i; |
87 | m_elementPositionInPartitionedNodes[i] = i; |
88 | m_elementToSetMap[i] = 0; |
89 | } |
90 | m_sets.uncheckedAppend(SetDescriptor { 0, size, 0 }); |
91 | } |
92 | |
93 | void reserveUninitializedCapacity(unsigned elementCount) |
94 | { |
95 | m_partitionedElements.resize(elementCount); |
96 | m_elementPositionInPartitionedNodes.resize(elementCount); |
97 | m_elementToSetMap.resize(elementCount); |
98 | } |
99 | |
100 | void addSetUnchecked(unsigned start, unsigned size) |
101 | { |
102 | m_sets.append(SetDescriptor { start, size, 0 }); |
103 | } |
104 | |
105 | void setElementUnchecked(unsigned elementIndex, unsigned positionInPartition, unsigned setIndex) |
106 | { |
107 | ASSERT(setIndex < m_sets.size()); |
108 | m_partitionedElements[positionInPartition] = elementIndex; |
109 | m_elementPositionInPartitionedNodes[elementIndex] = positionInPartition; |
110 | m_elementToSetMap[elementIndex] = setIndex; |
111 | } |
112 | |
113 | unsigned startOffsetOfSet(unsigned setIndex) const |
114 | { |
115 | return m_sets[setIndex].start; |
116 | } |
117 | |
118 | ALWAYS_INLINE void markElementInCurrentGeneration(unsigned elementIndex) |
119 | { |
120 | // Swap the node with the first unmarked node. |
121 | unsigned setIndex = m_elementToSetMap[elementIndex]; |
122 | SetDescriptor& setDescriptor = m_sets[setIndex]; |
123 | |
124 | unsigned elementPositionInPartition = m_elementPositionInPartitionedNodes[elementIndex]; |
125 | ASSERT(elementPositionInPartition >= setDescriptor.start); |
126 | ASSERT(elementPositionInPartition < setDescriptor.end()); |
127 | |
128 | unsigned firstUnmarkedElementPositionInPartition = setDescriptor.indexAfterMarkedElements(); |
129 | ASSERT(firstUnmarkedElementPositionInPartition >= setDescriptor.start && firstUnmarkedElementPositionInPartition < setDescriptor.end()); |
130 | ASSERT(firstUnmarkedElementPositionInPartition >= firstUnmarkedElementPositionInPartition); |
131 | |
132 | // Swap the nodes in the set. |
133 | unsigned firstUnmarkedElement = m_partitionedElements[firstUnmarkedElementPositionInPartition]; |
134 | m_partitionedElements[firstUnmarkedElementPositionInPartition] = elementIndex; |
135 | m_partitionedElements[elementPositionInPartition] = firstUnmarkedElement; |
136 | |
137 | // Update their index. |
138 | m_elementPositionInPartitionedNodes[elementIndex] = firstUnmarkedElementPositionInPartition; |
139 | m_elementPositionInPartitionedNodes[firstUnmarkedElement] = elementPositionInPartition; |
140 | |
141 | if (!setDescriptor.markedCount) { |
142 | ASSERT(!m_setsMarkedInCurrentGeneration.contains(setIndex)); |
143 | m_setsMarkedInCurrentGeneration.append(setIndex); |
144 | } |
145 | ++setDescriptor.markedCount; |
146 | } |
147 | |
148 | // The function passed as argument MUST not modify the partition. |
149 | template<typename Function> |
150 | void refineGeneration(const Function& function) |
151 | { |
152 | for (unsigned setIndex : m_setsMarkedInCurrentGeneration) { |
153 | SetDescriptor& setDescriptor = m_sets[setIndex]; |
154 | if (setDescriptor.markedCount == setDescriptor.size) { |
155 | // Everything is marked, there is nothing to refine. |
156 | setDescriptor.markedCount = 0; |
157 | continue; |
158 | } |
159 | |
160 | SetDescriptor newSet; |
161 | bool newSetIsMarkedSet = setDescriptor.markedCount * 2 <= setDescriptor.size; |
162 | if (newSetIsMarkedSet) { |
163 | // Less than half of the nodes have been marked. |
164 | newSet = { setDescriptor.start, setDescriptor.markedCount, 0 }; |
165 | setDescriptor.start = setDescriptor.start + setDescriptor.markedCount; |
166 | } else |
167 | newSet = { setDescriptor.start + setDescriptor.markedCount, setDescriptor.size - setDescriptor.markedCount, 0 }; |
168 | setDescriptor.size -= newSet.size; |
169 | setDescriptor.markedCount = 0; |
170 | |
171 | unsigned newSetIndex = m_sets.size(); |
172 | m_sets.append(newSet); |
173 | |
174 | for (unsigned i = newSet.start; i < newSet.end(); ++i) |
175 | m_elementToSetMap[m_partitionedElements[i]] = newSetIndex; |
176 | |
177 | function(newSetIndex); |
178 | } |
179 | m_setsMarkedInCurrentGeneration.clear(); |
180 | } |
181 | |
182 | // Call Function() on every node of a given subset. |
183 | template<typename Function> |
184 | void iterateSet(unsigned setIndex, const Function& function) |
185 | { |
186 | SetDescriptor& setDescriptor = m_sets[setIndex]; |
187 | for (unsigned i = setDescriptor.start; i < setDescriptor.end(); ++i) |
188 | function(m_partitionedElements[i]); |
189 | } |
190 | |
191 | // Index of the set containing the Node. |
192 | unsigned setIndex(unsigned elementIndex) const |
193 | { |
194 | return m_elementToSetMap[elementIndex]; |
195 | } |
196 | |
197 | // NodeIndex of the first element in the set. |
198 | unsigned firstElementInSet(unsigned setIndex) const |
199 | { |
200 | return m_partitionedElements[m_sets[setIndex].start]; |
201 | } |
202 | |
203 | unsigned size() const |
204 | { |
205 | return m_sets.size(); |
206 | } |
207 | |
208 | private: |
209 | struct SetDescriptor { |
210 | unsigned start; |
211 | unsigned size; |
212 | unsigned markedCount; |
213 | |
214 | unsigned indexAfterMarkedElements() const { return start + markedCount; } |
215 | unsigned end() const { return start + size; } |
216 | }; |
217 | |
218 | // List of sets. |
219 | Vector<SetDescriptor, 0, ContentExtensionsOverflowHandler> m_sets; |
220 | |
221 | // All the element indices such that two elements of the same set never have a element of a different set between them. |
222 | Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_partitionedElements; |
223 | |
224 | // Map elementIndex->position in the partitionedElements. |
225 | Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_elementPositionInPartitionedNodes; |
226 | |
227 | // Map elementIndex->SetIndex. |
228 | Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_elementToSetMap; |
229 | |
230 | // List of sets with any marked node. Each set can appear at most once. |
231 | // FIXME: find a good inline size for this. |
232 | Vector<unsigned, 128, ContentExtensionsOverflowHandler> m_setsMarkedInCurrentGeneration; |
233 | }; |
234 | |
// Pairs a partition of the DFA nodes with a partition of all the DFA transitions.
// The transitions are flattened into one array, bucketed by target node
// (m_flattenedTransitions plus per-node start/size tables), and simultaneously
// partitioned by "symbol" (singular character range) in m_transitionPartition.
class FullGraphPartition {
    typedef MutableRangeList<char, uint32_t, 128> SingularTransitionsMutableRangeList;
public:
    FullGraphPartition(const DFA& dfa)
    {
        m_nodePartition.initialize(dfa.nodes.size());

        // Build the singular range list: disjoint character ranges such that every
        // node transition is a union of them. With CounterConverter, each range's
        // data counts how many node transitions overlap that range.
        SingularTransitionsMutableRangeList singularTransitions;
        CounterConverter counterConverter;
        for (const DFANode& node : dfa.nodes) {
            if (node.isKilled())
                continue;
            auto transitions = node.transitions(dfa);
            singularTransitions.extend(transitions.begin(), transitions.end(), counterConverter);
        }

        // Count the number of transition for each singular range. This will give us the bucket size
        // for the transition partition, where transitions are partitioned by "symbol".
        unsigned rangeIndexAccumulator = 0;
        for (const auto& transition : singularTransitions) {
            m_transitionPartition.addSetUnchecked(rangeIndexAccumulator, transition.data);
            rangeIndexAccumulator += transition.data;
        }

        // Count the number of incoming transitions per node.
        m_flattenedTransitionsStartOffsetPerNode.resize(dfa.nodes.size());
        memset(m_flattenedTransitionsStartOffsetPerNode.data(), 0, m_flattenedTransitionsStartOffsetPerNode.size() * sizeof(unsigned));

        // Cache the first character of every singular range so iterateIntersections()
        // can align node transitions against them.
        Vector<char, 0, ContentExtensionsOverflowHandler> singularTransitionsFirsts;
        singularTransitionsFirsts.reserveInitialCapacity(singularTransitions.m_ranges.size());
        for (const auto& transition : singularTransitions)
            singularTransitionsFirsts.uncheckedAppend(transition.first);

        // First pass: count incoming transitions per target node (temporarily
        // stored in the start-offset table).
        for (const DFANode& node : dfa.nodes) {
            if (node.isKilled())
                continue;
            auto transitions = node.transitions(dfa);
            iterateIntersections(singularTransitionsFirsts, transitions, [&](unsigned, const DFANode::ConstRangeIterator& origin) {
                uint32_t targetNodeIndex = origin.target();
                ++m_flattenedTransitionsStartOffsetPerNode[targetNodeIndex];
            });
        }

        // Accumulate the offsets. This gives us the start position of each bucket.
        unsigned transitionAccumulator = 0;
        for (unsigned i = 0; i < m_flattenedTransitionsStartOffsetPerNode.size(); ++i) {
            unsigned transitionsCountForNode = m_flattenedTransitionsStartOffsetPerNode[i];
            m_flattenedTransitionsStartOffsetPerNode[i] = transitionAccumulator;
            transitionAccumulator += transitionsCountForNode;
        }
        unsigned flattenedTransitionsSize = transitionAccumulator;
        ASSERT_WITH_MESSAGE(flattenedTransitionsSize == rangeIndexAccumulator, "The number of transitions should be the same, regardless of how they are arranged in buckets." );

        m_transitionPartition.reserveUninitializedCapacity(flattenedTransitionsSize);

        // Next, let's fill the transition table and set up the size of each group at the same time.
        m_flattenedTransitionsSizePerNode.resize(dfa.nodes.size());
        for (unsigned& counter : m_flattenedTransitionsSizePerNode)
            counter = 0;
        m_flattenedTransitions.resize(flattenedTransitionsSize);

        // Per singular range, the next free slot inside its bucket of the
        // transition partition.
        Vector<uint32_t> transitionPerRangeOffset(m_transitionPartition.size());
        memset(transitionPerRangeOffset.data(), 0, transitionPerRangeOffset.size() * sizeof(uint32_t));

        for (unsigned i = 0; i < dfa.nodes.size(); ++i) {
            const DFANode& node = dfa.nodes[i];
            if (node.isKilled())
                continue;

            auto transitions = node.transitions(dfa);
            iterateIntersections(singularTransitionsFirsts, transitions, [&](unsigned singularTransitonIndex, const DFANode::ConstRangeIterator& origin) {
                uint32_t targetNodeIndex = origin.target();

                // Position of this transition in its target node's bucket.
                unsigned start = m_flattenedTransitionsStartOffsetPerNode[targetNodeIndex];
                unsigned offset = m_flattenedTransitionsSizePerNode[targetNodeIndex];
                unsigned positionInFlattenedTransitions = start + offset;
                m_flattenedTransitions[positionInFlattenedTransitions] = Transition({ i });

                // Position of the same transition inside its symbol set of the
                // transition partition.
                uint32_t& inRangeOffset = transitionPerRangeOffset[singularTransitonIndex];
                unsigned positionInTransitionPartition = m_transitionPartition.startOffsetOfSet(singularTransitonIndex) + inRangeOffset;
                ++inRangeOffset;

                m_transitionPartition.setElementUnchecked(positionInFlattenedTransitions, positionInTransitionPartition, singularTransitonIndex);

                ++m_flattenedTransitionsSizePerNode[targetNodeIndex];
            });
        }
    }

    // Mark a node as belonging to the current splitter.
    void markNode(unsigned nodeIndex)
    {
        m_nodePartition.markElementInCurrentGeneration(nodeIndex);
    }

    // Split every partially-marked node set; for each newly created node set, mark
    // and split the transitions leading into its nodes.
    void refinePartitions()
    {
        m_nodePartition.refineGeneration([&](unsigned smallestSetIndex) {
            m_nodePartition.iterateSet(smallestSetIndex, [&](unsigned nodeIndex) {
                unsigned incomingTransitionsStartForNode = m_flattenedTransitionsStartOffsetPerNode[nodeIndex];
                unsigned incomingTransitionsSizeForNode = m_flattenedTransitionsSizePerNode[nodeIndex];

                for (unsigned i = 0; i < incomingTransitionsSizeForNode; ++i)
                    m_transitionPartition.markElementInCurrentGeneration(incomingTransitionsStartForNode + i);
            });

            // We only need to split the transitions, we handle the new sets through the main loop.
            m_transitionPartition.refineGeneration([](unsigned) { });
        });
    }

    // Use every transition set as a splitter, including the sets created along the
    // way, until no unprocessed transition set remains.
    void splitByUniqueTransitions()
    {
        for (; m_nextTransitionSetToProcess < m_transitionPartition.size(); ++m_nextTransitionSetToProcess) {
            // We use the known splitters to refine the set.
            m_transitionPartition.iterateSet(m_nextTransitionSetToProcess, [&](unsigned transitionIndex) {
                unsigned sourceNodeIndex = m_flattenedTransitions[transitionIndex].source;
                m_nodePartition.markElementInCurrentGeneration(sourceNodeIndex);
            });

            refinePartitions();
        }
    }

    // Canonical representative of nodeIndex: the first node of its final set.
    unsigned nodeReplacement(unsigned nodeIndex)
    {
        unsigned setIndex = m_nodePartition.setIndex(nodeIndex);
        return m_nodePartition.firstElementInSet(setIndex);
    }

private:
    // A flattened transition only stores its source node: the target is implied by
    // the bucket it lives in, the symbol by its set in m_transitionPartition.
    struct Transition {
        unsigned source;
    };

    // Makes MutableRangeList store, per singular range, a count of overlapping
    // transitions instead of transition targets.
    struct CounterConverter {
        uint32_t convert(uint32_t)
        {
            return 1;
        }

        void extend(uint32_t& destination, uint32_t)
        {
            ++destination;
        }
    };

    // Start offset of each node's incoming-transition bucket in m_flattenedTransitions.
    Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_flattenedTransitionsStartOffsetPerNode;
    // Number of incoming transitions of each node.
    Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_flattenedTransitionsSizePerNode;
    // All the transitions, bucketed by target node.
    Vector<Transition, 0, ContentExtensionsOverflowHandler> m_flattenedTransitions;

    Partition m_nodePartition;
    Partition m_transitionPartition;

    // Next transition set splitByUniqueTransitions() will use as a splitter.
    unsigned m_nextTransitionSetToProcess { 0 };
};
390 | |
// Key identifying a run of actions inside dfa->actions; used below to group final
// states that trigger exactly the same action sequence.
struct ActionKey {
    enum DeletedValueTag { DeletedValue };
    // Tag constructors only set `state`; hash/dfa/actionsStart/actionsLength stay
    // uninitialized. Safe only as long as empty/deleted keys are never hashed or
    // content-compared (see ActionKeyHash::safeToCompareToEmptyOrDeleted).
    explicit ActionKey(DeletedValueTag) { state = Deleted; }

    enum EmptyValueTag { EmptyValue };
    explicit ActionKey(EmptyValueTag) { state = Empty; }

    // Valid key over dfa->actions[actionsStart .. actionsStart + actionsLength).
    // The hash is precomputed here by reinterpreting the uint64_t actions as UChars;
    // uint64_t storage satisfies addCharactersAssumingAligned's alignment needs.
    explicit ActionKey(const DFA* dfa, uint32_t actionsStart, uint16_t actionsLength)
        : dfa(dfa)
        , actionsStart(actionsStart)
        , actionsLength(actionsLength)
        , state(Valid)
    {
        StringHasher hasher;
        hasher.addCharactersAssumingAligned(reinterpret_cast<const UChar*>(&dfa->actions[actionsStart]), actionsLength * sizeof(uint64_t) / sizeof(UChar));
        hash = hasher.hash();
    }

    bool isEmptyValue() const { return state == Empty; }
    bool isDeletedValue() const { return state == Deleted; }

    unsigned hash;

    const DFA* dfa;
    uint32_t actionsStart;
    uint16_t actionsLength;

    // NOTE(review): Valid is enumerator 0, so an all-zero ActionKey reads as Valid,
    // not Empty, even though ActionKeyHashTraits declares emptyValueIsZero — verify
    // this matches WTF::CustomHashTraits' expectations for zero-filled empty values.
    enum {
        Valid,
        Empty,
        Deleted
    } state;
};
424 | |
425 | struct ActionKeyHash { |
426 | static unsigned hash(const ActionKey& actionKey) |
427 | { |
428 | return actionKey.hash; |
429 | } |
430 | |
431 | static bool equal(const ActionKey& a, const ActionKey& b) |
432 | { |
433 | if (a.state != b.state |
434 | || a.dfa != b.dfa |
435 | || a.actionsLength != b.actionsLength) |
436 | return false; |
437 | for (uint16_t i = 0; i < a.actionsLength; ++i) { |
438 | if (a.dfa->actions[a.actionsStart + i] != a.dfa->actions[b.actionsStart + i]) |
439 | return false; |
440 | } |
441 | return true; |
442 | } |
443 | static const bool safeToCompareToEmptyOrDeleted = false; |
444 | }; |
445 | |
// Hash traits for ActionKey. CustomHashTraits delegates empty/deleted handling to
// ActionKey's tag constructors and isEmptyValue()/isDeletedValue().
struct ActionKeyHashTraits : public WTF::CustomHashTraits<ActionKey> {
    // NOTE(review): emptyValueIsZero asserts that a zero-filled ActionKey is the
    // empty value, but a zeroed ActionKey has state == Valid (enumerator 0) while
    // isEmptyValue() checks for Empty — confirm against WTF's HashTraits contract.
    static const bool emptyValueIsZero = true;
};
449 | |
450 | } // anonymous namespace. |
451 | |
452 | void DFAMinimizer::minimize(DFA& dfa) |
453 | { |
454 | FullGraphPartition fullGraphPartition(dfa); |
455 | |
456 | // Unlike traditional minimization final states can be differentiated by their action. |
457 | // Instead of creating a single set for the final state, we partition by actions from |
458 | // the start. |
459 | HashMap<ActionKey, Vector<unsigned>, ActionKeyHash, ActionKeyHashTraits> finalStates; |
460 | for (unsigned i = 0; i < dfa.nodes.size(); ++i) { |
461 | const DFANode& node = dfa.nodes[i]; |
462 | if (node.hasActions()) { |
463 | // FIXME: Sort the actions in the dfa to make nodes that have the same actions in different order equal. |
464 | auto addResult = finalStates.add(ActionKey(&dfa, node.actionsStart(), node.actionsLength()), Vector<unsigned>()); |
465 | addResult.iterator->value.append(i); |
466 | } |
467 | } |
468 | |
469 | for (const auto& slot : finalStates) { |
470 | for (unsigned finalStateIndex : slot.value) |
471 | fullGraphPartition.markNode(finalStateIndex); |
472 | fullGraphPartition.refinePartitions(); |
473 | } |
474 | |
475 | // Use every splitter to refine the node partitions. |
476 | fullGraphPartition.splitByUniqueTransitions(); |
477 | |
478 | Vector<unsigned> relocationVector; |
479 | relocationVector.reserveInitialCapacity(dfa.nodes.size()); |
480 | for (unsigned i = 0; i < dfa.nodes.size(); ++i) |
481 | relocationVector.uncheckedAppend(i); |
482 | |
483 | // Update all the transitions. |
484 | for (unsigned i = 0; i < dfa.nodes.size(); ++i) { |
485 | unsigned replacement = fullGraphPartition.nodeReplacement(i); |
486 | if (i != replacement) { |
487 | relocationVector[i] = replacement; |
488 | dfa.nodes[i].kill(dfa); |
489 | } |
490 | } |
491 | |
492 | dfa.root = relocationVector[dfa.root]; |
493 | for (DFANode& node : dfa.nodes) { |
494 | if (node.isKilled()) |
495 | continue; |
496 | |
497 | for (auto& transition : node.transitions(dfa)) { |
498 | uint32_t target = transition.target(); |
499 | uint32_t relocatedTarget = relocationVector[target]; |
500 | if (target != relocatedTarget) |
501 | transition.resetTarget(relocatedTarget); |
502 | } |
503 | } |
504 | } |
505 | |
506 | } // namespace ContentExtensions |
507 | } // namespace WebCore |
508 | |
509 | #endif // ENABLE(CONTENT_EXTENSIONS) |
510 | |