1 | /* |
2 | * Copyright (C) 2015 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
15 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
17 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
18 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
19 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
20 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
21 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
22 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
23 | * THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "DFAMinimizer.h" |
28 | |
29 | #if ENABLE(CONTENT_EXTENSIONS) |
30 | |
31 | #include "DFA.h" |
32 | #include "DFANode.h" |
33 | #include "MutableRangeList.h" |
34 | #include <wtf/HashMap.h> |
35 | #include <wtf/Hasher.h> |
36 | #include <wtf/Vector.h> |
37 | |
38 | namespace WebCore { |
39 | namespace ContentExtensions { |
40 | |
41 | namespace { |
42 | |
// Invoke intersectionHandler(singularRangeIndex, transitionIterator) once for every
// singular range that overlaps a range of iterableTransitionList.
//
// singularTransitionsFirsts holds the sorted first-characters of a list of disjoint
// "singular" ranges. The ranges in iterableTransitionList must be aligned on those
// boundaries: each of their first characters appears verbatim in
// singularTransitionsFirsts. (This holds by construction in FullGraphPartition,
// since the singular range list is extended with these very transitions —
// TODO confirm for any new caller.)
template<typename VectorType, typename Iterable, typename Function>
static inline void iterateIntersections(const VectorType& singularTransitionsFirsts, const Iterable& iterableTransitionList, const Function& intersectionHandler)
{
    ASSERT(!singularTransitionsFirsts.isEmpty());
    auto otherIterator = iterableTransitionList.begin();
    auto otherEnd = iterableTransitionList.end();

    if (otherIterator == otherEnd)
        return;

    unsigned singularTransitionsLength = singularTransitionsFirsts.size();
    unsigned singularTransitionsFirstsIndex = 0;
    for (; otherIterator != otherEnd; ++otherIterator) {
        auto firstCharacter = otherIterator.first();
        // Skip the singular ranges that lie entirely before this transition.
        // The index never resets: both lists are sorted, so this is a single
        // forward pass over the singular ranges.
        while (singularTransitionsFirstsIndex < singularTransitionsLength
            && singularTransitionsFirsts[singularTransitionsFirstsIndex] != firstCharacter)
            ++singularTransitionsFirstsIndex;

        // The singular range starting exactly at firstCharacter always intersects.
        intersectionHandler(singularTransitionsFirstsIndex, otherIterator);
        ++singularTransitionsFirstsIndex;

        // Every subsequent singular range that still starts at or before this
        // transition's last character is also covered by it.
        auto lastCharacter = otherIterator.last();
        while (singularTransitionsFirstsIndex < singularTransitionsLength
            && singularTransitionsFirsts[singularTransitionsFirstsIndex] <= lastCharacter) {
            intersectionHandler(singularTransitionsFirstsIndex, otherIterator);
            ++singularTransitionsFirstsIndex;
        }
    }
}
72 | |
73 | class Partition { |
74 | public: |
75 | void initialize(unsigned size) |
76 | { |
77 | if (!size) |
78 | return; |
79 | |
80 | m_sets.reserveInitialCapacity(size); |
81 | m_partitionedElements.resize(size); |
82 | m_elementPositionInPartitionedNodes.resize(size); |
83 | m_elementToSetMap.resize(size); |
84 | |
85 | for (unsigned i = 0; i < size; ++i) { |
86 | m_partitionedElements[i] = i; |
87 | m_elementPositionInPartitionedNodes[i] = i; |
88 | m_elementToSetMap[i] = 0; |
89 | } |
90 | m_sets.uncheckedAppend(SetDescriptor { 0, size, 0 }); |
91 | } |
92 | |
93 | void reserveUninitializedCapacity(unsigned elementCount) |
94 | { |
95 | m_partitionedElements.resize(elementCount); |
96 | m_elementPositionInPartitionedNodes.resize(elementCount); |
97 | m_elementToSetMap.resize(elementCount); |
98 | } |
99 | |
100 | void addSetUnchecked(unsigned start, unsigned size) |
101 | { |
102 | m_sets.append(SetDescriptor { start, size, 0 }); |
103 | } |
104 | |
105 | void setElementUnchecked(unsigned elementIndex, unsigned positionInPartition, unsigned setIndex) |
106 | { |
107 | ASSERT(setIndex < m_sets.size()); |
108 | m_partitionedElements[positionInPartition] = elementIndex; |
109 | m_elementPositionInPartitionedNodes[elementIndex] = positionInPartition; |
110 | m_elementToSetMap[elementIndex] = setIndex; |
111 | } |
112 | |
113 | unsigned startOffsetOfSet(unsigned setIndex) const |
114 | { |
115 | return m_sets[setIndex].start; |
116 | } |
117 | |
118 | ALWAYS_INLINE void markElementInCurrentGeneration(unsigned elementIndex) |
119 | { |
120 | // Swap the node with the first unmarked node. |
121 | unsigned setIndex = m_elementToSetMap[elementIndex]; |
122 | SetDescriptor& setDescriptor = m_sets[setIndex]; |
123 | |
124 | unsigned elementPositionInPartition = m_elementPositionInPartitionedNodes[elementIndex]; |
125 | ASSERT(elementPositionInPartition >= setDescriptor.start); |
126 | ASSERT(elementPositionInPartition < setDescriptor.end()); |
127 | |
128 | unsigned firstUnmarkedElementPositionInPartition = setDescriptor.indexAfterMarkedElements(); |
129 | ASSERT(firstUnmarkedElementPositionInPartition >= setDescriptor.start && firstUnmarkedElementPositionInPartition < setDescriptor.end()); |
130 | ASSERT(firstUnmarkedElementPositionInPartition >= firstUnmarkedElementPositionInPartition); |
131 | |
132 | // Swap the nodes in the set. |
133 | unsigned firstUnmarkedElement = m_partitionedElements[firstUnmarkedElementPositionInPartition]; |
134 | m_partitionedElements[firstUnmarkedElementPositionInPartition] = elementIndex; |
135 | m_partitionedElements[elementPositionInPartition] = firstUnmarkedElement; |
136 | |
137 | // Update their index. |
138 | m_elementPositionInPartitionedNodes[elementIndex] = firstUnmarkedElementPositionInPartition; |
139 | m_elementPositionInPartitionedNodes[firstUnmarkedElement] = elementPositionInPartition; |
140 | |
141 | if (!setDescriptor.markedCount) { |
142 | ASSERT(!m_setsMarkedInCurrentGeneration.contains(setIndex)); |
143 | m_setsMarkedInCurrentGeneration.append(setIndex); |
144 | } |
145 | ++setDescriptor.markedCount; |
146 | } |
147 | |
148 | // The function passed as argument MUST not modify the partition. |
149 | template<typename Function> |
150 | void refineGeneration(const Function& function) |
151 | { |
152 | for (unsigned setIndex : m_setsMarkedInCurrentGeneration) { |
153 | SetDescriptor& setDescriptor = m_sets[setIndex]; |
154 | if (setDescriptor.markedCount == setDescriptor.size) { |
155 | // Everything is marked, there is nothing to refine. |
156 | setDescriptor.markedCount = 0; |
157 | continue; |
158 | } |
159 | |
160 | SetDescriptor newSet; |
161 | bool newSetIsMarkedSet = setDescriptor.markedCount * 2 <= setDescriptor.size; |
162 | if (newSetIsMarkedSet) { |
163 | // Less than half of the nodes have been marked. |
164 | newSet = { setDescriptor.start, setDescriptor.markedCount, 0 }; |
165 | setDescriptor.start = setDescriptor.start + setDescriptor.markedCount; |
166 | } else |
167 | newSet = { setDescriptor.start + setDescriptor.markedCount, setDescriptor.size - setDescriptor.markedCount, 0 }; |
168 | setDescriptor.size -= newSet.size; |
169 | setDescriptor.markedCount = 0; |
170 | |
171 | unsigned newSetIndex = m_sets.size(); |
172 | m_sets.append(newSet); |
173 | |
174 | for (unsigned i = newSet.start; i < newSet.end(); ++i) |
175 | m_elementToSetMap[m_partitionedElements[i]] = newSetIndex; |
176 | |
177 | function(newSetIndex); |
178 | } |
179 | m_setsMarkedInCurrentGeneration.clear(); |
180 | } |
181 | |
182 | // Call Function() on every node of a given subset. |
183 | template<typename Function> |
184 | void iterateSet(unsigned setIndex, const Function& function) |
185 | { |
186 | SetDescriptor& setDescriptor = m_sets[setIndex]; |
187 | for (unsigned i = setDescriptor.start; i < setDescriptor.end(); ++i) |
188 | function(m_partitionedElements[i]); |
189 | } |
190 | |
191 | // Index of the set containing the Node. |
192 | unsigned setIndex(unsigned elementIndex) const |
193 | { |
194 | return m_elementToSetMap[elementIndex]; |
195 | } |
196 | |
197 | // NodeIndex of the first element in the set. |
198 | unsigned firstElementInSet(unsigned setIndex) const |
199 | { |
200 | return m_partitionedElements[m_sets[setIndex].start]; |
201 | } |
202 | |
203 | unsigned size() const |
204 | { |
205 | return m_sets.size(); |
206 | } |
207 | |
208 | private: |
209 | struct SetDescriptor { |
210 | unsigned start; |
211 | unsigned size; |
212 | unsigned markedCount; |
213 | |
214 | unsigned indexAfterMarkedElements() const { return start + markedCount; } |
215 | unsigned end() const { return start + size; } |
216 | }; |
217 | |
218 | // List of sets. |
219 | Vector<SetDescriptor, 0, ContentExtensionsOverflowHandler> m_sets; |
220 | |
221 | // All the element indices such that two elements of the same set never have a element of a different set between them. |
222 | Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_partitionedElements; |
223 | |
224 | // Map elementIndex->position in the partitionedElements. |
225 | Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_elementPositionInPartitionedNodes; |
226 | |
227 | // Map elementIndex->SetIndex. |
228 | Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_elementToSetMap; |
229 | |
230 | // List of sets with any marked node. Each set can appear at most once. |
231 | // FIXME: find a good inline size for this. |
232 | Vector<unsigned, 128, ContentExtensionsOverflowHandler> m_setsMarkedInCurrentGeneration; |
233 | }; |
234 | |
// Pairs a partition of the DFA nodes with a partition of all the DFA transitions.
// The transitions are flattened into one array, bucketed by target node
// (m_flattenedTransitions plus per-node start/size tables), and simultaneously
// partitioned by "symbol" (singular character range) in m_transitionPartition.
class FullGraphPartition {
    typedef MutableRangeList<char, uint32_t, 128> SingularTransitionsMutableRangeList;
public:
    FullGraphPartition(const DFA& dfa)
    {
        m_nodePartition.initialize(dfa.nodes.size());

        // Build the singular range list: disjoint character ranges such that every
        // node transition is a union of them. With CounterConverter, each range's
        // data counts how many node transitions overlap that range.
        SingularTransitionsMutableRangeList singularTransitions;
        CounterConverter counterConverter;
        for (const DFANode& node : dfa.nodes) {
            if (node.isKilled())
                continue;
            auto transitions = node.transitions(dfa);
            singularTransitions.extend(transitions.begin(), transitions.end(), counterConverter);
        }

        // Count the number of transition for each singular range. This will give us the bucket size
        // for the transition partition, where transitions are partitioned by "symbol".
        unsigned rangeIndexAccumulator = 0;
        for (const auto& transition : singularTransitions) {
            m_transitionPartition.addSetUnchecked(rangeIndexAccumulator, transition.data);
            rangeIndexAccumulator += transition.data;
        }

        // Count the number of incoming transitions per node.
        m_flattenedTransitionsStartOffsetPerNode.resize(dfa.nodes.size());
        memset(m_flattenedTransitionsStartOffsetPerNode.data(), 0, m_flattenedTransitionsStartOffsetPerNode.size() * sizeof(unsigned));

        // Cache the first character of every singular range so iterateIntersections()
        // can align node transitions against them.
        Vector<char, 0, ContentExtensionsOverflowHandler> singularTransitionsFirsts;
        singularTransitionsFirsts.reserveInitialCapacity(singularTransitions.m_ranges.size());
        for (const auto& transition : singularTransitions)
            singularTransitionsFirsts.uncheckedAppend(transition.first);

        // First pass: count incoming transitions per target node (temporarily
        // stored in the start-offset table).
        for (const DFANode& node : dfa.nodes) {
            if (node.isKilled())
                continue;
            auto transitions = node.transitions(dfa);
            iterateIntersections(singularTransitionsFirsts, transitions, [&](unsigned, const DFANode::ConstRangeIterator& origin) {
                uint32_t targetNodeIndex = origin.target();
                ++m_flattenedTransitionsStartOffsetPerNode[targetNodeIndex];
            });
        }

        // Accumulate the offsets. This gives us the start position of each bucket.
        unsigned transitionAccumulator = 0;
        for (unsigned i = 0; i < m_flattenedTransitionsStartOffsetPerNode.size(); ++i) {
            unsigned transitionsCountForNode = m_flattenedTransitionsStartOffsetPerNode[i];
            m_flattenedTransitionsStartOffsetPerNode[i] = transitionAccumulator;
            transitionAccumulator += transitionsCountForNode;
        }
        unsigned flattenedTransitionsSize = transitionAccumulator;
        ASSERT_WITH_MESSAGE(flattenedTransitionsSize == rangeIndexAccumulator, "The number of transitions should be the same, regardless of how they are arranged in buckets." );

        m_transitionPartition.reserveUninitializedCapacity(flattenedTransitionsSize);

        // Next, let's fill the transition table and set up the size of each group at the same time.
        m_flattenedTransitionsSizePerNode.resize(dfa.nodes.size());
        for (unsigned& counter : m_flattenedTransitionsSizePerNode)
            counter = 0;
        m_flattenedTransitions.resize(flattenedTransitionsSize);

        // Per singular range, the next free slot inside its bucket of the
        // transition partition.
        Vector<uint32_t> transitionPerRangeOffset(m_transitionPartition.size());
        memset(transitionPerRangeOffset.data(), 0, transitionPerRangeOffset.size() * sizeof(uint32_t));

        for (unsigned i = 0; i < dfa.nodes.size(); ++i) {
            const DFANode& node = dfa.nodes[i];
            if (node.isKilled())
                continue;

            auto transitions = node.transitions(dfa);
            iterateIntersections(singularTransitionsFirsts, transitions, [&](unsigned singularTransitonIndex, const DFANode::ConstRangeIterator& origin) {
                uint32_t targetNodeIndex = origin.target();

                // Position of this transition in its target node's bucket.
                unsigned start = m_flattenedTransitionsStartOffsetPerNode[targetNodeIndex];
                unsigned offset = m_flattenedTransitionsSizePerNode[targetNodeIndex];
                unsigned positionInFlattenedTransitions = start + offset;
                m_flattenedTransitions[positionInFlattenedTransitions] = Transition({ i });

                // Position of the same transition inside its symbol set of the
                // transition partition.
                uint32_t& inRangeOffset = transitionPerRangeOffset[singularTransitonIndex];
                unsigned positionInTransitionPartition = m_transitionPartition.startOffsetOfSet(singularTransitonIndex) + inRangeOffset;
                ++inRangeOffset;

                m_transitionPartition.setElementUnchecked(positionInFlattenedTransitions, positionInTransitionPartition, singularTransitonIndex);

                ++m_flattenedTransitionsSizePerNode[targetNodeIndex];
            });
        }
    }

    // Mark a node as belonging to the current splitter.
    void markNode(unsigned nodeIndex)
    {
        m_nodePartition.markElementInCurrentGeneration(nodeIndex);
    }

    // Split every partially-marked node set; for each newly created node set, mark
    // and split the transitions leading into its nodes.
    void refinePartitions()
    {
        m_nodePartition.refineGeneration([&](unsigned smallestSetIndex) {
            m_nodePartition.iterateSet(smallestSetIndex, [&](unsigned nodeIndex) {
                unsigned incomingTransitionsStartForNode = m_flattenedTransitionsStartOffsetPerNode[nodeIndex];
                unsigned incomingTransitionsSizeForNode = m_flattenedTransitionsSizePerNode[nodeIndex];

                for (unsigned i = 0; i < incomingTransitionsSizeForNode; ++i)
                    m_transitionPartition.markElementInCurrentGeneration(incomingTransitionsStartForNode + i);
            });

            // We only need to split the transitions, we handle the new sets through the main loop.
            m_transitionPartition.refineGeneration([](unsigned) { });
        });
    }

    // Use every transition set as a splitter, including the sets created along the
    // way, until no unprocessed transition set remains.
    void splitByUniqueTransitions()
    {
        for (; m_nextTransitionSetToProcess < m_transitionPartition.size(); ++m_nextTransitionSetToProcess) {
            // We use the known splitters to refine the set.
            m_transitionPartition.iterateSet(m_nextTransitionSetToProcess, [&](unsigned transitionIndex) {
                unsigned sourceNodeIndex = m_flattenedTransitions[transitionIndex].source;
                m_nodePartition.markElementInCurrentGeneration(sourceNodeIndex);
            });

            refinePartitions();
        }
    }

    // Canonical representative of nodeIndex: the first node of its final set.
    unsigned nodeReplacement(unsigned nodeIndex)
    {
        unsigned setIndex = m_nodePartition.setIndex(nodeIndex);
        return m_nodePartition.firstElementInSet(setIndex);
    }

private:
    // A flattened transition only stores its source node: the target is implied by
    // the bucket it lives in, the symbol by its set in m_transitionPartition.
    struct Transition {
        unsigned source;
    };

    // Makes MutableRangeList store, per singular range, a count of overlapping
    // transitions instead of transition targets.
    struct CounterConverter {
        uint32_t convert(uint32_t)
        {
            return 1;
        }

        void extend(uint32_t& destination, uint32_t)
        {
            ++destination;
        }
    };

    // Start offset of each node's incoming-transition bucket in m_flattenedTransitions.
    Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_flattenedTransitionsStartOffsetPerNode;
    // Number of incoming transitions of each node.
    Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_flattenedTransitionsSizePerNode;
    // All the transitions, bucketed by target node.
    Vector<Transition, 0, ContentExtensionsOverflowHandler> m_flattenedTransitions;

    Partition m_nodePartition;
    Partition m_transitionPartition;

    // Next transition set splitByUniqueTransitions() will use as a splitter.
    unsigned m_nextTransitionSetToProcess { 0 };
};
390 | |
// Key identifying a run of actions inside dfa->actions; used below to group final
// states that trigger exactly the same action sequence.
struct ActionKey {
    enum DeletedValueTag { DeletedValue };
    // Tag constructors only set `state`; hash/dfa/actionsStart/actionsLength stay
    // uninitialized. Safe only as long as empty/deleted keys are never hashed or
    // content-compared (see ActionKeyHash::safeToCompareToEmptyOrDeleted).
    explicit ActionKey(DeletedValueTag) { state = Deleted; }

    enum EmptyValueTag { EmptyValue };
    explicit ActionKey(EmptyValueTag) { state = Empty; }

    // Valid key over dfa->actions[actionsStart .. actionsStart + actionsLength).
    // The hash is precomputed here by reinterpreting the uint64_t actions as UChars;
    // uint64_t storage satisfies addCharactersAssumingAligned's alignment needs.
    explicit ActionKey(const DFA* dfa, uint32_t actionsStart, uint16_t actionsLength)
        : dfa(dfa)
        , actionsStart(actionsStart)
        , actionsLength(actionsLength)
        , state(Valid)
    {
        StringHasher hasher;
        hasher.addCharactersAssumingAligned(reinterpret_cast<const UChar*>(&dfa->actions[actionsStart]), actionsLength * sizeof(uint64_t) / sizeof(UChar));
        hash = hasher.hash();
    }

    bool isEmptyValue() const { return state == Empty; }
    bool isDeletedValue() const { return state == Deleted; }

    unsigned hash;

    const DFA* dfa;
    uint32_t actionsStart;
    uint16_t actionsLength;

    // NOTE(review): Valid is enumerator 0, so an all-zero ActionKey reads as Valid,
    // not Empty, even though ActionKeyHashTraits declares emptyValueIsZero — verify
    // this matches WTF::CustomHashTraits' expectations for zero-filled empty values.
    enum {
        Valid,
        Empty,
        Deleted
    } state;
};
424 | |
425 | struct ActionKeyHash { |
426 | static unsigned hash(const ActionKey& actionKey) |
427 | { |
428 | return actionKey.hash; |
429 | } |
430 | |
431 | static bool equal(const ActionKey& a, const ActionKey& b) |
432 | { |
433 | if (a.state != b.state |
434 | || a.dfa != b.dfa |
435 | || a.actionsLength != b.actionsLength) |
436 | return false; |
437 | for (uint16_t i = 0; i < a.actionsLength; ++i) { |
438 | if (a.dfa->actions[a.actionsStart + i] != a.dfa->actions[b.actionsStart + i]) |
439 | return false; |
440 | } |
441 | return true; |
442 | } |
443 | static const bool safeToCompareToEmptyOrDeleted = false; |
444 | }; |
445 | |
// Hash traits for ActionKey. CustomHashTraits delegates empty/deleted handling to
// ActionKey's tag constructors and isEmptyValue()/isDeletedValue().
struct ActionKeyHashTraits : public WTF::CustomHashTraits<ActionKey> {
    // NOTE(review): emptyValueIsZero asserts that a zero-filled ActionKey is the
    // empty value, but a zeroed ActionKey has state == Valid (enumerator 0) while
    // isEmptyValue() checks for Empty — confirm against WTF's HashTraits contract.
    static const bool emptyValueIsZero = true;
};
449 | |
450 | } // anonymous namespace. |
451 | |
452 | void DFAMinimizer::minimize(DFA& dfa) |
453 | { |
454 | FullGraphPartition fullGraphPartition(dfa); |
455 | |
456 | // Unlike traditional minimization final states can be differentiated by their action. |
457 | // Instead of creating a single set for the final state, we partition by actions from |
458 | // the start. |
459 | HashMap<ActionKey, Vector<unsigned>, ActionKeyHash, ActionKeyHashTraits> finalStates; |
460 | for (unsigned i = 0; i < dfa.nodes.size(); ++i) { |
461 | const DFANode& node = dfa.nodes[i]; |
462 | if (node.hasActions()) { |
463 | // FIXME: Sort the actions in the dfa to make nodes that have the same actions in different order equal. |
464 | auto addResult = finalStates.add(ActionKey(&dfa, node.actionsStart(), node.actionsLength()), Vector<unsigned>()); |
465 | addResult.iterator->value.append(i); |
466 | } |
467 | } |
468 | |
469 | for (const auto& slot : finalStates) { |
470 | for (unsigned finalStateIndex : slot.value) |
471 | fullGraphPartition.markNode(finalStateIndex); |
472 | fullGraphPartition.refinePartitions(); |
473 | } |
474 | |
475 | // Use every splitter to refine the node partitions. |
476 | fullGraphPartition.splitByUniqueTransitions(); |
477 | |
478 | Vector<unsigned> relocationVector; |
479 | relocationVector.reserveInitialCapacity(dfa.nodes.size()); |
480 | for (unsigned i = 0; i < dfa.nodes.size(); ++i) |
481 | relocationVector.uncheckedAppend(i); |
482 | |
483 | // Update all the transitions. |
484 | for (unsigned i = 0; i < dfa.nodes.size(); ++i) { |
485 | unsigned replacement = fullGraphPartition.nodeReplacement(i); |
486 | if (i != replacement) { |
487 | relocationVector[i] = replacement; |
488 | dfa.nodes[i].kill(dfa); |
489 | } |
490 | } |
491 | |
492 | dfa.root = relocationVector[dfa.root]; |
493 | for (DFANode& node : dfa.nodes) { |
494 | if (node.isKilled()) |
495 | continue; |
496 | |
497 | for (auto& transition : node.transitions(dfa)) { |
498 | uint32_t target = transition.target(); |
499 | uint32_t relocatedTarget = relocationVector[target]; |
500 | if (target != relocatedTarget) |
501 | transition.resetTarget(relocatedTarget); |
502 | } |
503 | } |
504 | } |
505 | |
506 | } // namespace ContentExtensions |
507 | } // namespace WebCore |
508 | |
509 | #endif // ENABLE(CONTENT_EXTENSIONS) |
510 | |