1 | /* |
2 | * Copyright (C) 2011, 2015 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
15 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
17 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
18 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
19 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
20 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
21 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
22 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
23 | * THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #pragma once |
27 | |
28 | #include <array> |
29 | #include <wtf/text/AtomicString.h> |
30 | |
31 | namespace WTF { |
32 | |
33 | // Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory. |
34 | // False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique |
35 | // keys and m is the table size (==2^keyBits). |
36 | // See http://en.wikipedia.org/wiki/Bloom_filter |
37 | template <unsigned keyBits> |
38 | class BloomFilter { |
39 | WTF_MAKE_FAST_ALLOCATED; |
40 | public: |
41 | static const size_t tableSize = 1 << keyBits; |
42 | |
43 | BloomFilter(); |
44 | |
45 | void add(unsigned hash); |
46 | // For example SHA1::Digest. |
47 | template <size_t hashSize> void add(const std::array<uint8_t, hashSize>&); |
48 | |
49 | void add(const BloomFilter&); |
50 | |
51 | // The filter may give false positives (claim it may contain a key it doesn't) |
52 | // but never false negatives (claim it doesn't contain a key it does). |
53 | bool mayContain(unsigned hash) const; |
54 | template <size_t hashSize> bool mayContain(const std::array<uint8_t, hashSize>&) const; |
55 | |
56 | void clear(); |
57 | |
58 | void add(const AtomicString& string) { add(string.impl()->existingHash()); } |
59 | void add(const String& string) { add(string.impl()->hash()); } |
60 | bool mayContain(const AtomicString& string) const { return mayContain(string.impl()->existingHash()); } |
61 | bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); } |
62 | |
63 | private: |
64 | static const unsigned bitsPerPosition = 8 * sizeof(unsigned); |
65 | static const unsigned keyMask = (1 << keyBits) - 1; |
66 | static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; } |
67 | static unsigned bitMask(unsigned key) { return 1 << (key % bitsPerPosition); } |
68 | template <size_t hashSize> static std::pair<unsigned, unsigned> keysFromHash(const std::array<uint8_t, hashSize>&); |
69 | |
70 | bool isBitSet(unsigned key) const; |
71 | void setBit(unsigned key); |
72 | |
73 | std::array<unsigned, tableSize / bitsPerPosition> m_bitArray; |
74 | }; |
75 | |
76 | template <unsigned keyBits> |
77 | inline BloomFilter<keyBits>::BloomFilter() |
78 | : m_bitArray() |
79 | { |
80 | } |
81 | |
82 | template <unsigned keyBits> |
83 | inline bool BloomFilter<keyBits>::mayContain(unsigned hash) const |
84 | { |
85 | // The top and bottom bits of the incoming hash are treated as independent bloom filter hash functions. |
86 | // This works well as long as the filter size is not much above 2^16. |
87 | return isBitSet(hash) && isBitSet(hash >> 16); |
88 | } |
89 | |
90 | template <unsigned keyBits> |
91 | inline void BloomFilter<keyBits>::add(unsigned hash) |
92 | { |
93 | setBit(hash); |
94 | setBit(hash >> 16); |
95 | } |
96 | |
97 | template <unsigned keyBits> |
98 | template <size_t hashSize> |
99 | inline std::pair<unsigned, unsigned> BloomFilter<keyBits>::keysFromHash(const std::array<uint8_t, hashSize>& hash) |
100 | { |
101 | // We could use larger k value than 2 for long hashes. |
102 | static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short" ); |
103 | return { |
104 | *reinterpret_cast<const unsigned*>(hash.data()), |
105 | *reinterpret_cast<const unsigned*>(hash.data() + sizeof(unsigned)) |
106 | }; |
107 | } |
108 | |
109 | template <unsigned keyBits> |
110 | template <size_t hashSize> |
111 | inline bool BloomFilter<keyBits>::mayContain(const std::array<uint8_t, hashSize>& hash) const |
112 | { |
113 | auto keys = keysFromHash(hash); |
114 | return isBitSet(keys.first) && isBitSet(keys.second); |
115 | } |
116 | |
117 | template <unsigned keyBits> |
118 | template <size_t hashSize> |
119 | inline void BloomFilter<keyBits>::add(const std::array<uint8_t, hashSize>& hash) |
120 | { |
121 | auto keys = keysFromHash(hash); |
122 | setBit(keys.first); |
123 | setBit(keys.second); |
124 | } |
125 | |
126 | template <unsigned keyBits> |
127 | inline void BloomFilter<keyBits>::add(const BloomFilter& other) |
128 | { |
129 | for (size_t i = 0; i < m_bitArray.size(); ++i) |
130 | m_bitArray[i] |= other.m_bitArray[i]; |
131 | } |
132 | |
133 | template <unsigned keyBits> |
134 | bool BloomFilter<keyBits>::isBitSet(unsigned key) const |
135 | { |
136 | unsigned maskedKey = key & keyMask; |
137 | ASSERT(arrayIndex(maskedKey) < m_bitArray.size()); |
138 | return m_bitArray[arrayIndex(maskedKey)] & bitMask(maskedKey); |
139 | } |
140 | |
141 | template <unsigned keyBits> |
142 | void BloomFilter<keyBits>::setBit(unsigned key) |
143 | { |
144 | unsigned maskedKey = key & keyMask; |
145 | ASSERT(arrayIndex(maskedKey) < m_bitArray.size()); |
146 | m_bitArray[arrayIndex(maskedKey)] |= bitMask(maskedKey); |
147 | } |
148 | |
149 | template <unsigned keyBits> |
150 | inline void BloomFilter<keyBits>::clear() |
151 | { |
152 | m_bitArray.fill(0); |
153 | } |
154 | |
155 | // Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as above. |
156 | // See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters |
157 | template <unsigned keyBits> |
158 | class CountingBloomFilter { |
159 | WTF_MAKE_FAST_ALLOCATED; |
160 | public: |
161 | static const size_t tableSize = 1 << keyBits; |
162 | static unsigned maximumCount() { return std::numeric_limits<uint8_t>::max(); } |
163 | |
164 | CountingBloomFilter(); |
165 | |
166 | void add(unsigned hash); |
167 | void remove(unsigned hash); |
168 | |
169 | // The filter may give false positives (claim it may contain a key it doesn't) |
170 | // but never false negatives (claim it doesn't contain a key it does). |
171 | bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); } |
172 | |
173 | // The filter must be cleared before reuse even if all keys are removed. |
174 | // Otherwise overflowed keys will stick around. |
175 | void clear(); |
176 | |
177 | void add(const AtomicString& string) { add(string.impl()->existingHash()); } |
178 | void add(const String& string) { add(string.impl()->hash()); } |
179 | void remove(const AtomicString& string) { remove(string.impl()->existingHash()); } |
180 | void remove(const String& string) { remove(string.impl()->hash()); } |
181 | |
182 | bool mayContain(const AtomicString& string) const { return mayContain(string.impl()->existingHash()); } |
183 | bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); } |
184 | |
185 | #if !ASSERT_DISABLED |
186 | // Slow. |
187 | bool likelyEmpty() const; |
188 | bool isClear() const; |
189 | #endif |
190 | |
191 | private: |
192 | static const unsigned keyMask = (1 << keyBits) - 1; |
193 | |
194 | uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; } |
195 | uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; } |
196 | const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; } |
197 | const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; } |
198 | |
199 | std::array<uint8_t, tableSize> m_buckets; |
200 | }; |
201 | |
202 | template <unsigned keyBits> |
203 | inline CountingBloomFilter<keyBits>::CountingBloomFilter() |
204 | : m_buckets() |
205 | { |
206 | } |
207 | |
208 | template <unsigned keyBits> |
209 | inline void CountingBloomFilter<keyBits>::add(unsigned hash) |
210 | { |
211 | auto& first = firstBucket(hash); |
212 | auto& second = secondBucket(hash); |
213 | if (LIKELY(first < maximumCount())) |
214 | ++first; |
215 | if (LIKELY(second < maximumCount())) |
216 | ++second; |
217 | } |
218 | |
219 | template <unsigned keyBits> |
220 | inline void CountingBloomFilter<keyBits>::remove(unsigned hash) |
221 | { |
222 | auto& first = firstBucket(hash); |
223 | auto& second = secondBucket(hash); |
224 | ASSERT(first); |
225 | ASSERT(second); |
226 | // In case of an overflow, the bucket sticks in the table until clear(). |
227 | if (LIKELY(first < maximumCount())) |
228 | --first; |
229 | if (LIKELY(second < maximumCount())) |
230 | --second; |
231 | } |
232 | |
233 | template <unsigned keyBits> |
234 | inline void CountingBloomFilter<keyBits>::clear() |
235 | { |
236 | m_buckets.fill(0); |
237 | } |
238 | |
239 | #if !ASSERT_DISABLED |
240 | template <unsigned keyBits> |
241 | bool CountingBloomFilter<keyBits>::likelyEmpty() const |
242 | { |
243 | for (auto& bucket : m_buckets) { |
244 | if (bucket && bucket != maximumCount()) |
245 | return false; |
246 | } |
247 | return true; |
248 | } |
249 | |
250 | template <unsigned keyBits> |
251 | bool CountingBloomFilter<keyBits>::isClear() const |
252 | { |
253 | for (auto& bucket : m_buckets) { |
254 | if (bucket) |
255 | return false; |
256 | } |
257 | return true; |
258 | } |
259 | #endif |
260 | |
261 | } |
262 | |
263 | using WTF::BloomFilter; |
264 | using WTF::CountingBloomFilter; |
265 | |