001/* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.hash; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019 020import com.google.common.annotations.Beta; 021import com.google.common.annotations.VisibleForTesting; 022import com.google.common.base.Objects; 023import com.google.common.base.Predicate; 024import com.google.common.hash.BloomFilterStrategies.LockFreeBitArray; 025import com.google.common.math.DoubleMath; 026import com.google.common.primitives.SignedBytes; 027import com.google.common.primitives.UnsignedBytes; 028import com.google.errorprone.annotations.CanIgnoreReturnValue; 029import java.io.DataInputStream; 030import java.io.DataOutputStream; 031import java.io.IOException; 032import java.io.InputStream; 033import java.io.OutputStream; 034import java.io.Serializable; 035import java.math.RoundingMode; 036import javax.annotation.CheckForNull; 037import org.checkerframework.checker.nullness.qual.Nullable; 038 039/** 040 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test 041 * with one-sided error: if it claims that an element is contained in it, this might be in error, 042 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true. 043 * 044 * <p>If you are unfamiliar with Bloom filters, this nice <a 045 * href="http://llimllib.github.io/bloomfilter-tutorial/">tutorial</a> may help you understand how 046 * they work. 047 * 048 * <p>The false positive probability ({@code FPP}) of a Bloom filter is defined as the probability 049 * that {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that 050 * has not actually been put in the {@code BloomFilter}. 051 * 052 * <p>Bloom filters are serializable. They also support a more compact serial representation via the 053 * {@link #writeTo} and {@link #readFrom} methods. Both serialized forms will continue to be 054 * supported by future versions of this library. However, serial forms generated by newer versions 055 * of the code may not be readable by older versions of the code (e.g., a serialized Bloom filter 056 * generated today may <i>not</i> be readable by a binary that was compiled 6 months ago). 057 * 058 * <p>As of Guava 23.0, this class is thread-safe and lock-free. It internally uses atomics and 059 * compare-and-swap to ensure correctness when multiple threads are used to access it. 060 * 061 * @param <T> the type of instances that the {@code BloomFilter} accepts 062 * @author Dimitris Andreou 063 * @author Kevin Bourrillion 064 * @since 11.0 (thread-safe since 23.0) 065 */ 066@Beta 067@ElementTypesAreNonnullByDefault 068public final class BloomFilter<T extends @Nullable Object> implements Predicate<T>, Serializable { 069 /** 070 * A strategy to translate T instances, to {@code numHashFunctions} bit indexes. 071 * 072 * <p>Implementations should be collections of pure functions (i.e. stateless). 073 */ 074 interface Strategy extends java.io.Serializable { 075 076 /** 077 * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element. 078 * 079 * <p>Returns whether any bits changed as a result of this operation. 080 */ 081 <T extends @Nullable Object> boolean put( 082 @ParametricNullness T object, 083 Funnel<? super T> funnel, 084 int numHashFunctions, 085 LockFreeBitArray bits); 086 087 /** 088 * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element; 089 * returns {@code true} if and only if all selected bits are set. 090 */ 091 <T extends @Nullable Object> boolean mightContain( 092 @ParametricNullness T object, 093 Funnel<? super T> funnel, 094 int numHashFunctions, 095 LockFreeBitArray bits); 096 097 /** 098 * Identifier used to encode this strategy, when marshalled as part of a BloomFilter. Only 099 * values in the [-128, 127] range are valid for the compact serial form. Non-negative values 100 * are reserved for enums defined in BloomFilterStrategies; negative values are reserved for any 101 * custom, stateful strategy we may define (e.g. any kind of strategy that would depend on user 102 * input). 103 */ 104 int ordinal(); 105 } 106 107 /** The bit set of the BloomFilter (not necessarily power of 2!) */ 108 private final LockFreeBitArray bits; 109 110 /** Number of hashes per element */ 111 private final int numHashFunctions; 112 113 /** The funnel to translate Ts to bytes */ 114 private final Funnel<? super T> funnel; 115 116 /** The strategy we employ to map an element T to {@code numHashFunctions} bit indexes. */ 117 private final Strategy strategy; 118 119 /** Creates a BloomFilter. */ 120 private BloomFilter( 121 LockFreeBitArray bits, int numHashFunctions, Funnel<? super T> funnel, Strategy strategy) { 122 checkArgument(numHashFunctions > 0, "numHashFunctions (%s) must be > 0", numHashFunctions); 123 checkArgument( 124 numHashFunctions <= 255, "numHashFunctions (%s) must be <= 255", numHashFunctions); 125 this.bits = checkNotNull(bits); 126 this.numHashFunctions = numHashFunctions; 127 this.funnel = checkNotNull(funnel); 128 this.strategy = checkNotNull(strategy); 129 } 130 131 /** 132 * Creates a new {@code BloomFilter} that's a copy of this instance. The new instance is equal to 133 * this instance but shares no mutable state. 134 * 135 * @since 12.0 136 */ 137 public BloomFilter<T> copy() { 138 return new BloomFilter<T>(bits.copy(), numHashFunctions, funnel, strategy); 139 } 140 141 /** 142 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter, {@code 143 * false} if this is <i>definitely</i> not the case. 144 */ 145 public boolean mightContain(@ParametricNullness T object) { 146 return strategy.mightContain(object, funnel, numHashFunctions, bits); 147 } 148 149 /** 150 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #mightContain} 151 * instead. 152 */ 153 @Deprecated 154 @Override 155 public boolean apply(@ParametricNullness T input) { 156 return mightContain(input); 157 } 158 159 /** 160 * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of {@link 161 * #mightContain(Object)} with the same element will always return {@code true}. 162 * 163 * @return true if the Bloom filter's bits changed as a result of this operation. If the bits 164 * changed, this is <i>definitely</i> the first time {@code object} has been added to the 165 * filter. If the bits haven't changed, this <i>might</i> be the first time {@code object} has 166 * been added to the filter. Note that {@code put(t)} always returns the <i>opposite</i> 167 * result to what {@code mightContain(t)} would have returned at the time it is called. 168 * @since 12.0 (present in 11.0 with {@code void} return type}) 169 */ 170 @CanIgnoreReturnValue 171 public boolean put(@ParametricNullness T object) { 172 return strategy.put(object, funnel, numHashFunctions, bits); 173 } 174 175 /** 176 * Returns the probability that {@linkplain #mightContain(Object)} will erroneously return {@code 177 * true} for an object that has not actually been put in the {@code BloomFilter}. 178 * 179 * <p>Ideally, this number should be close to the {@code fpp} parameter passed in {@linkplain 180 * #create(Funnel, int, double)}, or smaller. If it is significantly higher, it is usually the 181 * case that too many elements (more than expected) have been put in the {@code BloomFilter}, 182 * degenerating it. 183 * 184 * @since 14.0 (since 11.0 as expectedFalsePositiveProbability()) 185 */ 186 public double expectedFpp() { 187 return Math.pow((double) bits.bitCount() / bitSize(), numHashFunctions); 188 } 189 190 /** 191 * Returns an estimate for the total number of distinct elements that have been added to this 192 * Bloom filter. This approximation is reasonably accurate if it does not exceed the value of 193 * {@code expectedInsertions} that was used when constructing the filter. 194 * 195 * @since 22.0 196 */ 197 public long approximateElementCount() { 198 long bitSize = bits.bitSize(); 199 long bitCount = bits.bitCount(); 200 201 /** 202 * Each insertion is expected to reduce the # of clear bits by a factor of 203 * `numHashFunctions/bitSize`. So, after n insertions, expected bitCount is `bitSize * (1 - (1 - 204 * numHashFunctions/bitSize)^n)`. Solving that for n, and approximating `ln x` as `x - 1` when x 205 * is close to 1 (why?), gives the following formula. 206 */ 207 double fractionOfBitsSet = (double) bitCount / bitSize; 208 return DoubleMath.roundToLong( 209 -Math.log1p(-fractionOfBitsSet) * bitSize / numHashFunctions, RoundingMode.HALF_UP); 210 } 211 212 /** Returns the number of bits in the underlying bit array. */ 213 @VisibleForTesting 214 long bitSize() { 215 return bits.bitSize(); 216 } 217 218 /** 219 * Determines whether a given Bloom filter is compatible with this Bloom filter. For two Bloom 220 * filters to be compatible, they must: 221 * 222 * <ul> 223 * <li>not be the same instance 224 * <li>have the same number of hash functions 225 * <li>have the same bit size 226 * <li>have the same strategy 227 * <li>have equal funnels 228 * </ul> 229 * 230 * @param that The Bloom filter to check for compatibility. 231 * @since 15.0 232 */ 233 public boolean isCompatible(BloomFilter<T> that) { 234 checkNotNull(that); 235 return this != that 236 && this.numHashFunctions == that.numHashFunctions 237 && this.bitSize() == that.bitSize() 238 && this.strategy.equals(that.strategy) 239 && this.funnel.equals(that.funnel); 240 } 241 242 /** 243 * Combines this Bloom filter with another Bloom filter by performing a bitwise OR of the 244 * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the Bloom 245 * filters are appropriately sized to avoid saturating them. 246 * 247 * @param that The Bloom filter to combine this Bloom filter with. It is not mutated. 248 * @throws IllegalArgumentException if {@code isCompatible(that) == false} 249 * @since 15.0 250 */ 251 public void putAll(BloomFilter<T> that) { 252 checkNotNull(that); 253 checkArgument(this != that, "Cannot combine a BloomFilter with itself."); 254 checkArgument( 255 this.numHashFunctions == that.numHashFunctions, 256 "BloomFilters must have the same number of hash functions (%s != %s)", 257 this.numHashFunctions, 258 that.numHashFunctions); 259 checkArgument( 260 this.bitSize() == that.bitSize(), 261 "BloomFilters must have the same size underlying bit arrays (%s != %s)", 262 this.bitSize(), 263 that.bitSize()); 264 checkArgument( 265 this.strategy.equals(that.strategy), 266 "BloomFilters must have equal strategies (%s != %s)", 267 this.strategy, 268 that.strategy); 269 checkArgument( 270 this.funnel.equals(that.funnel), 271 "BloomFilters must have equal funnels (%s != %s)", 272 this.funnel, 273 that.funnel); 274 this.bits.putAll(that.bits); 275 } 276 277 @Override 278 public boolean equals(@CheckForNull Object object) { 279 if (object == this) { 280 return true; 281 } 282 if (object instanceof BloomFilter) { 283 BloomFilter<?> that = (BloomFilter<?>) object; 284 return this.numHashFunctions == that.numHashFunctions 285 && this.funnel.equals(that.funnel) 286 && this.bits.equals(that.bits) 287 && this.strategy.equals(that.strategy); 288 } 289 return false; 290 } 291 292 @Override 293 public int hashCode() { 294 return Objects.hashCode(numHashFunctions, funnel, strategy, bits); 295 } 296 297 /** 298 * Creates a {@link BloomFilter} with the expected number of insertions and expected false 299 * positive probability. 300 * 301 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified, 302 * will result in its saturation, and a sharp deterioration of its false positive probability. 303 * 304 * <p>The constructed {@code BloomFilter} will be serializable if the provided {@code Funnel<T>} 305 * is. 306 * 307 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of 308 * ensuring proper serialization and deserialization, which is important since {@link #equals} 309 * also relies on object identity of funnels. 310 * 311 * @param funnel the funnel of T's that the constructed {@code BloomFilter} will use 312 * @param expectedInsertions the number of expected insertions to the constructed {@code 313 * BloomFilter}; must be positive 314 * @param fpp the desired false positive probability (must be positive and less than 1.0) 315 * @return a {@code BloomFilter} 316 */ 317 public static <T extends @Nullable Object> BloomFilter<T> create( 318 Funnel<? super T> funnel, int expectedInsertions, double fpp) { 319 return create(funnel, (long) expectedInsertions, fpp); 320 } 321 322 /** 323 * Creates a {@link BloomFilter} with the expected number of insertions and expected false 324 * positive probability. 325 * 326 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified, 327 * will result in its saturation, and a sharp deterioration of its false positive probability. 328 * 329 * <p>The constructed {@code BloomFilter} will be serializable if the provided {@code Funnel<T>} 330 * is. 331 * 332 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of 333 * ensuring proper serialization and deserialization, which is important since {@link #equals} 334 * also relies on object identity of funnels. 335 * 336 * @param funnel the funnel of T's that the constructed {@code BloomFilter} will use 337 * @param expectedInsertions the number of expected insertions to the constructed {@code 338 * BloomFilter}; must be positive 339 * @param fpp the desired false positive probability (must be positive and less than 1.0) 340 * @return a {@code BloomFilter} 341 * @since 19.0 342 */ 343 public static <T extends @Nullable Object> BloomFilter<T> create( 344 Funnel<? super T> funnel, long expectedInsertions, double fpp) { 345 return create(funnel, expectedInsertions, fpp, BloomFilterStrategies.MURMUR128_MITZ_64); 346 } 347 348 @VisibleForTesting 349 static <T extends @Nullable Object> BloomFilter<T> create( 350 Funnel<? super T> funnel, long expectedInsertions, double fpp, Strategy strategy) { 351 checkNotNull(funnel); 352 checkArgument( 353 expectedInsertions >= 0, "Expected insertions (%s) must be >= 0", expectedInsertions); 354 checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp); 355 checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp); 356 checkNotNull(strategy); 357 358 if (expectedInsertions == 0) { 359 expectedInsertions = 1; 360 } 361 /* 362 * TODO(user): Put a warning in the javadoc about tiny fpp values, since the resulting size 363 * is proportional to -log(p), but there is not much of a point after all, e.g. 364 * optimalM(1000, 0.0000000000000001) = 76680 which is less than 10kb. Who cares! 365 */ 366 long numBits = optimalNumOfBits(expectedInsertions, fpp); 367 int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits); 368 try { 369 return new BloomFilter<T>(new LockFreeBitArray(numBits), numHashFunctions, funnel, strategy); 370 } catch (IllegalArgumentException e) { 371 throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e); 372 } 373 } 374 375 /** 376 * Creates a {@link BloomFilter} with the expected number of insertions and a default expected 377 * false positive probability of 3%. 378 * 379 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified, 380 * will result in its saturation, and a sharp deterioration of its false positive probability. 381 * 382 * <p>The constructed {@code BloomFilter} will be serializable if the provided {@code Funnel<T>} 383 * is. 384 * 385 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of 386 * ensuring proper serialization and deserialization, which is important since {@link #equals} 387 * also relies on object identity of funnels. 388 * 389 * @param funnel the funnel of T's that the constructed {@code BloomFilter} will use 390 * @param expectedInsertions the number of expected insertions to the constructed {@code 391 * BloomFilter}; must be positive 392 * @return a {@code BloomFilter} 393 */ 394 public static <T extends @Nullable Object> BloomFilter<T> create( 395 Funnel<? super T> funnel, int expectedInsertions) { 396 return create(funnel, (long) expectedInsertions); 397 } 398 399 /** 400 * Creates a {@link BloomFilter} with the expected number of insertions and a default expected 401 * false positive probability of 3%. 402 * 403 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified, 404 * will result in its saturation, and a sharp deterioration of its false positive probability. 405 * 406 * <p>The constructed {@code BloomFilter} will be serializable if the provided {@code Funnel<T>} 407 * is. 408 * 409 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of 410 * ensuring proper serialization and deserialization, which is important since {@link #equals} 411 * also relies on object identity of funnels. 412 * 413 * @param funnel the funnel of T's that the constructed {@code BloomFilter} will use 414 * @param expectedInsertions the number of expected insertions to the constructed {@code 415 * BloomFilter}; must be positive 416 * @return a {@code BloomFilter} 417 * @since 19.0 418 */ 419 public static <T extends @Nullable Object> BloomFilter<T> create( 420 Funnel<? super T> funnel, long expectedInsertions) { 421 return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions 422 } 423 424 // Cheat sheet: 425 // 426 // m: total bits 427 // n: expected insertions 428 // b: m/n, bits per insertion 429 // p: expected false positive probability 430 // 431 // 1) Optimal k = b * ln2 432 // 2) p = (1 - e ^ (-kn/m))^k 433 // 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b 434 // 4) For optimal k: m = -nlnp / ((ln2) ^ 2) 435 436 /** 437 * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the 438 * expected insertions and total number of bits in the Bloom filter. 439 * 440 * <p>See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula. 441 * 442 * @param n expected insertions (must be positive) 443 * @param m total number of bits in Bloom filter (must be positive) 444 */ 445 @VisibleForTesting 446 static int optimalNumOfHashFunctions(long n, long m) { 447 // (m / n) * log(2), but avoid truncation due to division! 448 return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); 449 } 450 451 /** 452 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified 453 * expected insertions, the required false positive probability. 454 * 455 * <p>See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the 456 * formula. 457 * 458 * @param n expected insertions (must be positive) 459 * @param p false positive rate (must be 0 < p < 1) 460 */ 461 @VisibleForTesting 462 static long optimalNumOfBits(long n, double p) { 463 if (p == 0) { 464 p = Double.MIN_VALUE; 465 } 466 return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); 467 } 468 469 private Object writeReplace() { 470 return new SerialForm<T>(this); 471 } 472 473 private static class SerialForm<T extends @Nullable Object> implements Serializable { 474 final long[] data; 475 final int numHashFunctions; 476 final Funnel<? super T> funnel; 477 final Strategy strategy; 478 479 SerialForm(BloomFilter<T> bf) { 480 this.data = LockFreeBitArray.toPlainArray(bf.bits.data); 481 this.numHashFunctions = bf.numHashFunctions; 482 this.funnel = bf.funnel; 483 this.strategy = bf.strategy; 484 } 485 486 Object readResolve() { 487 return new BloomFilter<T>(new LockFreeBitArray(data), numHashFunctions, funnel, strategy); 488 } 489 490 private static final long serialVersionUID = 1; 491 } 492 493 /** 494 * Writes this {@code BloomFilter} to an output stream, with a custom format (not Java 495 * serialization). This has been measured to save at least 400 bytes compared to regular 496 * serialization. 497 * 498 * <p>Use {@linkplain #readFrom(InputStream, Funnel)} to reconstruct the written BloomFilter. 499 */ 500 public void writeTo(OutputStream out) throws IOException { 501 // Serial form: 502 // 1 signed byte for the strategy 503 // 1 unsigned byte for the number of hash functions 504 // 1 big endian int, the number of longs in our bitset 505 // N big endian longs of our bitset 506 DataOutputStream dout = new DataOutputStream(out); 507 dout.writeByte(SignedBytes.checkedCast(strategy.ordinal())); 508 dout.writeByte(UnsignedBytes.checkedCast(numHashFunctions)); // note: checked at the c'tor 509 dout.writeInt(bits.data.length()); 510 for (int i = 0; i < bits.data.length(); i++) { 511 dout.writeLong(bits.data.get(i)); 512 } 513 } 514 515 /** 516 * Reads a byte stream, which was written by {@linkplain #writeTo(OutputStream)}, into a {@code 517 * BloomFilter}. 518 * 519 * <p>The {@code Funnel} to be used is not encoded in the stream, so it must be provided here. 520 * <b>Warning:</b> the funnel provided <b>must</b> behave identically to the one used to populate 521 * the original Bloom filter! 522 * 523 * @throws IOException if the InputStream throws an {@code IOException}, or if its data does not 524 * appear to be a BloomFilter serialized using the {@linkplain #writeTo(OutputStream)} method. 525 */ 526 public static <T extends @Nullable Object> BloomFilter<T> readFrom( 527 InputStream in, Funnel<? super T> funnel) throws IOException { 528 checkNotNull(in, "InputStream"); 529 checkNotNull(funnel, "Funnel"); 530 int strategyOrdinal = -1; 531 int numHashFunctions = -1; 532 int dataLength = -1; 533 try { 534 DataInputStream din = new DataInputStream(in); 535 // currently this assumes there is no negative ordinal; will have to be updated if we 536 // add non-stateless strategies (for which we've reserved negative ordinals; see 537 // Strategy.ordinal()). 538 strategyOrdinal = din.readByte(); 539 numHashFunctions = UnsignedBytes.toInt(din.readByte()); 540 dataLength = din.readInt(); 541 542 Strategy strategy = BloomFilterStrategies.values()[strategyOrdinal]; 543 long[] data = new long[dataLength]; 544 for (int i = 0; i < data.length; i++) { 545 data[i] = din.readLong(); 546 } 547 return new BloomFilter<T>(new LockFreeBitArray(data), numHashFunctions, funnel, strategy); 548 } catch (RuntimeException e) { 549 String message = 550 "Unable to deserialize BloomFilter from InputStream." 551 + " strategyOrdinal: " 552 + strategyOrdinal 553 + " numHashFunctions: " 554 + numHashFunctions 555 + " dataLength: " 556 + dataLength; 557 throw new IOException(message, e); 558 } 559 } 560}