Bloom Filters: Duplicate Detection Without the Memory Cost
Bloom Filters: Duplicate Detection Without the Memory Cost
The main chapter introduced the Bloom filter concept and the math behind optimal sizing. This section builds a production-quality Bloom filter step by step, validates its false positive rate empirically, benchmarks it against HashSet, and shows the migration to Guava.
Building the Bloom Filter: Bit Array and Hashing
The core data structure is a bit array backed by a long[]. Each long holds 64 bits. Addressing bit i means accessing bits[i >>> 6] (which long) and 1L << (i & 63) (which bit within the long):
public class ProductionBloomFilter {
private final long[] bits;
private final int numBits;
private final int numHashFunctions;
private int insertionCount;
public ProductionBloomFilter(int expectedInsertions, double falsePositiveRate) {
if (expectedInsertions <= 0) {
throw new IllegalArgumentException("expectedInsertions must be positive");
}
if (falsePositiveRate <= 0 || falsePositiveRate >= 1) {
throw new IllegalArgumentException("falsePositiveRate must be in (0, 1)");
}
this.numBits = optimalNumBits(expectedInsertions, falsePositiveRate);
this.numHashFunctions = optimalNumHashes(numBits, expectedInsertions);
this.bits = new long[(numBits + 63) >>> 6];
this.insertionCount = 0;
}
public boolean put(byte[] element) {
long hash64 = hash(element, 0);
int h1 = (int) hash64;
int h2 = (int) (hash64 >>> 32);
boolean bitsChanged = false;
for (int i = 0; i < numHashFunctions; i++) {
int combinedHash = h1 + i * h2;
int bitIndex = (combinedHash & Integer.MAX_VALUE) % numBits;
int longIndex = bitIndex >>> 6;
long mask = 1L << (bitIndex & 63);
if ((bits[longIndex] & mask) == 0) {
bits[longIndex] |= mask;
bitsChanged = true;
}
}
if (bitsChanged) insertionCount++;
return bitsChanged;
}
public boolean mightContain(byte[] element) {
long hash64 = hash(element, 0);
int h1 = (int) hash64;
int h2 = (int) (hash64 >>> 32);
for (int i = 0; i < numHashFunctions; i++) {
int combinedHash = h1 + i * h2;
int bitIndex = (combinedHash & Integer.MAX_VALUE) % numBits;
if ((bits[bitIndex >>> 6] & (1L << (bitIndex & 63))) == 0) {
return false;
}
}
return true;
}
public double expectedFalsePositiveRate() {
double fractionSet = (double) countSetBits() / numBits;
return Math.pow(fractionSet, numHashFunctions);
}
public long memoryBytes() {
return (long) bits.length * 8;
}
private int countSetBits() {
int count = 0;
for (long word : bits) count += Long.bitCount(word);
return count;
}
private static long hash(byte[] data, long seed) {
// FNV-1a 64-bit hash
long h = 0xcbf29ce484222325L ^ seed;
for (byte b : data) {
h ^= b & 0xFFL;
h *= 0x100000001b3L;
}
return h;
}
private static int optimalNumBits(int n, double p) {
return (int) Math.ceil(-n * Math.log(p) / (Math.log(2) * Math.log(2)));
}
private static int optimalNumHashes(int m, int n) {
return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
}
}
Empirical False Positive Rate Validation
The theoretical false positive rate is a mathematical guarantee, but implementation bugs can violate it. Always validate empirically:
public class BloomFilterValidation {
public static void main(String[] args) {
int insertions = 1_000_000;
double targetFPR = 0.01; // 1%
ProductionBloomFilter filter = new ProductionBloomFilter(insertions, targetFPR);
// Insert 1M elements
for (int i = 0; i < insertions; i++) {
filter.put(("article-" + i).getBytes(StandardCharsets.UTF_8));
}
// Test 1M elements that were NOT inserted
int falsePositives = 0;
int tests = 1_000_000;
for (int i = insertions; i < insertions + tests; i++) {
if (filter.mightContain(("article-" + i).getBytes(StandardCharsets.UTF_8))) {
falsePositives++;
}
}
double observedFPR = (double) falsePositives / tests;
System.out.printf("Target FPR: %.4f%n", targetFPR);
System.out.printf("Observed FPR: %.4f%n", observedFPR);
System.out.printf("Memory: %.2f MB%n", filter.memoryBytes() / (1024.0 * 1024.0));
System.out.printf("Hash functions: calculated from optimal formula%n");
// Assert observed is within 2x of target
// (statistical variance allows some deviation)
assert observedFPR < targetFPR * 2.0 :
"FPR too high: " + observedFPR + " vs target " + targetFPR;
}
}
Expected output:
Target FPR: 0.0100
Observed FPR: 0.0098
Memory: 1.14 MB
The observed FPR is within 2% of the target. If it deviates significantly, the hash function has poor distribution or the bit count calculation is wrong.
JMH Benchmark: Bloom Filter vs HashSet
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(2)
@State(Scope.Benchmark)
public class BloomVsHashSetBenchmark {
@Param({"1000000", "10000000"})
int size;
ProductionBloomFilter bloomFilter;
HashSet<String> hashSet;
byte[] lookupKeyPresent;
byte[] lookupKeyAbsent;
String lookupStringPresent;
String lookupStringAbsent;
@Setup(Level.Trial)
public void setup() {
bloomFilter = new ProductionBloomFilter(size, 0.01);
hashSet = new HashSet<>(size * 2);
for (int i = 0; i < size; i++) {
String key = "article-content-hash-" + i;
bloomFilter.put(key.getBytes(StandardCharsets.UTF_8));
hashSet.add(key);
}
lookupStringPresent = "article-content-hash-" + (size / 2);
lookupKeyPresent = lookupStringPresent.getBytes(StandardCharsets.UTF_8);
lookupStringAbsent = "article-content-hash-" + (size * 2);
lookupKeyAbsent = lookupStringAbsent.getBytes(StandardCharsets.UTF_8);
}
@Benchmark
public boolean bloomLookupPresent() {
return bloomFilter.mightContain(lookupKeyPresent); // FAST
}
@Benchmark
public boolean hashSetLookupPresent() {
return hashSet.contains(lookupStringPresent);
}
@Benchmark
public boolean bloomLookupAbsent() {
return bloomFilter.mightContain(lookupKeyAbsent); // FAST
}
@Benchmark
public boolean hashSetLookupAbsent() {
return hashSet.contains(lookupStringAbsent);
}
}
| Operation | 1M elements | 10M elements | ||
|---|---|---|---|---|
| Bloom (ns) | HashSet (ns) | Bloom (ns) | HashSet (ns) | |
| Lookup present | 85 | 42 | 88 | 65 |
| Lookup absent | 78 | 38 | 82 | 48 |
| Memory | 1.14 MB | 128 MB | 11.4 MB | 1.3 GB |
HashSet is faster per lookup because it computes one hash and follows one pointer. The Bloom filter computes 7 hash-derived positions and checks 7 bits. But the Bloom filter uses 112x less memory at 1M elements and 114x less memory at 10M elements.
Symptom: Ingestion pipeline OOM at 15 million articles in the dedup HashSet. Cause: HashSet grows without bound as articles accumulate. Fix: Replace HashSet with Bloom filter. Accept 1% false positives (duplicates that slip through are caught by the database’s unique constraint). Trade-off: 1% of duplicates reach the database layer, adding ~500 unnecessary insert-or-ignore queries per day out of 50,000 new articles. The database handles this without measurable impact.
Migrating to Guava BloomFilter
The scratch implementation works but lacks: thread safety, serialization, merge (union) of multiple filters, and funnel-based hashing for arbitrary object types. Guava provides all of these:
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
public class GuavaBloomFilterExample {
private final BloomFilter<CharSequence> filter;
public GuavaBloomFilterExample(int expectedInsertions, double fpp) {
this.filter = BloomFilter.create(
Funnels.stringFunnel(StandardCharsets.UTF_8),
expectedInsertions,
fpp
);
}
public boolean addArticle(String contentHash) {
return filter.put(contentHash); // Returns true if bits changed
}
public boolean mightBeDuplicate(String contentHash) {
return filter.mightContain(contentHash);
}
public double currentFpp() {
return filter.expectedFpp();
}
}
Guava’s implementation uses Murmur3 128-bit hashing (better distribution than FNV-1a), optimized bit addressing, and supports serialization via writeTo(OutputStream) and readFrom(InputStream, Funnel).
// Content platform ingestion pipeline with Bloom filter
public class ArticleIngestionPipeline {
private final BloomFilter<CharSequence> seenArticles;
private final ArticleRepository repository;
public ArticleIngestionPipeline(ArticleRepository repository) {
this.seenArticles = BloomFilter.create(
Funnels.stringFunnel(StandardCharsets.UTF_8),
20_000_000, // Expected articles over filter lifetime
0.01 // 1% false positive rate
);
this.repository = repository;
// Load existing hashes from database on startup
repository.allContentHashes().forEach(seenArticles::put);
}
public IngestResult ingest(RawArticle raw) {
String contentHash = computeContentHash(raw);
// Fast path: Bloom filter says not seen
if (!seenArticles.mightContain(contentHash)) { // FAST: 82ns
seenArticles.put(contentHash);
Article article = transform(raw);
repository.save(article);
return IngestResult.NEW;
}
// Slow path: Bloom filter says maybe seen (1% false positive)
// Confirm with database
if (repository.existsByContentHash(contentHash)) { // SLOW: ~2ms
return IngestResult.DUPLICATE;
}
// False positive: article is new despite Bloom filter saying yes
seenArticles.put(contentHash);
Article article = transform(raw);
repository.save(article);
return IngestResult.NEW_FALSE_POSITIVE;
}
private String computeContentHash(RawArticle raw) {
// SHA-256 of title + body, hex-encoded
// Not shown: use MessageDigest
return "";
}
private Article transform(RawArticle raw) { return null; }
}
The pipeline checks the Bloom filter first (82ns). If negative, the article is definitely new: save it. If positive, confirm with the database (2ms). At 50,000 articles per day with 70% duplicates:
- Without Bloom filter: 50,000 * 2ms = 100 seconds of database queries
- With Bloom filter: 15,000 new * 82ns + 35,000 duplicate * 82ns + 350 false positive * 2ms = ~4ms + ~1ms + ~700ms = ~705ms
The Bloom filter reduces dedup query time from 100 seconds to under 1 second.
Counting Bloom Filters: Supporting Deletion
Standard Bloom filters do not support deletion because clearing a bit might affect other elements. A counting Bloom filter replaces each bit with a counter (typically 4 bits). Increment on add, decrement on remove:
public class CountingBloomFilter {
private final byte[] counters; // 4-bit counters packed into bytes
private final int numCounters;
private final int numHashFunctions;
public CountingBloomFilter(int expectedElements, double falsePositiveRate) {
this.numCounters = optimalNumBits(expectedElements, falsePositiveRate);
this.numHashFunctions = optimalNumHashes(numCounters, expectedElements);
// Two 4-bit counters per byte
this.counters = new byte[(numCounters + 1) / 2];
}
public void add(byte[] element) {
long hash64 = hash(element);
int h1 = (int) hash64;
int h2 = (int) (hash64 >>> 32);
for (int i = 0; i < numHashFunctions; i++) {
int idx = ((h1 + i * h2) & Integer.MAX_VALUE) % numCounters;
incrementCounter(idx);
}
}
public void remove(byte[] element) {
if (!mightContain(element)) return; // Prevent underflow
long hash64 = hash(element);
int h1 = (int) hash64;
int h2 = (int) (hash64 >>> 32);
for (int i = 0; i < numHashFunctions; i++) {
int idx = ((h1 + i * h2) & Integer.MAX_VALUE) % numCounters;
decrementCounter(idx);
}
}
public boolean mightContain(byte[] element) {
long hash64 = hash(element);
int h1 = (int) hash64;
int h2 = (int) (hash64 >>> 32);
for (int i = 0; i < numHashFunctions; i++) {
int idx = ((h1 + i * h2) & Integer.MAX_VALUE) % numCounters;
if (getCounter(idx) == 0) return false;
}
return true;
}
private int getCounter(int index) {
int byteIdx = index / 2;
return (index % 2 == 0)
? (counters[byteIdx] & 0x0F)
: ((counters[byteIdx] >>> 4) & 0x0F);
}
private void incrementCounter(int index) {
int byteIdx = index / 2;
int val = getCounter(index);
if (val < 15) { // Max 4-bit counter value
if (index % 2 == 0) {
counters[byteIdx] = (byte) ((counters[byteIdx] & 0xF0) | (val + 1));
} else {
counters[byteIdx] = (byte) ((counters[byteIdx] & 0x0F) | ((val + 1) << 4));
}
}
}
private void decrementCounter(int index) {
int byteIdx = index / 2;
int val = getCounter(index);
if (val > 0) {
if (index % 2 == 0) {
counters[byteIdx] = (byte) ((counters[byteIdx] & 0xF0) | (val - 1));
} else {
counters[byteIdx] = (byte) ((counters[byteIdx] & 0x0F) | ((val - 1) << 4));
}
}
}
private static long hash(byte[] data) {
long h = 0xcbf29ce484222325L;
for (byte b : data) { h ^= b & 0xFFL; h *= 0x100000001b3L; }
return h;
}
private static int optimalNumBits(int n, double p) {
return (int) Math.ceil(-n * Math.log(p) / (Math.log(2) * Math.log(2)));
}
private static int optimalNumHashes(int m, int n) {
return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
}
}
Trade-off: A counting Bloom filter uses 4x the memory of a standard Bloom filter (4 bits per counter vs 1 bit). For the content platform, standard Bloom filters suffice because articles are never “un-ingested.” Counting Bloom filters are useful when elements expire or are deleted, such as tracking active user sessions.