Bloom Filters: Duplicate Detection Without the Memory Cost

The main chapter introduced the Bloom filter concept and the math behind optimal sizing. This section builds a production-quality Bloom filter step by step, validates its false positive rate empirically, benchmarks it against HashSet, and shows the migration to Guava.

Building the Bloom Filter: Bit Array and Hashing

The core data structure is a bit array backed by a long[]. Each long holds 64 bits. Addressing bit i means accessing bits[i >>> 6] (which long) and 1L << (i & 63) (which bit within the long):

public class ProductionBloomFilter {

    private final long[] bits;
    private final int numBits;
    private final int numHashFunctions;
    private int insertionCount;

    public ProductionBloomFilter(int expectedInsertions, double falsePositiveRate) {
        if (expectedInsertions <= 0) {
            throw new IllegalArgumentException("expectedInsertions must be positive");
        }
        if (falsePositiveRate <= 0 || falsePositiveRate >= 1) {
            throw new IllegalArgumentException("falsePositiveRate must be in (0, 1)");
        }

        this.numBits = optimalNumBits(expectedInsertions, falsePositiveRate);
        this.numHashFunctions = optimalNumHashes(numBits, expectedInsertions);
        this.bits = new long[(numBits + 63) >>> 6];
        this.insertionCount = 0;
    }

    public boolean put(byte[] element) {
        long hash64 = hash(element, 0);
        int h1 = (int) hash64;
        int h2 = (int) (hash64 >>> 32);

        boolean bitsChanged = false;
        for (int i = 0; i < numHashFunctions; i++) {
            int combinedHash = h1 + i * h2;
            int bitIndex = (combinedHash & Integer.MAX_VALUE) % numBits;
            int longIndex = bitIndex >>> 6;
            long mask = 1L << (bitIndex & 63);

            if ((bits[longIndex] & mask) == 0) {
                bits[longIndex] |= mask;
                bitsChanged = true;
            }
        }
        if (bitsChanged) insertionCount++;
        return bitsChanged;
    }

    public boolean mightContain(byte[] element) {
        long hash64 = hash(element, 0);
        int h1 = (int) hash64;
        int h2 = (int) (hash64 >>> 32);

        for (int i = 0; i < numHashFunctions; i++) {
            int combinedHash = h1 + i * h2;
            int bitIndex = (combinedHash & Integer.MAX_VALUE) % numBits;
            if ((bits[bitIndex >>> 6] & (1L << (bitIndex & 63))) == 0) {
                return false;
            }
        }
        return true;
    }

    public double expectedFalsePositiveRate() {
        double fractionSet = (double) countSetBits() / numBits;
        return Math.pow(fractionSet, numHashFunctions);
    }

    public long memoryBytes() {
        return (long) bits.length * 8;
    }

    private int countSetBits() {
        int count = 0;
        for (long word : bits) count += Long.bitCount(word);
        return count;
    }

    private static long hash(byte[] data, long seed) {
        // FNV-1a 64-bit hash
        long h = 0xcbf29ce484222325L ^ seed;
        for (byte b : data) {
            h ^= b & 0xFFL;
            h *= 0x100000001b3L;
        }
        return h;
    }

    private static int optimalNumBits(int n, double p) {
        return (int) Math.ceil(-n * Math.log(p) / (Math.log(2) * Math.log(2)));
    }

    private static int optimalNumHashes(int m, int n) {
        return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
    }
}

Empirical False Positive Rate Validation

The theoretical false positive rate is a mathematical guarantee, but implementation bugs can violate it. Always validate empirically:

public class BloomFilterValidation {

    public static void main(String[] args) {
        int insertions = 1_000_000;
        double targetFPR = 0.01; // 1%

        ProductionBloomFilter filter = new ProductionBloomFilter(insertions, targetFPR);

        // Insert 1M elements
        for (int i = 0; i < insertions; i++) {
            filter.put(("article-" + i).getBytes(StandardCharsets.UTF_8));
        }

        // Test 1M elements that were NOT inserted
        int falsePositives = 0;
        int tests = 1_000_000;
        for (int i = insertions; i < insertions + tests; i++) {
            if (filter.mightContain(("article-" + i).getBytes(StandardCharsets.UTF_8))) {
                falsePositives++;
            }
        }

        double observedFPR = (double) falsePositives / tests;
        System.out.printf("Target FPR: %.4f%n", targetFPR);
        System.out.printf("Observed FPR: %.4f%n", observedFPR);
        System.out.printf("Memory: %.2f MB%n", filter.memoryBytes() / (1024.0 * 1024.0));
        System.out.printf("Hash functions: calculated from optimal formula%n");

        // Assert observed is within 2x of target
        // (statistical variance allows some deviation)
        assert observedFPR < targetFPR * 2.0 :
            "FPR too high: " + observedFPR + " vs target " + targetFPR;
    }
}

Expected output:

Target FPR: 0.0100
Observed FPR: 0.0098
Memory: 1.14 MB

The observed FPR is within 2% of the target. If it deviates significantly, the hash function has poor distribution or the bit count calculation is wrong.

JMH Benchmark: Bloom Filter vs HashSet

@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(2)
@State(Scope.Benchmark)
public class BloomVsHashSetBenchmark {

    @Param({"1000000", "10000000"})
    int size;

    ProductionBloomFilter bloomFilter;
    HashSet<String> hashSet;
    byte[] lookupKeyPresent;
    byte[] lookupKeyAbsent;
    String lookupStringPresent;
    String lookupStringAbsent;

    @Setup(Level.Trial)
    public void setup() {
        bloomFilter = new ProductionBloomFilter(size, 0.01);
        hashSet = new HashSet<>(size * 2);

        for (int i = 0; i < size; i++) {
            String key = "article-content-hash-" + i;
            bloomFilter.put(key.getBytes(StandardCharsets.UTF_8));
            hashSet.add(key);
        }

        lookupStringPresent = "article-content-hash-" + (size / 2);
        lookupKeyPresent = lookupStringPresent.getBytes(StandardCharsets.UTF_8);
        lookupStringAbsent = "article-content-hash-" + (size * 2);
        lookupKeyAbsent = lookupStringAbsent.getBytes(StandardCharsets.UTF_8);
    }

    @Benchmark
    public boolean bloomLookupPresent() {
        return bloomFilter.mightContain(lookupKeyPresent);    // FAST
    }

    @Benchmark
    public boolean hashSetLookupPresent() {
        return hashSet.contains(lookupStringPresent);
    }

    @Benchmark
    public boolean bloomLookupAbsent() {
        return bloomFilter.mightContain(lookupKeyAbsent);     // FAST
    }

    @Benchmark
    public boolean hashSetLookupAbsent() {
        return hashSet.contains(lookupStringAbsent);
    }
}

Operation	1M elements		10M elements
	Bloom (ns)	HashSet (ns)	Bloom (ns)	HashSet (ns)
Lookup present	85	42	88	65
Lookup absent	78	38	82	48
Memory	1.14 MB	128 MB	11.4 MB	1.3 GB

HashSet is faster per lookup because it computes one hash and follows one pointer. The Bloom filter computes 7 hash-derived positions and checks 7 bits. But the Bloom filter uses 112x less memory at 1M elements and 114x less memory at 10M elements.

Symptom: Ingestion pipeline OOM at 15 million articles in the dedup HashSet. Cause: HashSet grows without bound as articles accumulate. Fix: Replace HashSet with Bloom filter. Accept 1% false positives (duplicates that slip through are caught by the database’s unique constraint). Trade-off: 1% of duplicates reach the database layer, adding ~500 unnecessary insert-or-ignore queries per day out of 50,000 new articles. The database handles this without measurable impact.

Migrating to Guava BloomFilter

The scratch implementation works but lacks: thread safety, serialization, merge (union) of multiple filters, and funnel-based hashing for arbitrary object types. Guava provides all of these:

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class GuavaBloomFilterExample {

    private final BloomFilter<CharSequence> filter;

    public GuavaBloomFilterExample(int expectedInsertions, double fpp) {
        this.filter = BloomFilter.create(
            Funnels.stringFunnel(StandardCharsets.UTF_8),
            expectedInsertions,
            fpp
        );
    }

    public boolean addArticle(String contentHash) {
        return filter.put(contentHash);  // Returns true if bits changed
    }

    public boolean mightBeDuplicate(String contentHash) {
        return filter.mightContain(contentHash);
    }

    public double currentFpp() {
        return filter.expectedFpp();
    }
}

Guava’s implementation uses Murmur3 128-bit hashing (better distribution than FNV-1a), optimized bit addressing, and supports serialization via writeTo(OutputStream) and readFrom(InputStream, Funnel).

// Content platform ingestion pipeline with Bloom filter
public class ArticleIngestionPipeline {

    private final BloomFilter<CharSequence> seenArticles;
    private final ArticleRepository repository;

    public ArticleIngestionPipeline(ArticleRepository repository) {
        this.seenArticles = BloomFilter.create(
            Funnels.stringFunnel(StandardCharsets.UTF_8),
            20_000_000,  // Expected articles over filter lifetime
            0.01         // 1% false positive rate
        );
        this.repository = repository;

        // Load existing hashes from database on startup
        repository.allContentHashes().forEach(seenArticles::put);
    }

    public IngestResult ingest(RawArticle raw) {
        String contentHash = computeContentHash(raw);

        // Fast path: Bloom filter says not seen
        if (!seenArticles.mightContain(contentHash)) {     // FAST: 82ns
            seenArticles.put(contentHash);
            Article article = transform(raw);
            repository.save(article);
            return IngestResult.NEW;
        }

        // Slow path: Bloom filter says maybe seen (1% false positive)
        // Confirm with database
        if (repository.existsByContentHash(contentHash)) { // SLOW: ~2ms
            return IngestResult.DUPLICATE;
        }

        // False positive: article is new despite Bloom filter saying yes
        seenArticles.put(contentHash);
        Article article = transform(raw);
        repository.save(article);
        return IngestResult.NEW_FALSE_POSITIVE;
    }

    private String computeContentHash(RawArticle raw) {
        // SHA-256 of title + body, hex-encoded
        // Not shown: use MessageDigest
        return "";
    }

    private Article transform(RawArticle raw) { return null; }
}

The pipeline checks the Bloom filter first (82ns). If negative, the article is definitely new: save it. If positive, confirm with the database (2ms). At 50,000 articles per day with 70% duplicates:

Without Bloom filter: 50,000 * 2ms = 100 seconds of database queries
With Bloom filter: 15,000 new * 82ns + 35,000 duplicate * 82ns + 350 false positive * 2ms = ~4ms + ~1ms + ~700ms = ~705ms

The Bloom filter reduces dedup query time from 100 seconds to under 1 second.

Counting Bloom Filters: Supporting Deletion

Standard Bloom filters do not support deletion because clearing a bit might affect other elements. A counting Bloom filter replaces each bit with a counter (typically 4 bits). Increment on add, decrement on remove:

public class CountingBloomFilter {

    private final byte[] counters; // 4-bit counters packed into bytes
    private final int numCounters;
    private final int numHashFunctions;

    public CountingBloomFilter(int expectedElements, double falsePositiveRate) {
        this.numCounters = optimalNumBits(expectedElements, falsePositiveRate);
        this.numHashFunctions = optimalNumHashes(numCounters, expectedElements);
        // Two 4-bit counters per byte
        this.counters = new byte[(numCounters + 1) / 2];
    }

    public void add(byte[] element) {
        long hash64 = hash(element);
        int h1 = (int) hash64;
        int h2 = (int) (hash64 >>> 32);

        for (int i = 0; i < numHashFunctions; i++) {
            int idx = ((h1 + i * h2) & Integer.MAX_VALUE) % numCounters;
            incrementCounter(idx);
        }
    }

    public void remove(byte[] element) {
        if (!mightContain(element)) return; // Prevent underflow
        long hash64 = hash(element);
        int h1 = (int) hash64;
        int h2 = (int) (hash64 >>> 32);

        for (int i = 0; i < numHashFunctions; i++) {
            int idx = ((h1 + i * h2) & Integer.MAX_VALUE) % numCounters;
            decrementCounter(idx);
        }
    }

    public boolean mightContain(byte[] element) {
        long hash64 = hash(element);
        int h1 = (int) hash64;
        int h2 = (int) (hash64 >>> 32);

        for (int i = 0; i < numHashFunctions; i++) {
            int idx = ((h1 + i * h2) & Integer.MAX_VALUE) % numCounters;
            if (getCounter(idx) == 0) return false;
        }
        return true;
    }

    private int getCounter(int index) {
        int byteIdx = index / 2;
        return (index % 2 == 0)
            ? (counters[byteIdx] & 0x0F)
            : ((counters[byteIdx] >>> 4) & 0x0F);
    }

    private void incrementCounter(int index) {
        int byteIdx = index / 2;
        int val = getCounter(index);
        if (val < 15) { // Max 4-bit counter value
            if (index % 2 == 0) {
                counters[byteIdx] = (byte) ((counters[byteIdx] & 0xF0) | (val + 1));
            } else {
                counters[byteIdx] = (byte) ((counters[byteIdx] & 0x0F) | ((val + 1) << 4));
            }
        }
    }

    private void decrementCounter(int index) {
        int byteIdx = index / 2;
        int val = getCounter(index);
        if (val > 0) {
            if (index % 2 == 0) {
                counters[byteIdx] = (byte) ((counters[byteIdx] & 0xF0) | (val - 1));
            } else {
                counters[byteIdx] = (byte) ((counters[byteIdx] & 0x0F) | ((val - 1) << 4));
            }
        }
    }

    private static long hash(byte[] data) {
        long h = 0xcbf29ce484222325L;
        for (byte b : data) { h ^= b & 0xFFL; h *= 0x100000001b3L; }
        return h;
    }

    private static int optimalNumBits(int n, double p) {
        return (int) Math.ceil(-n * Math.log(p) / (Math.log(2) * Math.log(2)));
    }

    private static int optimalNumHashes(int m, int n) {
        return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
    }
}

Trade-off: A counting Bloom filter uses 4x the memory of a standard Bloom filter (4 bits per counter vs 1 bit). For the content platform, standard Bloom filters suffice because articles are never “un-ingested.” Counting Bloom filters are useful when elements expire or are deleted, such as tracking active user sessions.