# k = number of hash functions
# n = number of input bits per hash function
# size of the bloom filter, in bits: 2^n
# size of the bloom filter, in bytes: 2^n / 8
# hash function i (i = 0 .. k-1)
def hashfunction(n, i, sha1)
return (sha1 >> (i*n)) & ((1 << n) - 1)
end
def insert_bloomfilter(k, n, bloomfilter, sha1)
k.times do |i|
h = hashfunction(n, i, sha1)
bloomfilter |= (1 << h)
end
return bloomfilter
end
def check_bloomfilter(k, n, bloomfilter, sha1)
k.times do |i|
h = hashfunction(n, i, sha1)
if (bloomfilter & (1 << h)) == 0
return false
end
end
return true
end
def test_bloomfilter(k, n, gencount, testcount)
bloomfilter = 0
gencount.times do
sha1 = rand(1<<160)
bloomfilter = insert_bloomfilter(k, n, bloomfilter, sha1)
end
false_positives = 0
testcount.times do
sha1 = rand(1<<160)
if check_bloomfilter(k, n, bloomfilter, sha1)
false_positives += 1
end
end
rate = false_positives / testcount.to_f * 100
printf "gencount=%d: false positive rate %.1f\n", gencount, rate
end
gencounts = [100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000]
testcount = 1000
[[8, 20], [10, 16], [13, 12]].each do |k, n|
puts "Testing bloom filter with k=#{k}, n=#{n}, testcount=#{testcount}"
puts "Bloom filter size: #{2**n/8} bytes"
gencounts.each do |gencount|
test_bloomfilter(k, n, gencount, testcount)
end
end
# Testing bloom filter with k=8, n=20, testcount=1000
# Bloom filter size: 131072 bytes
# gencount=100: false positive rate 0.0
# gencount=250: false positive rate 0.0
# gencount=500: false positive rate 0.0
# gencount=1000: false positive rate 0.0
# gencount=2500: false positive rate 0.0
# gencount=5000: false positive rate 0.0
# gencount=10000: false positive rate 0.0
# gencount=25000: false positive rate 0.0
# gencount=50000: false positive rate 0.0
# Testing bloom filter with k=10, n=16, testcount=1000
# Bloom filter size: 8192 bytes
# gencount=100: false positive rate 0.0
# gencount=250: false positive rate 0.0
# gencount=500: false positive rate 0.0
# gencount=1000: false positive rate 0.0
# gencount=2500: false positive rate 0.0
# gencount=5000: false positive rate 0.3
# gencount=10000: false positive rate 10.1
# gencount=25000: false positive rate 80.5
# gencount=50000: false positive rate 99.7
# Testing bloom filter with k=13, n=12, testcount=1000
# Bloom filter size: 512 bytes
# gencount=100: false positive rate 0.0
# gencount=250: false positive rate 0.0
# gencount=500: false positive rate 4.6
# gencount=1000: false positive rate 59.9
# gencount=2500: false positive rate 99.8
# gencount=5000: false positive rate 100.0
# gencount=10000: false positive rate 100.0
# gencount=25000: false positive rate 100.0
# gencount=50000: false positive rate 100.0