[ create a new paste ] login | about

Link: http://codepad.org/NPIDm0ta    [ raw code | fork ]

Ruby, pasted on Aug 26:
# k = number of hash functions
# n = number of input bits per hash function
# size of the bloom filter, in bits: 2^n
# size of the bloom filter, in bytes: 2^n / 8


# hash function i (i = 0 .. k-1)
def hashfunction(n, i, sha1)
	return (sha1 >> (i*n)) & ((1 << n) - 1)
end

def insert_bloomfilter(k, n, bloomfilter, sha1)
	k.times do |i|
		h = hashfunction(n, i, sha1)
		bloomfilter |= (1 << h)
	end
	return bloomfilter
end

def check_bloomfilter(k, n, bloomfilter, sha1)
	k.times do |i|
		h = hashfunction(n, i, sha1)
		if (bloomfilter & (1 << h)) == 0
			return false
		end
	end
	return true
end

def test_bloomfilter(k, n, gencount, testcount)
	bloomfilter = 0
	gencount.times do
		sha1 = rand(1<<160)
		bloomfilter = insert_bloomfilter(k, n, bloomfilter, sha1)
	end
	false_positives = 0
	testcount.times do
		sha1 = rand(1<<160)
		if check_bloomfilter(k, n, bloomfilter, sha1)
			false_positives += 1
		end
	end
	rate = false_positives / testcount.to_f * 100
	printf "gencount=%d: false positive rate %.1f\n", gencount, rate
end

gencounts = [100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000]
testcount = 1000
[[8, 20], [10, 16], [13, 12]].each do |k, n|
	puts "Testing bloom filter with k=#{k}, n=#{n}, testcount=#{testcount}"
	puts "Bloom filter size: #{2**n/8} bytes"
	gencounts.each do |gencount|
		test_bloomfilter(k, n, gencount, testcount)
	end
end



# Testing bloom filter with k=8, n=20, testcount=1000
# Bloom filter size: 131072 bytes
# gencount=100: false positive rate 0.0
# gencount=250: false positive rate 0.0
# gencount=500: false positive rate 0.0
# gencount=1000: false positive rate 0.0
# gencount=2500: false positive rate 0.0
# gencount=5000: false positive rate 0.0
# gencount=10000: false positive rate 0.0
# gencount=25000: false positive rate 0.0
# gencount=50000: false positive rate 0.0
# Testing bloom filter with k=10, n=16, testcount=1000
# Bloom filter size: 8192 bytes
# gencount=100: false positive rate 0.0
# gencount=250: false positive rate 0.0
# gencount=500: false positive rate 0.0
# gencount=1000: false positive rate 0.0
# gencount=2500: false positive rate 0.0
# gencount=5000: false positive rate 0.3
# gencount=10000: false positive rate 10.1
# gencount=25000: false positive rate 80.5
# gencount=50000: false positive rate 99.7
# Testing bloom filter with k=13, n=12, testcount=1000
# Bloom filter size: 512 bytes
# gencount=100: false positive rate 0.0
# gencount=250: false positive rate 0.0
# gencount=500: false positive rate 4.6
# gencount=1000: false positive rate 59.9
# gencount=2500: false positive rate 99.8
# gencount=5000: false positive rate 100.0
# gencount=10000: false positive rate 100.0
# gencount=25000: false positive rate 100.0
# gencount=50000: false positive rate 100.0


Create a new paste based on this one


Comments: