#! python
# Input data file digrams.txt is taken from: http://jnicholl.org/Cryptanalysis/Data/DigramFrequencies.php
import math
def percentile(sorted_set, starting=0, upto=1.0, key=None):
    """Yield the items whose preceding cumulative weight share is in range.

    Each item's weight is ``key(item)`` when ``key`` is given, otherwise the
    item itself.  Weights are divided by their overall sum, and a running
    total of these shares is kept while walking ``sorted_set`` in the order
    supplied.  An item is yielded when the running total *before* adding its
    own share is at least ``starting``; iteration stops as soon as that
    running total exceeds ``upto``.

    sorted_set -- iterable of items (any iterable; it is read only once).
    starting   -- lower bound on the cumulative share, inclusive.
    upto       -- upper bound on the cumulative share; the item that crosses
                  this bound is still yielded, the following ones are not.
    key        -- optional callable mapping an item to its numeric weight.

    Nothing is yielded when the input is empty or the weights sum to zero.
    """
    # Materialise once: the input is needed both for the total and for the
    # walk, and a one-shot iterator would be exhausted by the first pass.
    items = list(sorted_set)
    weights = [key(x) for x in items] if key else items

    # Force a float so that integer weights do not trigger floor division
    # on Python 2 (which would leave the running total stuck at zero).
    total = float(sum(weights))
    if not total:
        # No weight to distribute; avoid dividing by zero mid-iteration.
        return

    accum = 0.0
    for item, weight in zip(items, weights):
        if accum > upto:
            break
        if accum >= starting:
            yield item
        accum += weight / total
# Read the raw digram counts.  Lines starting with '#' are skipped; on every
# other line the first tab-separated field is dropped (presumably a row
# label -- confirm against the data file) and the rest are parsed as ints.
frequencies = []
with open('digrams.txt') as digram_file:  # 'with' guarantees the file is closed
    for line in digram_file:
        if not line.startswith('#'):
            frequencies.extend(int(x) for x in line.split('\t')[1:])

# Turn the counts into relative frequencies that sum to 1.
total = sum(frequencies)
normalize = [float(x)/total for x in frequencies]

# Pair each frequency with its information content in bits, -log2(p).
# A zero frequency is paired with 0 so it contributes nothing to the sum.
entropies = [(x, (-math.log(x, 2) if x > 0 else x)) for x in normalize]

# If digrams are chosen at the same rate as in English, each digram
# independently: the sum of p * -log2(p) over all digrams.
weighted_entropy = sum(f*e for f, e in entropies)

# If the most common digrams covering the top 90% of the frequency mass are
# chosen uniformly at random: log2 of how many digrams make up that set.
normalize.sort(reverse=True)
norm2 = len(list(percentile(normalize, upto=0.9)))
random_entropy = (-math.log(1/float(norm2), 2))

# Single-argument parenthesised print gives identical output on Python 2
# and also runs on Python 3.
print("entropy per digram:")
print("weighted: %f" % weighted_entropy)
print("random: %f" % random_entropy)