[ create a new paste ] login | about

Link: http://codepad.org/2UcL9UBK    [ raw code | output | fork ]

aaronla - Python, pasted on Aug 4:
#! python

# digrams.txt (the data file read below) was taken from: http://jnicholl.org/Cryptanalysis/Data/DigramFrequencies.php
import math

def percentile(sorted_set, starting=0, upto=1.0, key=None):
    """Yield the items of a weighted sequence that fall in a share window.

    Each item's weight is ``key(item)`` (or the item itself when ``key`` is
    not given).  Walking the sequence in the order supplied, a running
    share of the total weight is kept; an item is yielded when the share
    accumulated by the items *before* it is at least ``starting``, and
    iteration stops once that share exceeds ``upto``.

    sorted_set -- iterable of items, already in the desired order; it is
                  copied into a list first, so one-shot iterators work
    starting   -- lower bound (fraction of total weight) of the window
    upto       -- upper bound (fraction of total weight) of the window
    key        -- optional callable mapping an item to its weight

    Yields nothing when the input is empty or the total weight is zero.
    """
    # Materialise once: the input is traversed twice (sum, then walk), which
    # would silently yield nothing for a generator.
    items = list(sorted_set)
    weights = [key(item) if key else item for item in items]

    # float() forces true division even for integer weights on Python 2.
    total = float(sum(weights))
    if not total:
        # No weight to distribute, so no share can be computed.
        return

    accum = 0.0
    for item, weight in zip(items, weights):
        if accum > upto:
            break
        if accum >= starting:
            yield item
        accum += weight / total

# Flat list of every integer count read from the digram table.
frequencies = []

# Lines starting with '#' are skipped.  On every other line the first
# tab-separated field is dropped (presumably a row label -- confirm against
# the data file) and the remaining fields are parsed as integer counts.
# The `with` block guarantees the file is closed even if parsing fails.
with open('digrams.txt') as table:
    for line in table:
        if not line.startswith('#'):
            frequencies.extend(int(x) for x in line.split('\t')[1:])

# Turn the raw counts into relative frequencies that sum to 1.
total = sum(frequencies)
normalize = [float(x)/total for x in frequencies]

# Pair each frequency with its self-information in bits; a zero frequency
# keeps 0 as its second element so it contributes nothing to the sum below.
entropies = [(x, (-math.log(x,2) if x > 0 else x)) for x in normalize]

#if digrams chosen at same rate as in english, each digram independently
weighted_entropy = sum(f*e for f,e in entropies)

#if top 90% most common digrams chosen at random
# Sort most-frequent first, count how many entries percentile() yields for
# upto=0.9, and take the entropy of a uniform choice among that many.
normalize.sort(reverse=True)
norm2 = len(list(percentile(normalize, upto=0.9)))
random_entropy = (-math.log(1/float(norm2),2))



# Single-argument print(...) produces identical output on Python 2 and 3.
print("entropy per digram:")
print("weighted: %f" % weighted_entropy)
print("random:   %f" % random_entropy)


Output:
1
2
3
4
Traceback (most recent call last):
  Line 19, in <module>
    for line in open('digrams.txt'):
IOError: [Errno 2] No such file or directory: 'digrams.txt'


Create a new paste based on this one


Comments: