def morris_pratt(P, T):
"""Search for a pattern, P, in some text, T, using the Morris-Pratt
algorithm."""
def index(P):
"""Find all borders, that is, matching prefixes and suffixes, that
are in P by matching P against itself. The worst-case border is:
AA .. AAAB, all the same letter followed by a single different
letter.
!!! z is there simply for debugging purposes. If you want to see
how good/bad a pattern is, print z.
"""
F = range(0, len(P))
i = 1
j = 0
z = 0
# go over the characters in the pattern starting at the 2nd character
while i < len(P):
z = z+1
# does the current character in the pattern match the jth character
# from the beginning of P?
if P[i] is P[j]:
F[i] = F[i-1]+1
j = j+1
i = i+1
# try to extend the border from the position before j
elif j > 0:
j = F[j-1]
# no it doesn't, so set j=0 to tell us to start matching the
# pattern against itself from the beginning and state that this
# chatacter in the pattern ins't part of a prefix
else:
F[i] = 0
i = i+1
print "\t", z, "character comparisons (indexing)"
# we've indexed the pattern, return the failure function
return F
# using the results of the failure function, search for P in T.
def search(P, F, T):
"""Search for P in T using the failure function of P, F, to take
advantage of information we've figured out along the way."""
i = 0 # where we are in the text
m = 0 # where we started matching from in the text
j = 0 # where we are in the pattern, this is also useful for the
z = 0 # comparison count
r = -1 # what to return
pattern_len = len(P)
while i < len(T):
z = z+1
# yay we have a match, move the pattern and text indexes along
if T[i] is P[j]:
i = i+1
j = j+1
# we've matched the whole pattern
if j is pattern_len:
r = m
break
# no match, lets move past already matched characters
else:
j = F[j-1] # tell us where to start matching from in the
# pattern the next time we loop
i = i >= (m+j+1) and i+1 or (m+j+1)
m = i # reset where we should match from
print "\t", z, "character comparisons (matching)"
return r
# search the text
print "pattern length:", len(P)
print "text length:", len(T)
i = search(P, index(P), T)
print "pattern matched at index:", i
if __name__ == "__main__":
morris_pratt("aaab","baabbaabaaaab")