netarch/codebreak.py

369 lines
7.6 KiB
Python

#! /usr/bin/python
## Codebreaking tools
## 2007 Neale Pickett
## I should get an LAUR for this so we can share it.
from sets import Set
from pprint import pprint
# Relative frequency of each letter in English text, keyed by uppercase
# letter (figures from the Wikipedia article "Letter Frequencies").
english_frequency = dict([
    ('A', .08167), ('B', .01492), ('C', .02782), ('D', .04253),
    ('E', .12702), ('F', .02228), ('G', .02015), ('H', .06094),
    ('I', .06966), ('J', .00153), ('K', .00772), ('L', .04025),
    ('M', .02406), ('N', .06749), ('O', .07507), ('P', .01929),
    ('Q', .00095), ('R', .05987), ('S', .06327), ('T', .09056),
    ('U', .02758), ('V', .00978), ('W', .02360), ('X', .00150),
    ('Y', .01974), ('Z', .00074),
])
##
## Binary stuff
##
def bin(i):
    """Return the binary representation of i as a string of '0'/'1' digits.

    The most significant digit comes first and there is no '0b' prefix.
    Zero is rendered as '0'; negative numbers are rendered as '-' followed
    by the digits of their absolute value.
    """
    if i == 0:
        # The digit loop below emits nothing for zero.
        return '0'
    sign = ''
    if i < 0:
        sign = '-'
        i = -i
    digits = []
    while i > 0:
        digits.append(str(i & 1))
        i >>= 1
    # Digits were collected least significant first.
    digits.reverse()
    return sign + ''.join(digits)
class bitvector:
    """Fixed-width run of bits stored in a single integer.

    Indexing and slicing count from the most significant bit, so index 0
    is the leftmost bit shown by repr().  Iteration is different: it
    yields the bits least significant first.
    """

    def __init__(self, i, length=None):
        """Create a bit vector from a string or an integer.

        i      -- a string, whose characters each contribute their ordinal
                  shifted in 8 bits at a time (first character most
                  significant), or an integer holding the bits directly
        length -- width in bits; defaults to 8 bits per character for a
                  string, or to the number of significant bits of a
                  positive integer (0 for values <= 0)
        """
        if isinstance(i, str):
            val = 0
            for c in i:
                val = (val << 8) + ord(c)
            self._val = val
            if length is None:
                length = len(i) * 8
        else:
            self._val = i
            if length is None:
                # Count the significant bits of the value.
                length = 0
                while i > 0:
                    i >>= 1
                    length += 1
        self._len = length

    def __len__(self):
        """Return the width of the vector in bits."""
        return self._len

    def __getitem__(self, idx):
        """Return bit idx (0 = most significant) as an int, or a sub-vector.

        Negative indices count from the end.  A slice object (as passed by
        Python 3 for bv[a:b]) is forwarded to __getslice__; only a step of
        1 is supported.  Raises IndexError for an index outside the vector.
        """
        if isinstance(idx, slice):
            start, stop, step = idx.indices(self._len)
            if step != 1:
                raise ValueError('bitvector slices must have a step of 1')
            return self.__getslice__(start, stop)
        if idx < 0:
            idx += self._len
        if not 0 <= idx < self._len:
            raise IndexError('bitvector index out of range')
        # Bit 0 is the most significant, i.e. it sits _len - 1 places up.
        return int((self._val >> (self._len - 1 - idx)) & 1)

    def __getslice__(self, a, b):
        """Return bits a (inclusive) to b (exclusive) as a new bitvector.

        Both bounds are clamped to the vector; an empty or reversed range
        yields a zero-length vector.
        """
        a = min(max(a, 0), self._len)
        b = min(max(b, a), self._len)
        width = b - a
        # Drop everything to the right of b, then keep the low `width` bits.
        shifted = self._val >> (self._len - b)
        return bitvector(shifted & ((1 << width) - 1), length=width)

    def __iter__(self):
        """Yield each bit as an int, least significant bit first."""
        v = self._val
        for _ in range(self._len):
            yield int(v & 1)
            v >>= 1

    def __str__(self):
        """Return the bits as characters, 8 bits per character.

        Characters are taken from the most significant end; if the width
        is not a multiple of 8 the final character holds the remaining
        low-order bits.
        """
        chars = []
        remaining = self._len
        while remaining > 8:
            remaining -= 8
            chars.append(chr((self._val >> remaining) & 0xFF))
        if remaining > 0:
            chars.append(chr(self._val & ((1 << remaining) - 1)))
        return ''.join(chars)

    def __int__(self):
        """Return the stored integer value."""
        return self._val

    def __repr__(self):
        """Return '<bitvector ...>' with bits most significant first."""
        # Iteration is LSB-first, so reverse it for display.
        bits = list(self)
        bits.reverse()
        return '<bitvector ' + ''.join(str(x) for x in bits) + '>'
##
## Statistical stuff
##
def basedist(l, frequency=None):
    """Return a string of roughly l letters following a letter distribution.

    l         -- target length of the result
    frequency -- mapping of letter -> relative frequency; defaults to the
                 module-level english_frequency table

    Each letter is repeated int(frequency * l) times.  Because every count
    is truncated, the result is usually somewhat shorter than l.  Letters
    appear in the mapping's iteration order, each as one contiguous run.
    """
    if frequency is None:
        frequency = english_frequency
    return ''.join(c * int(n * l) for c, n in frequency.items())
##
## Factoring stuff
##
def isPrime(number):
    """Return True if number is prime, False otherwise.

    Numbers below 2 are not prime.
    """
    return number > 1 and smallestFactor(number) is None


def smallestFactor(number):
    """Return the smallest divisor d of number with 2 <= d < number.

    Returns None when there is no such divisor, i.e. when number is prime
    or less than 4 has no proper factor (this includes 0, 1 and negatives).
    """
    x = 2
    # A composite number always has a divisor no larger than its square
    # root, so there is no need to search beyond that.
    while x * x <= number:
        if number % x == 0:
            return x
        x += 1
    return None


def factor(number):
    """Return prime factors for number, smallest first.

    The factors multiply back to number, e.g. factor(12) == [2, 2, 3].
    A prime is returned as a one-element list, and any number that has no
    proper factor (0, 1, negatives) is returned unchanged as [number].
    """
    factors = []
    divisor = smallestFactor(number)
    while divisor is not None:
        factors.append(divisor)
        # Floor division keeps the quotient an integer on Python 3 too.
        number //= divisor
        divisor = smallestFactor(number)
    factors.append(number)
    return factors
##
## Statistical analysis
##
def where(haystack, needle):
    """Return the offsets in haystack at which needle occurs.

    Offsets are absolute positions in haystack, in ascending order.
    Overlapping occurrences are all reported, e.g. 'aa' occurs in 'aaaa'
    at 0, 1 and 2.  Returns an empty list when needle does not occur.
    """
    ret = []
    pos = haystack.find(needle)
    while pos != -1:
        ret.append(pos)
        # Resume one past the last hit so overlapping matches are found
        # and the reported offsets stay relative to the whole haystack.
        pos = haystack.find(needle, pos + 1)
    return ret
def ngrams(n, haystack, min=2, repeats=False):
    """Index the n-grams of haystack by where they occur.

    n        -- length of the substrings to collect
    haystack -- text to scan
    min      -- keep only n-grams that occur at least this many times
    repeats  -- if true, keep only n-grams consisting of a single
                character repeated n times (such as 'ss' or 'ee')

    Returns a dict mapping each retained n-gram to the ascending list of
    offsets in haystack where it starts.  Overlapping occurrences are
    counted separately.  Only full-length n-grams are considered; the
    shorter fragments at the very end of the text are ignored.  An n
    below 1 yields an empty dict.
    """
    if n < 1:
        return {}
    positions = {}
    # Stopping at len - n keeps every window exactly n characters long.
    for i in range(len(haystack) - n + 1):
        needle = haystack[i:i + n]
        if repeats and needle != needle[0] * n:
            continue
        positions.setdefault(needle, []).append(i)
    acc = {}
    for needle, found in positions.items():
        if len(found) >= min:
            acc[needle] = found
    return acc
def freq(txt):
    """Return the 1-gram table of txt with no minimum count (min=0).

    The result is whatever ngrams() produces for single characters.
    """
    return ngrams(n=1, haystack=txt, min=0)
def bigrams(txt):
    """Return the 2-gram table of txt, using the default ngrams() settings."""
    return ngrams(n=2, haystack=txt)
def trigrams(txt):
    """Return the 3-gram table of txt, using the default ngrams() settings."""
    return ngrams(n=3, haystack=txt)
def freqgraph(f):
    """Print a text bar chart of the counts in f, largest count first.

    f maps a label to either an integer count or a sized collection (such
    as a list of positions), in which case its length is the count.  One
    line is printed per label in the form 'label: ####', with one '#' per
    count.  Labels with equal counts keep the order in which f yields them.
    """
    items = []
    for c, n in f.items():
        if not isinstance(n, int):
            n = len(n)
        items.append((c, n))
    # list.sort is stable even with reverse=True, so ties stay in place.
    items.sort(key=lambda item: item[1], reverse=True)
    for c, n in items:
        print('%s: %s' % (c, '#' * n))
def neighbors(txt):
    """Map each character to the set of characters seen next to it.

    Adjacency is taken from the keys returned by bigrams(txt), so only
    pairs that bigrams() reports contribute.  For every such pair both
    characters are recorded as neighbors of each other.

    Returns a dict of character -> set of neighboring characters.
    """
    out = {}
    for dg in bigrams(txt):
        first, second = dg[0], dg[1]
        out.setdefault(first, set()).add(second)
        out.setdefault(second, set()).add(first)
    return out
##
## Brute force tools
##
def rot(n, txt):
    """Caesar cipher: shift every ASCII letter of txt by n places.

    n   -- shift amount; any integer is accepted (negative shifts go
           backwards, and shifts wrap around modulo 26)
    txt -- text to encode

    Case is preserved.  Characters other than A-Z and a-z are copied
    through unchanged.
    """
    out = []
    for c in txt:
        if 'a' <= c <= 'z':
            base = ord('a')
        elif 'A' <= c <= 'Z':
            base = ord('A')
        else:
            out.append(c)
            continue
        # Work on the 0-25 offset so the modulo wraps within the alphabet.
        out.append(chr(base + (ord(c) - base + n) % 26))
    return ''.join(out)
def caesars(txt):
    """Return the 26 rotations of txt: rot(0, txt) through rot(25, txt)."""
    shifted = []
    for shift in range(26):
        shifted.append(rot(shift, txt))
    return shifted
# Tabula recta: a list of 26 rows, where row i is the uppercase alphabet
# passed through rot(i, ...) by caesars().
tabula_recta = caesars('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
def xor(n, txt):
    """Return txt with the code of every character XORed with n."""
    return ''.join(chr(ord(c) ^ n) for c in txt)
def xors(txt):
    """Return a list of xor(n, txt) for every n from 0 to 255, in order."""
    return [xor(key, txt) for key in range(256)]
def add(n, txt):
    """Return txt with n added to every character code, modulo 256."""
    # Python's % already yields a result in 0..255 for a negative sum,
    # so negative n wraps around correctly.
    return ''.join(chr((ord(c) + n) % 256) for c in txt)
def adds(txt):
    """Return a list of add(n, txt) for every n from 0 to 255, in order."""
    return [add(offset, txt) for offset in range(256)]
##
## Grep-like things within dictionary
##
def matches(str, tgt):
    """Return True if str and tgt share the same letter-repetition pattern.

    The two strings must have equal length, and there must be a one-to-one
    correspondence between their characters: every character of str always
    lines up with the same character of tgt, and vice versa.
    """
    if len(str) != len(tgt):
        return False
    forward = {}   # character of str -> character of tgt
    backward = {}  # character of tgt -> character of str
    for s, t in zip(str, tgt):
        # setdefault records the pairing on first sight and otherwise
        # hands back the earlier pairing for comparison.
        if forward.setdefault(s, t) != t:
            return False
        if backward.setdefault(t, s) != s:
            return False
    return True
def guess(pattern, wordlist='/usr/share/dict/words'):
    """Print and return the dictionary words that fit a letter pattern.

    pattern  -- text whose letter-repetition pattern candidates must share
                (compared with matches(), case-insensitively)
    wordlist -- path of a file holding one word per line

    Every matching word is printed, lowercased, as it is found.  Returns
    the list of matching words in file order.
    """
    ret = []
    pattern = pattern.lower()
    # The with-block closes the word list even if matching raises.
    with open(wordlist) as words:
        for word in words:
            word = word.strip().lower()
            if matches(word, pattern):
                print(word)
                ret.append(word)
    return ret
##
## Overview tools
##
def summary(txt):
    """Print an overview of txt to standard output.

    The report contains the length of txt and the factors of that length,
    followed by bar charts of single-character, bigram and trigram counts,
    a chart of doubled characters, and the neighbor sets of each
    character.  Each heading lists, in parentheses, the values commonly
    seen in English for comparison.
    """
    print("Length %s" % len(txt))
    print("Factors %s" % (factor(len(txt)),))
    print('')
    print("Frequency (etaoin shrdlcu)")
    freqgraph(freq(txt))
    print('')
    print("Bigrams (th er on an re he in ed nd ha at en es of or")
    print(" nt ea ti to it st io le is ou ar as de rt ve)")
    freqgraph(bigrams(txt))
    print('')
    print("Trigrams (the and tha ent ion tio for nde has nce edt")
    print(" tis oft sth men)")
    freqgraph(trigrams(txt))
    print('')
    # 4-letter words: that with have this will your from they know
    # want been good much some time
    print("Repeats (ss ee tt ff ll mm oo)")
    freqgraph(ngrams(2, txt, min=1, repeats=True))
    print('')
    print("Unique neighbors")
    pprint(neighbors(txt))
    print('')
def replace(txt, orig, repl):
    """Substitute characters of txt: orig[i] becomes repl[i].

    txt  -- text to transform
    orig -- characters to replace
    repl -- their replacements, paired with orig position by position;
            surplus entries in the longer of the two are ignored

    All single-character substitutions are applied at once, so the output
    of one pair is never rewritten by another and swaps such as
    replace(txt, 'ab', 'ba') work.  If a character is listed in orig more
    than once, its first pairing is used.

    If orig is a sequence containing entries that are not exactly one
    character long, the pairs are applied one after another with
    str.replace instead.
    """
    pairs = list(zip(orig, repl))
    if any(len(old) != 1 for old, _ in pairs):
        # Per-character lookup cannot express multi-character patterns.
        for old, new in pairs:
            txt = txt.replace(old, new)
        return txt
    table = {}
    for old, new in pairs:
        table.setdefault(old, new)
    return ''.join(table.get(c, c) for c in txt)