Much cleaner Esab32 decoding

Tries every permutation of the Esab32 chunks collected for a GUID and keeps the orderings that decode to only printable characters. This decodes all but one line of the data I currently have, and that remaining GUID is successfully decoded later, once another chunk has been added. So this is 100% successful.
2020-12-22 21:45:30 -07:00 · 2020-12-22 21:45:30 -07:00 · 3e0413f9f7
parent 7390cb6f7c
commit 3e0413f9f7
1 changed files with 117 additions and 71 deletions
--- a/sunburst.py
+++ b/sunburst.py
@ -4,7 +4,7 @@
 # Unclassified/FOUO
 #
 # Created: 2020-12-14 16:49:51
-# Last-modified: 2020-12-22 17:57:54
+# Last-modified: 2020-12-22 21:42:40
 #
 # Based on work by @RedDrip7 (twitter),
 # who should be getting more credit in the English-speaking world.
@ -26,10 +26,14 @@ knownDomains = [
    "avsvmcloud.com",
 ]
 def xor(key, buf):
    """XOR every byte of *buf* with *key*, repeating *key* as often as needed.

    Returns a new ``bytes`` object the same length as *buf* (empty if either
    argument is empty, because the pairing stops at the shorter iterable).
    """
    mixed = bytearray()
    for data_byte, key_byte in zip(buf, itertools.cycle(key)):
        mixed.append(data_byte ^ key_byte)
    return bytes(mixed)
-Bsae32Alphabet = "ph2eifo3n5utg1j8d94qrvbmk0sal76c"
+
 # Esab32 alphabet: 32 symbols. DecodeEsab32 looks each input character up
 # with .index() and uses that position as the character's 5-bit value.
 Esab32Alphabet = "ph2eifo3n5utg1j8d94qrvbmk0sal76c"
 # Alphabet of the substitution cipher handled by DecodeSubst.
 SubstitutionAlphabet = 'rq3gsalt6u1iyfzop572d49bnx8cvmkewhj'
 # Alphabet DecodeSubst switches to for the character that follows a '0'.
 SubstitutionAlphabet0 = '0_-.'
 def DecodeBase32(s: str):
@ -39,63 +43,109 @@ def DecodeBase32(s: str):
    It doesn't work.
    """
-    t = s.translate(Bsae32Alphabet,
+    trans = str.maketrans(Esab32Alphabet,
                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ2345678")
    t = s.translate(trans)
    while len(t) % 8 > 0:
        t += '='
    return base64.b32decode(t)
-def DecodeBsae32(s: str):
+def DecodeEsab32(s: str) -> (int, int):
-    """Decode using zany base32-like algorithm.
+    """Decode using big-endian base32 algorithm.
-    The following opinion has been formed hastily and could be misinformed:
+    Returns a bigint, and the number of bits contained therein
    This is not proper Base32. It's more like somebody read about Base32,
    implemented an encoder and decoder incorrectly, and went on to the next task.
    """
-    bits = 0
+    acc = bits = 0
    acc = 0
    for c in s:
-        acc |= Bsae32Alphabet.find(c) << bits
+        try:
            p = Esab32Alphabet.index(c)
        except ValueError:
            raise RuntimeError(
                "Not an Esab32 encoded character: %c in %s" % (c, s))
        acc |= p << bits
        bits += 5
-
+    return acc, bits
    out8 = []
    while bits > 0:
        out8.append(acc & 255)
        acc >>= 8
        bits -= 8
    if bits:
        del out8[-1]
    return bytes(out8)
-SubstitutionAlphabet = 'rq3gsalt6u1iyfzop572d49bnx8cvmkewhj'
+def DecodeSubst(s: str) -> str:
-SubstitutionXlat = str.maketrans(
+    alphabet = SubstitutionAlphabet
    SubstitutionAlphabet[4:] + SubstitutionAlphabet[:4], SubstitutionAlphabet)
 SubstitutionXlat0 = str.maketrans(
    SubstitutionAlphabet, ('0_-.' * 9)[:len(SubstitutionAlphabet)])
 def DecodeSubst(s: str):
    zeroBaby = False
    out = []
    for c in s:
        if c == '0':
-            zeroBaby = True
+            alphabet = SubstitutionAlphabet0
            continue
        if zeroBaby:
            out.append(c.translate(SubstitutionXlat0))
        else:
-            out.append(c.translate(SubstitutionXlat))
+            try:
-        zeroBaby = False
+                pos = (SubstitutionAlphabet.index(c) - 4) % len(alphabet)
-    return ''.join(out)
+            except ValueError:
                raise RuntimeError(
                    "Not a subst encoded character: %c in %s" % (c, s))
            out.append(alphabet[pos])
            alphabet = SubstitutionAlphabet
    return "".join(out)
 PayloadsByGuid = {}
-def DecodeDomain(domain: str):
+Guid = int
 def isprintable(c: int) -> bool:
    """Return True if byte value *c* is a printable ASCII character.

    Printable ASCII runs from 0x20 (space) through 0x7e ('~').  0x7f is DEL,
    a control character, so it is rejected; accepting it would let a
    wrongly-ordered decode containing DEL pass the "all printable" filter.
    """
    return 0x20 <= c < 0x7f
 def quopri(buf: bytes) -> str:
    """Render *buf* as quoted-printable text.

    The bytes are run through the ``quopri`` codec and the encoded result is
    decoded as UTF-8 to give a ``str``.
    """
    encoded = codecs.encode(buf, "quopri")
    return str(encoded, "utf-8")
 class DGADecoder:
    """Collect the payload chunks seen for one GUID and decode them together.

    Every call to :meth:`decode` adds a chunk to ``history`` and then decodes
    the whole history again.  The decoder starts in substitution mode and
    switches permanently to Esab32 mode the first time a chunk starting with
    "00" is seen.
    """
    def __init__(self, guid: Guid):
        self.guid = guid
        # Raw payload chunks; every call to decode() adds one.
        self.history = []
        # Bound method used to decode the accumulated history.
        self._decoder = self.DecodeSubst
    def decode(self, s: str):
        """Record chunk *s* and return the decode of everything seen so far."""
        if s.startswith("00"):
            self._decoder = self.DecodeEsab32
            # We'll throw away the information about which is first,
            # since we do a computationally intensive trick later to determine ordering
            s = s[2:]
            self.history.insert(0, s)
        else:
            self.history.append(s)
        return self._decoder()
    def DecodeSubst(self) -> str:
        """Decode each chunk with the substitution cipher; join longest first.

        Duplicates are dropped with ``dict.fromkeys`` rather than a ``set``
        so that chunks of equal length keep first-seen order and the result
        is the same on every run.
        """
        decodes = dict.fromkeys(DecodeSubst(x) for x in self.history)
        return ''.join(sorted(decodes, key=len, reverse=True))
    def DecodeEsab32(self) -> str:
        """Try every ordering of the chunks and keep the printable decodes.

        Returns the printable candidates joined with " | ".  If no ordering
        is fully printable, returns the quoted-printable form of the last
        ordering tried.  The cost grows factorially with the number of
        distinct chunks.
        """
        # Ordered de-duplication: a set of strings iterates in an order that
        # can change between interpreter runs, which made the order of the
        # reported candidates (and the fallback) non-reproducible.
        history = list(dict.fromkeys(x.rstrip("0") for x in self.history))
        # "Why don't we just mix up absolutely everything and see what happens?"
        # -- Ridcully, in Terry Pratchett's "Hogfather"
        possibilities = []
        buf = b""
        for attempt in itertools.permutations(history):
            acc, abits = DecodeEsab32(''.join(attempt))
            length = abits // 8
            # Serialize all the bits, then drop the trailing partial byte.
            buf = acc.to_bytes((abits + 7) // 8, 'little')[:length]
            if all(isprintable(b) for b in buf):
                # Yay it's probably okay
                possibilities.append(buf)
        # Well, we tried.
        if not possibilities:
            return quopri(buf)
        return " | ".join(quopri(candidate) for candidate in possibilities)
 DecodersByGuid = {}
 def DecodeDomain(domain: str) -> (Guid, int, str):
    s = domain.strip()
    foundDomain = None
    for d in knownDomains:
@ -110,35 +160,28 @@ def DecodeDomain(domain: str):
    assert(s[-1] == '.')
    s = s[:-1]
    if foundDomain == "avsvmcloud.com":
        return (None, None, "[Probably not a Sunburst domain]")
    if len(s) < 16:
-        return (None, None, "[unable to decode: too short for any known decoding rules]")
+        return (None, None, "[too short]")
    dec, _ = DecodeEsab32(s[:15])
    eguid = dec.to_bytes(10, 'little')[:9]
    guid = int.from_bytes(xor(eguid[0:1], eguid[1:]), 'big')
    eguid = DecodeBsae32(s[:15])
    unknown_a = s[15]
-    guid = xor(eguid[0:1], eguid[1:])
+    payload = s[16:]
    s = s[16:]
-    payloads = PayloadsByGuid.setdefault(guid, [])
+    decoder = DecodersByGuid.get(guid)
-    if s not in payloads:
+    if not decoder:
-        if s.startswith("00"):
+        decoder = DGADecoder(guid)
-            payloads.insert(0, s)
+        DecodersByGuid[guid] = decoder
        else:
            payloads.append(s)
            # People: friggin' preserve metadata, ugh. 
            # If I gave you every line of The Empire Strikes Back, 
            # sorted alphabetically, without timestamps,
            # could you reconstruct the movie?
            payloads.sort(key=len, reverse=True)
-    payload = ''.join(payloads)
+    decoded = decoder.decode(payload)
    if payload.startswith("00"):
        buf = DecodeBsae32(payload[2:])
        decoded = codecs.encode(buf, "quopri").decode("utf-8")
    else:
        decoded = DecodeSubst(payload)
    return (guid, unknown_a, decoded)
 class TextReader:
    def __init__(self, infile):
        self.infile = infile
@ -152,12 +195,14 @@ class TextReader:
 class CsvReader:
    def __init__(self, infile):
        self.reader = csv.DictReader(infile)
-        self.fieldnames = self.reader.fieldnames + ["guid", "unknown a", "decode"]
+        self.fieldnames = self.reader.fieldnames + \
            ["guid", "unknown a", "decode"]
    def __iter__(self):
        for record in self.reader:
            yield record
 def main():
    parser = argparse.ArgumentParser(
        description="Decode sunburst Domain Generation Algorithm (DGA) names")
@ -191,10 +236,11 @@ def main():
    for record in reader:
        name = record.get("name") or record.get("fqdn")
        guid, unknown_a, ptext = DecodeDomain(name)
-        record["guid"] = int.from_bytes(guid or b"", "big")
+        record["guid"] = guid
        record["unknown a"] = unknown_a
        record["decode"] = ptext
        writer.writerow(record)
 if __name__ == '__main__':
    main()