From 7390cb6f7cdfb1836ad815b639d21813e103b3e8 Mon Sep 17 00:00:00 2001
From: Neale Pickett <neale@lanl.gov>
Date: Tue, 22 Dec 2020 17:59:54 -0700
Subject: [PATCH] subst reconstitution is broken

---
 .gitignore  |   2 +
 Makefile    |  10 +++
 NOTES.md    |  24 +++++++
 sunburst.py | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 236 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 NOTES.md
 create mode 100755 sunburst.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ee8eda7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+decodes
+data
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..07185d6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,10 @@
+TARGETS += $(patsubst data/%.txt, decodes/%.csv, $(wildcard data/*.txt))
+TARGETS += $(patsubst data/%.csv, decodes/%.csv, $(wildcard data/*.csv))
+.PHONY: all
+all: $(TARGETS)
+# decodes/ is git-ignored, so create it on demand before writing into it.
+decodes/%.csv: data/%.txt sunburst.py
+	mkdir -p $(@D) && ./sunburst.py --outfile $@ $<
+
+decodes/%.csv: data/%.csv sunburst.py
+	mkdir -p $(@D) && ./sunburst.py --outfile $@ $<
diff --git a/NOTES.md b/NOTES.md
new file mode 100644
index 0000000..9c835dd
--- /dev/null
+++ b/NOTES.md
@@ -0,0 +1,24 @@
+Stuff I still can't decode
+====================
+
+A lot of these things end in `.local`.
+
+* fidelitycomm.lo|cal
+* milledgeville.l|ocal
+* cs.haystax.loc|al
+* signaturebank.l|ocal
+* vantagedatacente|rs.local
+
+I bet these end in `.local` too:
+
+* ABLE.loc|7l
+* ETC1.loc|7l
+* FVF.loca|m
+* MOC.loca|m
+* FSAR.LOC|7f
+
+* `7l = al`
+* `7f = AL`
+* `m = l`
+
+How does `SCMRI.lo|ujjc` become `SCMRI.local`?
\ No newline at end of file
diff --git a/sunburst.py b/sunburst.py
new file mode 100755
index 0000000..cf27bf4
--- /dev/null
+++ b/sunburst.py
@@ -0,0 +1,200 @@
+#! /usr/bin/python3
+
+# Neale Pickett <neale@lanl.gov>
+# Unclassified/FOUO
+#
+# Created: 2020-12-14 16:49:51
+# Last-modified: 2020-12-22 17:57:54
+#
+# Based on work by @RedDrip7 (twitter),
+# who should be getting more credit in the English-speaking world.
+
+import argparse
+import base64
+import codecs
+import csv
+import itertools
+import re
+import sys
+
+
+knownDomains = [  # suffixes DecodeDomain strips; the first endswith() match wins
+    "appsync-api.us-east-1.avsvmcloud.com",
+    "appsync-api.us-east-2.avsvmcloud.com",
+    "appsync-api.us-west-2.avsvmcloud.com",
+    "appsync-api.eu-west-1.avsvmcloud.com",
+    "avsvmcloud.com",  # keep last: it is a suffix of every entry above
+]
+
+def xor(key, buf):  # XOR buf bytewise with key, repeating key as needed
+    return bytes(map(int.__xor__, buf, itertools.cycle(key)))
+
+Bsae32Alphabet = "ph2eifo3n5utg1j8d94qrvbmk0sal76c"
+
+
+def DecodeBase32(s: str):
+    """Decode s as standard Base32 spelled in Bsae32Alphabet.
+
+    Not used by sunburst: the malware's encoding is not real Base32, so this
+    does not decode its data. Raises binascii.Error on invalid Base32.
+    """
+    # str.translate needs a maketrans table; the RFC alphabet has 32 symbols.
+    t = s.translate(str.maketrans(Bsae32Alphabet,
+                                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"))
+    while len(t) % 8 > 0:
+        t += '='
+    return base64.b32decode(t)
+
+
+def DecodeBsae32(s: str):
+    """Decode s with sunburst's base32-like scheme and return the bytes.
+
+    Each character contributes the 5 bits of its index in Bsae32Alphabet,
+    with the first character landing in the least-significant bits. Bytes
+    are then read off from the least-significant end, and any trailing
+    bits that do not fill a whole byte are discarded.
+
+    The following opinion has been formed hastily and could be misinformed:
+
+    This is not proper Base32. It's more like somebody read about Base32,
+    implemented an encoder and decoder incorrectly, and went on to the next task.
+    """
+
+    # Pack every 5-bit group into one big integer, first character lowest.
+    packed = 0
+    for position, symbol in enumerate(s):
+        # NOTE: find() yields -1 for a character outside the alphabet.
+        packed |= Bsae32Alphabet.find(symbol) << (5 * position)
+
+    # Only complete bytes are emitted; a partial final byte is dropped.
+    whole_bytes = (5 * len(s)) // 8
+    return bytes(
+        (packed >> (8 * n)) & 255 for n in range(whole_bytes))
+
+
+SubstitutionAlphabet = 'rq3gsalt6u1iyfzop572d49bnx8cvmkewhj'
+SubstitutionXlat = str.maketrans(  # each symbol -> the one 4 places earlier
+    SubstitutionAlphabet[4:] + SubstitutionAlphabet[:4], SubstitutionAlphabet)
+SubstitutionXlat0 = str.maketrans(  # symbol at index i -> '0_-.'[i % 4]
+    SubstitutionAlphabet, ('0_-.' * 9)[:len(SubstitutionAlphabet)])
+
+
+def DecodeSubst(s: str):
+    """Undo the substitution cipher; a '0' escapes the next character."""
+    pieces = []
+    escaped = False
+    for ch in s:
+        if ch == '0':
+            escaped = True
+        else:
+            # After a '0' the punctuation table applies, otherwise the shift.
+            table = SubstitutionXlat0 if escaped else SubstitutionXlat
+            pieces.append(ch.translate(table))
+            escaped = False
+    return ''.join(pieces)
+
+PayloadsByGuid = {}  # guid bytes -> payload fragments seen so far for that guid
+
+def DecodeDomain(domain: str):
+    """Decode one sunburst DGA hostname into (guid, unknown_a, text).
+
+    Payload fragments accumulate per GUID in PayloadsByGuid, so the text
+    depends on earlier calls. guid and unknown_a are None when there is
+    nothing to decode. Raises RuntimeError for an unrecognised name.
+    """
+    s = domain.strip()
+    foundDomain = next((d for d in knownDomains if s.endswith(d)), None)
+    if not foundDomain:
+        raise RuntimeError("Can't find domain for %s" % s)
+    s = s[:-len(foundDomain)]
+    if not s:
+        return (None, None, "[no data transmitted]")
+    if not s.endswith('.'):  # explicit check: an assert vanishes under -O
+        raise RuntimeError("Can't find domain for %s" % domain.strip())
+    s = s[:-1]
+    if len(s) < 16:
+        return (None, None, "[unable to decode: too short for any known decoding rules]")
+
+    eguid = DecodeBsae32(s[:15])
+    unknown_a = s[15]
+    guid = xor(eguid[0:1], eguid[1:])  # first byte is the XOR key for the rest
+    s = s[16:]
+
+    payloads = PayloadsByGuid.setdefault(guid, [])
+    if s not in payloads:
+        if s.startswith("00"):
+            payloads.insert(0, s)
+        else:
+            payloads.append(s)
+            # People: friggin' preserve metadata, ugh.
+            # If I gave you every line of The Empire Strikes Back,
+            # sorted alphabetically, without timestamps,
+            # could you reconstruct the movie?
+            payloads.sort(key=len, reverse=True)
+    payload = ''.join(payloads)
+    if payload.startswith("00"):
+        buf = DecodeBsae32(payload[2:])
+        decoded = codecs.encode(buf, "quopri").decode("utf-8")
+    else:
+        decoded = DecodeSubst(payload)
+    return (guid, unknown_a, decoded)
+
+class TextReader:
+    """Wrap a text stream of hostnames, one per line, as 'name' records."""
+    def __init__(self, infile):
+        self.infile = infile
+        self.fieldnames = ["name"]
+
+    def __iter__(self):
+        return ({"name": line.strip()} for line in self.infile)
+    
+
+class CsvReader:
+    """Iterate CSV rows as dicts; fieldnames holds only the input columns."""
+    def __init__(self, infile):
+        self.reader = csv.DictReader(infile)
+        # main() appends the decode columns, so they must not be added here.
+        self.fieldnames = list(self.reader.fieldnames or [])
+    def __iter__(self):
+        yield from self.reader
+
+def main():  # CLI entry point: decode each input name and write the rows as CSV
+    parser = argparse.ArgumentParser(
+        description="Decode sunburst Domain Generation Algorithm (DGA) names")
+    parser.add_argument("--text", dest="input", action="store_const", const="text",
+                        help="Parse bambenek-style: list of fqdns, one per line")
+    parser.add_argument("--csv", dest="input", action="store_const", const="csv",
+                        help="Parse CSV: records must be in a 'name' or 'fqdn' field")
+    parser.add_argument("infile", nargs="?",
+                        type=argparse.FileType("r"), default=sys.stdin)
+    parser.add_argument("--outfile", nargs="?",
+                        type=argparse.FileType("w"), default=sys.stdout,
+                        help="CSV file to write (default stdout)")
+    args = parser.parse_args()
+    # An explicit --text/--csv wins; otherwise the input file's extension decides.
+    reader = None
+    if args.input == "text":
+        reader = TextReader(args.infile)
+    elif args.input == "csv":
+        reader = CsvReader(args.infile)
+    elif args.infile.name.endswith(".txt"):
+        reader = TextReader(args.infile)
+    elif args.infile.name.endswith(".csv"):
+        reader = CsvReader(args.infile)
+    else:
+        parser.print_help()
+        return
+    # Output columns: the reader's field names plus the three decode columns.
+    fieldnames = reader.fieldnames + ["guid", "unknown a", "decode"]
+    writer = csv.DictWriter(args.outfile, fieldnames)
+    writer.writeheader()
+    for record in reader:
+        name = record.get("name") or record.get("fqdn")
+        guid, unknown_a, ptext = DecodeDomain(name)
+        record["guid"] = int.from_bytes(guid or b"", "big")  # 0 when no GUID
+        record["unknown a"] = unknown_a
+        record["decode"] = ptext
+        writer.writerow(record)
+    
+if __name__ == '__main__':
+    main()