Bencoding in Python

Sat 11 May 2013

by minus

This time not as part of a challenge but because I quickly needed a parser for bencoding in Python. There are existing modules to do that, but "bencode" needs some BitTorrent library and "bcode" is not Py3-compatible out of the box. "bcoding" seems to work ~~but looks slow~~ and is about as fast/slow as my code (but verifies for correctness). I didn't look into those before writing my own though.

~~So far it can only decode:~~

Added dumping code. Dicts get loaded into ordered dictionaries now so the file order stays the same when loading and saving again. Also got rid of the ugly, un-pythonic classmethods.

#!/usr/bin/env python3
import io
import logging
from collections import OrderedDict

# Runs at import time: configures the root logger to emit DEBUG and above,
# prefixing each record with a timestamp, the level and the calling function.
logging.basicConfig(format="[%(asctime)s] %(levelname)s: %(funcName)s: %(message)s", level=logging.DEBUG)
# Logger named after this module; used for the parser's debug tracing.
L = logging.getLogger(__name__)

def load_file(filename):
    """Open *filename* in binary mode and return its decoded bencoded value."""
    with open(filename, "rb") as handle:
        decoded = load_io(handle)
    return decoded

def load(data):
    """Decode a bencoded value held in memory as a bytes object."""
    stream = io.BytesIO(data)
    return load_io(stream)

def load_io(data):
    """Decode a single bencoded value from a binary file-like object.

    Args:
        data: Object with a ``read(n)`` method returning ``bytes``, such as
            a file opened in ``"rb"`` mode or an ``io.BytesIO``.

    Returns:
        ``int`` for integers, ``bytes`` for strings, ``list`` for lists and
        ``OrderedDict`` (keys kept in input order) for dictionaries.  Data
        following the first complete value is left unread.

    Raises:
        ValueError: If the input is empty, truncated or otherwise malformed.
    """
    # Looked up by module name, so this is the module's own logger.
    log = logging.getLogger(__name__)

    def read_until(mark, initial=b"", include_end=False):
        """Read byte-wise up to *mark* and return ``initial`` + bytes read."""
        out = io.BytesIO()
        out.write(initial)
        while True:
            t = data.read(1)
            # read() yields b"" at EOF; without this check the loop would
            # spin forever on truncated input.
            if not t:
                raise ValueError(
                    "Unexpected end of data while looking for {!r}".format(mark))
            if t == mark:
                if include_end:
                    out.write(t)
                break
            out.write(t)
        return out.getvalue()

    def parse_list():
        """Collect values until the closing ``e`` marker."""
        out = []
        while True:
            ret = parse()
            # end of list encountered
            if ret is None:
                return out
            out.append(ret)

    def parse_dict():
        """Parse a dict body as a flat key/value list and pair it up."""
        tmplist = parse_list()
        # An odd count means the last key has no value; zip() would drop it
        # silently.
        if len(tmplist) % 2:
            raise ValueError(
                "Dictionary key without a value: {!r}".format(tmplist[-1]))
        return OrderedDict(zip(tmplist[::2], tmplist[1::2]))

    def parse():
        """Parse one value; return None when the end marker ``e`` is read."""
        t = data.read(1)
        if not t:
            raise ValueError("Unexpected end of data")
        # integer, i1234e
        if t == b"i":
            val = int(read_until(b"e"))
            log.debug("int:%s", val)
            return val
        # string, 4:test
        if t.isdigit():
            length = int(read_until(b":", initial=t))
            val = data.read(length)
            # A short read means the payload is cut off.
            if len(val) != length:
                raise ValueError(
                    "String truncated: expected {} bytes, got {}".format(
                        length, len(val)))
            log.debug("str:%s", val)
            return val
        # end of a list/dict
        if t == b"e":
            return None
        if t == b"l":
            log.debug("list start")
            ret = parse_list()
            log.debug("list done")
            return ret
        if t == b"d":
            log.debug("dict start")
            ret = parse_dict()
            log.debug("dict done")
            return ret
        raise ValueError("Unknown instruction: {}".format(t))

    result = parse()
    # None is only meaningful inside a list/dict; at top level it is a stray
    # end marker, not a value.
    if result is None:
        raise ValueError("Unexpected end marker outside of a list or dict")
    return result

def dump(data):
    """Encode *data* as bencoding and return the resulting bytes.

    Supported types: ``int`` (including ``bool``, written as 1/0), ``bytes``,
    ``str`` (encoded as UTF-8), ``list``/``tuple`` and ``dict``.  Containers
    are encoded recursively.  Dict entries are written in the dict's own
    iteration order rather than sorted, so a loaded file is written back
    with its keys in the same order.

    Raises:
        ValueError: If *data* contains a value of an unsupported type.
    """
    output = io.BytesIO()

    def dump_partial(x):
        """Append the encoding of one value to the shared output buffer."""
        # Text is stored as a UTF-8 byte string.
        if isinstance(x, str):
            x = x.encode("utf-8")
        if isinstance(x, int):
            output.write(b"i")
            # int() first so that bool renders as 1/0 rather than True/False.
            output.write(str(int(x)).encode("ascii"))
            output.write(b"e")
        elif isinstance(x, bytes):
            output.write(str(len(x)).encode("ascii"))
            output.write(b":")
            output.write(x)
        elif isinstance(x, (list, tuple)):
            output.write(b"l")
            for entry in x:
                dump_partial(entry)
            output.write(b"e")
        elif isinstance(x, dict):
            output.write(b"d")
            for key, entry in x.items():
                dump_partial(key)
                dump_partial(entry)
            output.write(b"e")
        else:
            raise ValueError("Can't encode type {}".format(type(x)))

    dump_partial(data)
    return output.getvalue()

def dump_file(data, filename):
    """Bencode *data* and write the result to *filename* in binary mode.

    Raises:
        ValueError: If *data* cannot be encoded; the file is left untouched
            in that case.
    """
    # Encode before opening: "wb" truncates the target, so an encoding error
    # after open() would leave an existing file empty.
    encoded = dump(data)
    with open(filename, "wb") as f:
        f.write(encoded)

# Script entry point: decode the file named by the first command-line
# argument and pretty-print the resulting structure.
if __name__ == "__main__":
    import pprint
    import sys

    source_path = sys.argv[1]
    pprint.pprint(load_file(source_path))