Bencoding in Python
This time not as part of a
challenge
but because I quickly needed a parser for bencoding in Python. There
are existing modules to do that, but
"bencode" needs
some BitTorrent library, and
"bcode" is not
Py3-compatible out of the box.
"bcoding" seems
to work, but it looks slow: it is about as fast (or slow) as my code,
although it does verify its input for correctness. I didn't look into
any of those before writing my own, though.
So far it can only decode:
Update: Added dumping code. Dicts now get loaded into ordered dictionaries, so the key order stays the same when a file is loaded and saved again. Also got rid of the ugly, un-Pythonic classmethods.
#!/usr/bin/env python3
import io
import logging
from collections import OrderedDict
# Configure root logging when the module is imported: DEBUG level, with
# records showing timestamp, level name, emitting function and message.
logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s: %(funcName)s: %(message)s",
)
# Module-level logger, named after this module.
L = logging.getLogger(__name__)
def load_file(filename):
    """Open *filename* in binary mode and decode its contents with load_io."""
    with open(filename, mode="rb") as stream:
        decoded = load_io(stream)
    return decoded
def load(data):
    """Decode the bytes object *data* by wrapping it in a BytesIO for load_io."""
    stream = io.BytesIO(data)
    return load_io(stream)
def load_io(data):
    """Decode a single bencoded value from a binary stream.

    Args:
        data: Binary file-like object; only ``data.read(n)`` is used.

    Returns:
        An int for ``i<digits>e``, bytes for ``<length>:<payload>``, a list
        for ``l...e`` and an OrderedDict for ``d...e`` (entries kept in
        stream order). ``None`` is returned if the very first byte is a
        bare ``e`` terminator.

    Raises:
        ValueError: If the stream ends prematurely, a number is malformed,
            a dictionary has a key without a value, or an unknown type
            byte is encountered.
    """
    log = logging.getLogger(__name__)

    def read_byte():
        # read(1) returns b"" at end of stream; without this check a
        # truncated input would make read_until() spin forever.
        byte = data.read(1)
        if not byte:
            raise ValueError("Unexpected end of data")
        return byte

    def read_until(mark, initial=b""):
        # Collect bytes (after the already-consumed `initial`) up to, but
        # not including, `mark`.
        out = bytearray(initial)
        while True:
            byte = read_byte()
            if byte == mark:
                return bytes(out)
            out += byte

    def parse_list():
        out = []
        while True:
            ret = parse()
            # parse() returns None only for the "e" terminator.
            if ret is None:
                return out
            out.append(ret)

    def parse_dict():
        # A dict body is a flat key, value, key, value, ... sequence.
        flat = parse_list()
        if len(flat) % 2:
            raise ValueError("Dictionary key without a value")
        return OrderedDict(zip(flat[::2], flat[1::2]))

    def parse():
        t = read_byte()
        # integer, i1234e
        if t == b"i":
            val = int(read_until(b"e"))
            log.debug("int:%s", val)
            return val
        # string, 4:test
        if t.isdigit():
            length = int(read_until(b":", initial=t))
            val = data.read(length)
            if len(val) != length:
                raise ValueError("Unexpected end of data")
            log.debug("str:%s", val)
            return val
        # end of a list/dict
        if t == b"e":
            return None
        if t == b"l":
            log.debug("list start")
            ret = parse_list()
            log.debug("list done")
            return ret
        if t == b"d":
            log.debug("dict start")
            ret = parse_dict()
            log.debug("dict done")
            return ret
        raise ValueError("Unknown instruction: {}".format(t))

    return parse()
def dump(data):
    """Encode a Python value as bencoded bytes.

    Args:
        data: An int, bytes, str, list or dict, nested arbitrarily. str
            values are written as UTF-8 byte strings. Dict entries are
            written in the dict's own iteration order.

    Returns:
        The bencoded representation as a bytes object.

    Raises:
        ValueError: If a value of an unsupported type is encountered.
    """
    output = io.BytesIO()

    def dump_partial(x):
        # Text is stored as a byte string; str has encode(), not decode().
        if isinstance(x, str):
            x = x.encode("utf-8")
        if isinstance(x, int):
            output.write(b"i")
            # int() normalises bool (an int subclass) to 1/0 so the
            # output stays a valid bencoded integer.
            output.write(str(int(x)).encode("ascii"))
            output.write(b"e")
        elif isinstance(x, bytes):
            output.write(str(len(x)).encode("ascii"))
            output.write(b":")
            output.write(x)
        elif isinstance(x, list):
            output.write(b"l")
            for entry in x:
                dump_partial(entry)
            output.write(b"e")
        elif isinstance(x, dict):
            output.write(b"d")
            for key, entry in x.items():
                dump_partial(key)
                dump_partial(entry)
            output.write(b"e")
        else:
            raise ValueError("Can't encode type {}".format(type(x)))

    dump_partial(data)
    return output.getvalue()
def dump_file(data, filename):
    """Bencode *data* with dump() and write the result to *filename*.

    Args:
        data: Value to encode; passed unchanged to dump().
        filename: Path of the file to write, opened in binary write mode.
    """
    # Encode before opening: mode "wb" truncates the target, so encoding
    # first means a failing dump() leaves an existing file untouched.
    encoded = dump(data)
    with open(filename, "wb") as f:
        f.write(encoded)
if __name__ == "__main__":
    # Script mode: decode the file named by the first command-line
    # argument and pretty-print the resulting structure.
    import sys
    from pprint import pprint

    decoded = load_file(sys.argv[1])
    pprint(decoded)