|
| 1 | +# -*- coding: UTF-8 -*- |
| 2 | +"""LZ77 Codec - Lempel-Ziv 1977 compression algorithm. |
| 3 | +
|
| 4 | +NB: Not an encoding properly speaking. |
| 5 | +
|
| 6 | +This codec: |
| 7 | +- en/decodes strings from str to str |
| 8 | +- en/decodes strings from bytes to bytes |
| 9 | +- decodes file content to str (read) |
| 10 | +- encodes file content from str to bytes (write) |
| 11 | +
|
| 12 | +Inspired from: https://github.com/manassra/LZ77-Compressor |
| 13 | +""" |
| 14 | +from ..__common__ import * |
| 15 | + |
| 16 | + |
| 17 | +__examples__ = {'enc-dec(lz77)': ["test", "This is a test", "@random{1024}"]} |
| 18 | + |
| 19 | + |
| 20 | +_B2b = lambda B: bin(B if isinstance(B, int) else ord(B))[2:].zfill(8) |
| 21 | +_b2B = lambda bt: "".join(chr(int(bt[i:i+8], 2)) for i in range(0, len(bt), 8)) |
| 22 | +WINDOW_SIZE = 20 |
| 23 | + |
| 24 | + |
| 25 | +def _find_longest_match(data, pos): |
| 26 | + """ Finds the longest match to a substring starting at the current position (pos) in the lookahead buffer from |
| 27 | + the history window. """ |
| 28 | + eob, bmd, bml = min(pos + 15, len(data) + 1), -1, -1 |
| 29 | + for j in range(pos + 2, eob): |
| 30 | + start = max(0, pos - WINDOW_SIZE) |
| 31 | + substr = data[pos:j] |
| 32 | + l = len(substr) |
| 33 | + for i in range(start, pos): |
| 34 | + n, r = l // (pos - i), l % (pos - i) |
| 35 | + if data[i:pos] * n + data[i:i+r] == substr and l > bml: |
| 36 | + bmd, bml = pos - i, l |
| 37 | + if bmd > 0 and bml > 0: |
| 38 | + return bmd, bml |
| 39 | + |
| 40 | + |
| 41 | +def lz77_compress(input, errors="strict"): |
| 42 | + """ Compresses the given data by applying LZ77 compression algorithm. """ |
| 43 | + i, l, bits = 0, len(input), "" |
| 44 | + while i < l: |
| 45 | + try: |
| 46 | + bmd, bml = _find_longest_match(input, i) |
| 47 | + bits += "1" + _B2b(bmd >> 4) + _B2b(((bmd & 0xf) << 4) | bml) |
| 48 | + i += bml |
| 49 | + except TypeError: |
| 50 | + bits += "0" + _B2b(input[i]) |
| 51 | + i += 1 |
| 52 | + bits += "0" * ((8 - (len(bits) % 8)) % 8) |
| 53 | + return _b2B(bits), l |
| 54 | + |
| 55 | + |
| 56 | +def lz77_decompress(input, errors="strict"): |
| 57 | + """ Decompresses the given data. """ |
| 58 | + out, d = "", "".join(_B2b(c) for c in input) |
| 59 | + while len(d) >= 9: |
| 60 | + flag, d = d[0], d[1:] |
| 61 | + if flag == "0": |
| 62 | + out += _b2B(d[:8]) |
| 63 | + d = d[8:] |
| 64 | + else: |
| 65 | + B1, B2 = int(d[:8], 2), int(d[8:16], 2) |
| 66 | + d = d[16:] |
| 67 | + dist = (B1 << 4) | (B2 >> 4) |
| 68 | + for i in range(B2 & 0xf): |
| 69 | + out += out[-dist] |
| 70 | + return out, len(out) |
| 71 | + |
| 72 | + |
| 73 | +add("lz77", lz77_compress, lz77_decompress, entropy=7.9) |
| 74 | + |
0 commit comments