Skip to content

Commit 306a7c0

Browse files
committed
Added codec: lz77
1 parent 66df98d commit 306a7c0

5 files changed

Lines changed: 94 additions & 1 deletion

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ o
185185
`klopf` | text <-> klopf encoded text | Polybius square with trivial alphabetical distribution
186186
`leetspeak` | text <-> leetspeak encoded text | based on minimalistic elite speaking rules
187187
`letter-indices` | text <-> text with letter indices | encodes consonants and/or vowels with their corresponding indices
188+
`lz77` | text <-> LZ77-compressed text | compresses the given data with the algorithm of Lempel and Ziv of 1977
188189
`manchester` | text <-> manchester encoded text | XORes each bit of the input with `01`
189190
`markdown` | markdown --> HTML | unidirectional
190191
`morse` | text <-> morse encoded text | uses whitespace as a separator

codext/compressions/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# -*- coding: UTF-8 -*-
22
from .gzipp import *
3+
from .lz77 import *
34
from .pkzip import *
45

codext/compressions/lz77.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# -*- coding: UTF-8 -*-
2+
"""LZ77 Codec - Lempel-Ziv 1977 compression algorithm.
3+
4+
NB: Not an encoding properly speaking.
5+
6+
This codec:
7+
- en/decodes strings from str to str
8+
- en/decodes strings from bytes to bytes
9+
- decodes file content to str (read)
10+
- encodes file content from str to bytes (write)
11+
12+
Inspired from: https://github.com/manassra/LZ77-Compressor
13+
"""
14+
from ..__common__ import *
15+
16+
17+
__examples__ = {'enc-dec(lz77)': ["test", "This is a test", "@random{1024}"]}
18+
19+
20+
_B2b = lambda B: bin(B if isinstance(B, int) else ord(B))[2:].zfill(8)
21+
_b2B = lambda bt: "".join(chr(int(bt[i:i+8], 2)) for i in range(0, len(bt), 8))
22+
WINDOW_SIZE = 20
23+
24+
25+
def _find_longest_match(data, pos):
26+
""" Finds the longest match to a substring starting at the current position (pos) in the lookahead buffer from
27+
the history window. """
28+
eob, bmd, bml = min(pos + 15, len(data) + 1), -1, -1
29+
for j in range(pos + 2, eob):
30+
start = max(0, pos - WINDOW_SIZE)
31+
substr = data[pos:j]
32+
l = len(substr)
33+
for i in range(start, pos):
34+
n, r = l // (pos - i), l % (pos - i)
35+
if data[i:pos] * n + data[i:i+r] == substr and l > bml:
36+
bmd, bml = pos - i, l
37+
if bmd > 0 and bml > 0:
38+
return bmd, bml
39+
40+
41+
def lz77_compress(input, errors="strict"):
42+
""" Compresses the given data by applying LZ77 compression algorithm. """
43+
i, l, bits = 0, len(input), ""
44+
while i < l:
45+
try:
46+
bmd, bml = _find_longest_match(input, i)
47+
bits += "1" + _B2b(bmd >> 4) + _B2b(((bmd & 0xf) << 4) | bml)
48+
i += bml
49+
except TypeError:
50+
bits += "0" + _B2b(input[i])
51+
i += 1
52+
bits += "0" * ((8 - (len(bits) % 8)) % 8)
53+
return _b2B(bits), l
54+
55+
56+
def lz77_decompress(input, errors="strict"):
57+
""" Decompresses the given data. """
58+
out, d = "", "".join(_B2b(c) for c in input)
59+
while len(d) >= 9:
60+
flag, d = d[0], d[1:]
61+
if flag == "0":
62+
out += _b2B(d[:8])
63+
d = d[8:]
64+
else:
65+
B1, B2 = int(d[:8], 2), int(d[8:16], 2)
66+
d = d[16:]
67+
dist = (B1 << 4) | (B2 >> 4)
68+
for i in range(B2 & 0xf):
69+
out += out[-dist]
70+
return out, len(out)
71+
72+
73+
add("lz77", lz77_compress, lz77_decompress, entropy=7.9)
74+

codext/crypto/citrix.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,5 +47,5 @@ def decode(text, errors="strict"):
4747
return decode
4848

4949

50-
add("citrix", citrix_encode, citrix_decode, pattern=r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1.)
50+
add("citrix", citrix_encode, citrix_decode, r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1.)
5151

docs/enc/compressions.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,23 @@
1717

1818
-----
1919

20+
### LZ77
21+
22+
This implements the algorithm of Lempel and Ziv of 1977.
23+
24+
**Codec** | **Conversions** | **Aliases** | **Comment**
25+
:---: | :---: | --- | ---
26+
`lz77` | data <-> LZ77-compressed data | |
27+
28+
```python
29+
>>> codecs.encode("A test string !", "lz77")
30+
' \x88\x0e\x86S\x99ÐA\x0029\x1aMÆq\x00\x84'
31+
>>> codecs.decode(" \x88\x0e\x86S\x99ÐA\x0029\x1aMÆq\x00\x84", "lz77")
32+
'A test string !'
33+
```
34+
35+
-----
36+
2037
### PKZip
2138

2239
This implements multiple compression types available in the native [`zipfile`](https://docs.python.org/3/library/zipfile.html) library.

0 commit comments

Comments
 (0)