Added codec: lz77

dhondta · dhondta · commit 306a7c0627e8 · 2021-10-03T00:44:12.000+02:00
diff --git a/README.md b/README.md
@@ -185,6 +185,7 @@ o
 `klopf` | text <-> klopf encoded text | Polybius square with trivial alphabetical distribution
 `leetspeak` | text <-> leetspeak encoded text | based on minimalistic elite speaking rules
 `letter-indices` | text <-> text with letter indices | encodes consonants and/or vowels with their corresponding indices
+`lz77` | text <-> LZ77-compressed text | compresses the given data with the Lempel-Ziv algorithm of 1977 (LZ77)
 `manchester` | text <-> manchester encoded text | XORes each bit of the input with `01`
 `markdown` | markdown --> HTML | unidirectional
 `morse` | text <-> morse encoded text | uses whitespace as a separator
diff --git a/codext/compressions/__init__.py b/codext/compressions/__init__.py
@@ -1,4 +1,5 @@
 # -*- coding: UTF-8 -*-
 from .gzipp import *
+from .lz77 import *
 from .pkzip import *
 
diff --git a/codext/compressions/lz77.py b/codext/compressions/lz77.py
@@ -0,0 +1,74 @@
+# -*- coding: UTF-8 -*-
+"""LZ77 Codec - Lempel-Ziv 1977 compression algorithm.
+
+NB: Strictly speaking, this is a compression scheme, not an encoding.
+
+This codec:
+- en/decodes strings from str to str
+- en/decodes strings from bytes to bytes
+- decodes file content to str (read)
+- encodes file content from str to bytes (write)
+
+Inspired by: https://github.com/manassra/LZ77-Compressor
+"""
+from ..__common__ import *
+
+
+# Sample inputs attached to the 'enc-dec(lz77)' key; presumably consumed by the project's automated tests to check
+#  that encoding followed by decoding gives back each sample -- TODO confirm against ..__common__
+__examples__ = {'enc-dec(lz77)': ["test", "This is a test", "@random{1024}"]}
+
+
+def _B2b(B):
+    """ Renders a byte value (an int, or a single character whose ordinal is taken) as a bit string left-padded
+         with zeros to at least 8 digits. """
+    code = B if isinstance(B, int) else ord(B)
+    return bin(code)[2:].zfill(8)
+
+
+def _b2B(bt):
+    """ Converts a bit string back to text, one character per group of 8 bits (the last group may be shorter). """
+    groups = (bt[k:k+8] for k in range(0, len(bt), 8))
+    return "".join(chr(int(group, 2)) for group in groups)
+
+
+# Number of already-processed positions, right before the current one, that are searched for a match
+WINDOW_SIZE = 20
+
+
+def _find_longest_match(data, pos):
+    """ Finds the longest match to a substring starting at the current position (pos) in the lookahead buffer from
+         the history window.
+
+    Candidate lengths range from 2 to 14 items (bounded by the end of the data) and candidate start positions are the
+     up-to-WINDOW_SIZE positions right before pos. A match is allowed to run past pos: the history slice data[i:pos]
+     is then treated as a repeating pattern.
+
+    :param data: sequence being compressed (anything supporting slicing, repetition and concatenation)
+    :param pos:  index in data at which the lookahead starts
+    :return:     (distance, length) of the best match, or None when no match of at least 2 items exists; for a given
+                  length, the candidate that is the farthest from pos is kept
+    """
+    end = min(pos + 15, len(data) + 1)
+    # the window start only depends on pos, so it is computed once instead of on every outer iteration
+    start = max(0, pos - WINDOW_SIZE)
+    best_dist, best_len = -1, -1
+    for j in range(pos + 2, end):
+        substr = data[pos:j]
+        length = len(substr)
+        for i in range(start, pos):
+            dist = pos - i
+            # n full repetitions of the history slice plus a remainder of r items rebuild a candidate of this length
+            n, r = divmod(length, dist)
+            # cheap length test first; the slice comparison only runs when it could improve the best match
+            if length > best_len and data[i:pos] * n + data[i:i+r] == substr:
+                best_dist, best_len = dist, length
+    if best_dist > 0 and best_len > 0:
+        return best_dist, best_len
+    return None
+
+
+def lz77_compress(input, errors="strict"):
+    """ Compresses the given data by applying LZ77 compression algorithm.
+
+    The output bit stream is a sequence of tokens:
+    - literal: flag bit 0 followed by the 8 bits of the item
+    - match:   flag bit 1 followed by 16 bits, i.e. a 12-bit distance then a 4-bit length
+    The stream is finally padded with zero bits up to a multiple of 8 and packed into characters.
+
+    :param input:  data to be compressed
+    :param errors: error handling scheme (not used by this function)
+    :return:       (compressed text, number of input items consumed)
+    """
+    pos, size, tokens = 0, len(input), []
+    while pos < size:
+        # test the "no match" result explicitly rather than relying on a TypeError raised when unpacking None, which
+        #  would also hide genuine TypeErrors
+        match = _find_longest_match(input, pos)
+        if match is None:
+            tokens.append("0" + _B2b(input[pos]))
+            pos += 1
+        else:
+            dist, length = match
+            tokens.append("1" + _B2b(dist >> 4) + _B2b(((dist & 0xf) << 4) | length))
+            pos += length
+    # tokens are collected in a list and joined once instead of growing a string with +=
+    bits = "".join(tokens)
+    bits += "0" * (-len(bits) % 8)
+    return _b2B(bits), size
+
+
+def lz77_decompress(input, errors="strict"):
+    """ Decompresses the given data.
+
+    The input is expanded to a bit string and read token by token: flag bit 0 introduces an 8-bit literal, flag bit
+     1 introduces a 12-bit distance followed by a 4-bit length, meaning "copy length items, each taken distance
+     items back in the output". Reading stops when fewer than 9 bits remain (trailing padding).
+
+    :param input:  compressed data
+    :param errors: error handling scheme (not used by this function)
+    :return:       (decompressed text, length of the decompressed text)
+    :raise ValueError: if a match token is truncated or carries a zero distance
+    :raise IndexError: if a match token points before the start of the output
+    """
+    bits = "".join(_B2b(c) for c in input)
+    # a cursor over the bit string and a list of output characters avoid re-slicing and re-concatenating strings
+    out, pos, total = [], 0, len(bits)
+    while total - pos >= 9:
+        flag = bits[pos]
+        pos += 1
+        if flag == "0":
+            out.append(_b2B(bits[pos:pos+8]))
+            pos += 8
+        else:
+            if total - pos < 16:
+                raise ValueError("lz77: truncated match token")
+            B1, B2 = int(bits[pos:pos+8], 2), int(bits[pos+8:pos+16], 2)
+            pos += 16
+            dist = (B1 << 4) | (B2 >> 4)
+            # out[-0] would silently read the first output item, hence the explicit rejection
+            if dist == 0:
+                raise ValueError("lz77: invalid match distance 0")
+            # copy one item at a time so that a match may overlap the items it produces
+            for _ in range(B2 & 0xf):
+                out.append(out[-dist])
+    result = "".join(out)
+    return result, len(result)
+
+
+# Pass the codec name, both conversion functions and the entropy=7.9 keyword to add(), which comes from the star
+#  import of ..__common__ (NOTE(review): presumably this registers the codec for lookup by name -- confirm)
+add("lz77", lz77_compress, lz77_decompress, entropy=7.9)
+
diff --git a/codext/crypto/citrix.py b/codext/crypto/citrix.py
@@ -47,5 +47,5 @@ def decode(text, errors="strict"):
     return decode
 
 
-add("citrix", citrix_encode, citrix_decode, pattern=r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1.)
+add("citrix", citrix_encode, citrix_decode, r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1.)
 
diff --git a/docs/enc/compressions.md b/docs/enc/compressions.md
@@ -17,6 +17,23 @@
 
 -----
 
+### LZ77
+
+This implements LZ77, the lossless compression algorithm published by Abraham Lempel and Jacob Ziv in 1977.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`lz77` | data <-> LZ77-compressed data | | 
+
+```python
+>>> codecs.encode("A test string !", "lz77")
+' \x88\x0e\x86S\x99ÐA\x0029\x1aMÆq\x00\x84'
+>>> codecs.decode(" \x88\x0e\x86S\x99ÐA\x0029\x1aMÆq\x00\x84", "lz77")
+'A test string !'
+```
+
+-----
+
 ### PKZip
 
 This implements multiple compression types available in the native [`zipfile`](https://docs.python.org/3/library/zipfile.html) library.

Original file line number	Diff line number	Diff line change
`@@ -47,5 +47,5 @@ def decode(text, errors="strict"):`
`47`	`47`	`return decode`
`48`	`48`
`49`	`49`
`50`		`-add("citrix", citrix_encode, citrix_decode, pattern=r"citrix(\|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1.)`
	`50`	`+add("citrix", citrix_encode, citrix_decode, r"citrix(\|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1.)`
`51`	`51`