Fix minor bugs and release version 1.8.5

dhondta · dhondta · commit 29aeb21f8b95 · 2021-10-15T11:40:30.000+02:00
diff --git a/codext/VERSION.txt b/codext/VERSION.txt
@@ -1 +1 @@
-1.8.4
+1.8.5
diff --git a/codext/__common__.py b/codext/__common__.py
@@ -33,6 +33,7 @@
            "list_encodings", "lookup", "maketrans", "rank", "re", "register", "remove", "reset", "s2i", "search",
            "stopfunc", "BytesIO", "MASKS", "PY3"]
 CODECS_REGISTRY = None
+CODECS_CATEGORIES = ["native", "custom"]
 MASKS = {
     'a': printable,
     'b': "".join(chr(i) for i in range(256)),
@@ -70,6 +71,7 @@ def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=
                            NB: this will make the codec available in the built-in open(...) but will make it impossible
                                 to remove the codec later
     """
+    remove(ename)
     if encode and not isinstance(encode, FunctionType):
         raise ValueError("Bad 'encode' function")
     if decode and not isinstance(decode, FunctionType):
@@ -153,18 +155,23 @@ class StreamReader(Codec, codecs.StreamReader):
         ci.parameters['pattern'] = pattern
         ci.parameters['text'] = text
         f = glob.get('__file__', os.path.join("custom", "_"))
-        ci.parameters['category'] = kwargs.get('category', f.split(os.path.sep)[-2].rstrip("s"))
+        cat = f.split(os.path.sep)[-2].rstrip("s")
+        if cat not in CODECS_CATEGORIES:
+            CODECS_CATEGORIES.append(cat)
+        ci.parameters['category'] = kwargs.get('category', cat)
         ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__'))
         ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename]))
         ci.parameters['module'] = kwargs.get('module', glob.get('__name__'))
         ci.parameters.setdefault("scoring", {})
-        for attr in ["entropy", "len_charset", "printables_rate", "padding_char"]:
-            a = kwargs.get(attr)
+        for attr in ["bonus_func", "entropy", "len_charset", "penalty", "printables_rate", "padding_char"]:
+            a = kwargs.pop(attr, None)
             if a is not None:
                 ci.parameters['scoring'][attr] = a
         return ci
     
     getregentry.__name__ = re.sub(r"[\s\-]", "_", ename)
+    if kwargs.get('aliases'):
+        getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases']))
     getregentry.__pattern__ = pattern
     register(getregentry, add_to_codecs)
 
@@ -446,7 +453,7 @@ def is_native(encoding):
 
 def list_categories():
     """ Get a list of all codec categories. """
-    c = ["native"]
+    c = CODECS_CATEGORIES
     root = os.path.dirname(__file__)
     for d in os.listdir(root):
         if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"):
@@ -657,7 +664,8 @@ def lookup(encoding):
             return codecinfo
     # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo
     for search_function in __codecs_registry:
-        if search_function.__name__.replace("_", "-") == encoding:
+        if search_function.__name__.replace("_", "-") == encoding or \
+           encoding in getattr(search_function, "__aliases__", []):
             codecinfo = search_function(generate_string_from_regex(search_function.__pattern__))
             if codecinfo is not None:
                 return codecinfo
@@ -962,6 +970,7 @@ def __init__(self, text, pad_char=None):
 def __score(prev_input, input, codec, heuristic=False, extended=False):
     """ Score relevant encodings given an input. """
     obj, ci = None, lookup(codec)  # NB: lookup(...) won't fail as the codec value comes from list_encodings(...)
+    sc = ci.parameters.get('scoring', {})
     for encoding in ci.parameters.get('guess', [codec]):
         # ignore encodings that fail to decode with their default errors handling value
         try:
@@ -972,16 +981,16 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
         if prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input):
             continue
         # compute input's characteristics only once and only if the control flow reaches this point
-        pad = ci.parameters.get('scoring', {}).get('padding_char')
+        pad = sc.get('padding_char')
         if obj is None:
             obj = _Text(input, pad)
         if heuristic:
             # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base
             #  codecs) so that we can put the right one as early as possible and eventually exclude bad candidates
-            s = -ci.parameters.get('penalty', .0)
+            s = -sc.get('penalty', .0)
             # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ;
             #  on the contrary, if the length of input text's charset is strictly greater, give a penalty
-            lcs = ci.parameters.get('scoring', {}).get('len_charset', 256)
+            lcs = sc.get('len_charset', 256)
             if isinstance(lcs, type(lambda: None)):
                 lcs = int(lcs(encoding))
             if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset:
@@ -997,14 +1006,14 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
             # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when
             #  lower only for codecs that tolerate errors (otherwise, the printables rate can be biased)
             if not ci.parameters.get('no_error', False):
-                pr = ci.parameters.get('scoring', {}).get('printables_rate', 0)
+                pr = sc.get('printables_rate', 0)
                 if isinstance(pr, type(lambda: None)):
                     pr = float(pr(obj.printables))
                 if obj.printables - pr <= .05:
                     s += .1
             # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the
             #  number of input characters to take bad entropies of shorter strings into account
-            entr = ci.parameters.get('entropy', {})
+            entr = sc.get('entropy', {})
             entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr
             if isinstance(entr, type(lambda: None)):
                 try:  # this case allows to consider the current encoding name from the current codec
@@ -1017,9 +1026,9 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
                 if d_entr <= .5:
                     s += .5 - d_entr
             # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
-            bonus = ci.parameters.get('scoring', {}).get('bonus_func')
+            bonus = sc.get('bonus_func')
             if bonus is not None:
-                if isinstance(bon, type(lambda: None)):
+                if isinstance(bonus, type(lambda: None)):
                     bonus = bonus(obj, ci, encoding)
                 if bonus:
                     s += .2
@@ -1063,6 +1072,7 @@ def rank(input, extended=False, limit=-1, codec_categories=None, exclude=None):
             codecs.remove(e)
         except ValueError:
             pass
-    return list(__rank(None, input, codecs, True, extended, True))[:limit]
+    r = list(__rank(None, input, codecs, True, extended, True))
+    return r[:limit] if len(r) > 1 else r
 codecs.rank = rank
 
diff --git a/codext/crypto/rot.py b/codext/crypto/rot.py
@@ -46,5 +46,6 @@ def decode(text, errors="strict"):
     return decode
 
 
-add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", printables_rate=lambda pr: pr)
+add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", aliases=["caesar"],
+    printables_rate=lambda pr: pr)
 
diff --git a/tests/test_common.py b/tests/test_common.py
@@ -88,6 +88,8 @@ def test_list_codecs(self):
         self.assertTrue(len(codext.list("native", "language", "crypto")) > 0)
         self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native")))
         self.assertRaises(ValueError, codext.list, "BAD_CATEGORY")
+        self.assertTrue(codext.is_native("base64_codec"))
+        self.assertFalse(codext.is_native("base64"))
     
     def test_remove_codec(self):
         self.assertIsNone(codext.add("dummy", dummy_encode, dummy_decode))
@@ -128,15 +130,19 @@ def test_search_codecs(self):
         self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]")))
     
     def test_guess_decode(self):
-        _l = lambda d: list(d.items())[0][1]
+        _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None
         codext.reset()
+        codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
+                   "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
+        self.assertIn("test-codec", codext.list_encodings("test"))
         STR = "This is a test"
         self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1)))
         self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"])))
         if hasattr(codext.stopfunc, "lang_en"):
-            f = codext.stopfunc.lang_en
-            self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f, found=["base62"])))
-            self.assertIsNotNone(_l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f, max_depth=1)))
+            f1, f2 = codext.stopfunc.lang_en, codext.stopfunc.lang_es
+            self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f1, found=["base62"])))
+            self.assertIsNotNone(_l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f1, max_depth=1)))
+            self.assertNotEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f2, found=["base62"], max_depth=1)))
         self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True,
                                               exclude=["base100"])))
         self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"])))
@@ -177,12 +183,18 @@ def test_guess_decode(self):
         b64 = codext.encode(txt, "base64")
         self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True,
                                               codec_categories="base")))
+        self.assertEqual(list(codext.guess("TEST=", codec_categories="test").items())[0][1], "TEST")
+        self.assertEqual(list(codext.guess("TEST=", codec_categories=["test", "base"]).items())[0][1], "TEST")
     
     def test_rank_input(self):
+        codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
+                   "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
         STR = "This is a test string !"
         ENC = codext.encode(STR, "base64")
-        self.assertTrue(len(codext.rank(ENC)) > 100)
+        self.assertTrue(len(codext.rank(ENC)) > 20)
         self.assertEqual(len(codext.rank(ENC, limit=20)), 20)
         self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64")
         self.assertEqual(codext.rank(ENC, codec_categories=["base"])[0][0][1], STR)
+        self.assertIsNotNone(codext.rank(ENC, codec_categories=["base"], exclude=["does_not_exist"])[0][0][1], STR)
+        self.assertIsNotNone(codext.rank("TEST=", codec_categories=["test", "base"])[0][0][1], "TEST")
 
diff --git a/tests/test_generated.py b/tests/test_generated.py
@@ -108,7 +108,7 @@ def _template(self):
                                 with codecs.open(tfile, 'wb', encoding=ename) as f:
                                     f.write(b(s1))
                                 with codecs.open(tfile, 'rb', encoding=ename) as f:
-                                    s = f.read().strip(b("\x00"))
+                                    s = f.read() if PY3 else f.read().rstrip("\x00")
                                 self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s)))
                                 os.remove(tfile)
     return _template