Skip to content

Commit 29aeb21

Browse files
committed
Fixed minor bugs + New release
1 parent 57f4269 commit 29aeb21

5 files changed

Lines changed: 44 additions & 21 deletions

File tree

codext/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.8.4
1+
1.8.5

codext/__common__.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"list_encodings", "lookup", "maketrans", "rank", "re", "register", "remove", "reset", "s2i", "search",
3434
"stopfunc", "BytesIO", "MASKS", "PY3"]
3535
CODECS_REGISTRY = None
36+
CODECS_CATEGORIES = ["native", "custom"]
3637
MASKS = {
3738
'a': printable,
3839
'b': "".join(chr(i) for i in range(256)),
@@ -70,6 +71,7 @@ def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=
7071
NB: this will make the codec available in the built-in open(...) but will make it impossible
7172
to remove the codec later
7273
"""
74+
remove(ename)
7375
if encode and not isinstance(encode, FunctionType):
7476
raise ValueError("Bad 'encode' function")
7577
if decode and not isinstance(decode, FunctionType):
@@ -153,18 +155,23 @@ class StreamReader(Codec, codecs.StreamReader):
153155
ci.parameters['pattern'] = pattern
154156
ci.parameters['text'] = text
155157
f = glob.get('__file__', os.path.join("custom", "_"))
156-
ci.parameters['category'] = kwargs.get('category', f.split(os.path.sep)[-2].rstrip("s"))
158+
cat = f.split(os.path.sep)[-2].rstrip("s")
159+
if cat not in CODECS_CATEGORIES:
160+
CODECS_CATEGORIES.append(cat)
161+
ci.parameters['category'] = kwargs.get('category', cat)
157162
ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__'))
158163
ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename]))
159164
ci.parameters['module'] = kwargs.get('module', glob.get('__name__'))
160165
ci.parameters.setdefault("scoring", {})
161-
for attr in ["entropy", "len_charset", "printables_rate", "padding_char"]:
162-
a = kwargs.get(attr)
166+
for attr in ["bonus_func", "entropy", "len_charset", "penalty", "printables_rate", "padding_char"]:
167+
a = kwargs.pop(attr, None)
163168
if a is not None:
164169
ci.parameters['scoring'][attr] = a
165170
return ci
166171

167172
getregentry.__name__ = re.sub(r"[\s\-]", "_", ename)
173+
if kwargs.get('aliases'):
174+
getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases']))
168175
getregentry.__pattern__ = pattern
169176
register(getregentry, add_to_codecs)
170177

@@ -446,7 +453,7 @@ def is_native(encoding):
446453

447454
def list_categories():
448455
""" Get a list of all codec categories. """
449-
c = ["native"]
456+
c = CODECS_CATEGORIES
450457
root = os.path.dirname(__file__)
451458
for d in os.listdir(root):
452459
if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"):
@@ -657,7 +664,8 @@ def lookup(encoding):
657664
return codecinfo
658665
# then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo
659666
for search_function in __codecs_registry:
660-
if search_function.__name__.replace("_", "-") == encoding:
667+
if search_function.__name__.replace("_", "-") == encoding or \
668+
encoding in getattr(search_function, "__aliases__", []):
661669
codecinfo = search_function(generate_string_from_regex(search_function.__pattern__))
662670
if codecinfo is not None:
663671
return codecinfo
@@ -962,6 +970,7 @@ def __init__(self, text, pad_char=None):
962970
def __score(prev_input, input, codec, heuristic=False, extended=False):
963971
""" Score relevant encodings given an input. """
964972
obj, ci = None, lookup(codec) # NB: lookup(...) won't fail as the codec value comes from list_encodings(...)
973+
sc = ci.parameters.get('scoring', {})
965974
for encoding in ci.parameters.get('guess', [codec]):
966975
# ignore encodings that fail to decode with their default errors handling value
967976
try:
@@ -972,16 +981,16 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
972981
if prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input):
973982
continue
974983
# compute input's characteristics only once and only if the control flow reaches this point
975-
pad = ci.parameters.get('scoring', {}).get('padding_char')
984+
pad = sc.get('padding_char')
976985
if obj is None:
977986
obj = _Text(input, pad)
978987
if heuristic:
979988
# from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base
980989
# codecs) so that we can put the right one as early as possible and eventually exclude bad candidates
981-
s = -ci.parameters.get('penalty', .0)
990+
s = -sc.get('penalty', .0)
982991
# first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ;
983992
# on the contrary, if the length of input text's charset is strictly greater, give a penalty
984-
lcs = ci.parameters.get('scoring', {}).get('len_charset', 256)
993+
lcs = sc.get('len_charset', 256)
985994
if isinstance(lcs, type(lambda: None)):
986995
lcs = int(lcs(encoding))
987996
if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset:
@@ -997,14 +1006,14 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
9971006
# give a bonus when the rate of printable characters is greater than or equal to the expected one and a penalty when
9981007
# lower only for codecs that tolerate errors (otherwise, the printables rate can be biased)
9991008
if not ci.parameters.get('no_error', False):
1000-
pr = ci.parameters.get('scoring', {}).get('printables_rate', 0)
1009+
pr = sc.get('printables_rate', 0)
10011010
if isinstance(pr, type(lambda: None)):
10021011
pr = float(pr(obj.printables))
10031012
if obj.printables - pr <= .05:
10041013
s += .1
10051014
# afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the
10061015
# number of input characters to take bad entropies of shorter strings into account
1007-
entr = ci.parameters.get('entropy', {})
1016+
entr = sc.get('entropy', {})
10081017
entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr
10091018
if isinstance(entr, type(lambda: None)):
10101019
try: # this case allows considering the current encoding name from the current codec
@@ -1017,9 +1026,9 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
10171026
if d_entr <= .5:
10181027
s += .5 - d_entr
10191028
# finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
1020-
bonus = ci.parameters.get('scoring', {}).get('bonus_func')
1029+
bonus = sc.get('bonus_func')
10211030
if bonus is not None:
1022-
if isinstance(bon, type(lambda: None)):
1031+
if isinstance(bonus, type(lambda: None)):
10231032
bonus = bonus(obj, ci, encoding)
10241033
if bonus:
10251034
s += .2
@@ -1063,6 +1072,7 @@ def rank(input, extended=False, limit=-1, codec_categories=None, exclude=None):
10631072
codecs.remove(e)
10641073
except ValueError:
10651074
pass
1066-
return list(__rank(None, input, codecs, True, extended, True))[:limit]
1075+
r = list(__rank(None, input, codecs, True, extended, True))
1076+
return r[:limit] if len(r) > 1 else r
10671077
codecs.rank = rank
10681078

codext/crypto/rot.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,6 @@ def decode(text, errors="strict"):
4646
return decode
4747

4848

49-
add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", printables_rate=lambda pr: pr)
49+
add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", aliases=["caesar"],
50+
printables_rate=lambda pr: pr)
5051

tests/test_common.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ def test_list_codecs(self):
8888
self.assertTrue(len(codext.list("native", "language", "crypto")) > 0)
8989
self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native")))
9090
self.assertRaises(ValueError, codext.list, "BAD_CATEGORY")
91+
self.assertTrue(codext.is_native("base64_codec"))
92+
self.assertFalse(codext.is_native("base64"))
9193

9294
def test_remove_codec(self):
9395
self.assertIsNone(codext.add("dummy", dummy_encode, dummy_decode))
@@ -128,15 +130,19 @@ def test_search_codecs(self):
128130
self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]")))
129131

130132
def test_guess_decode(self):
131-
_l = lambda d: list(d.items())[0][1]
133+
_l = lambda d: list(d.items())[0][1] if len(d) > 0 else None
132134
codext.reset()
135+
codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
136+
"test", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
137+
self.assertIn("test-codec", codext.list_encodings("test"))
133138
STR = "This is a test"
134139
self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1)))
135140
self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"])))
136141
if hasattr(codext.stopfunc, "lang_en"):
137-
f = codext.stopfunc.lang_en
138-
self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f, found=["base62"])))
139-
self.assertIsNotNone(_l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f, max_depth=1)))
142+
f1, f2 = codext.stopfunc.lang_en, codext.stopfunc.lang_es
143+
self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f1, found=["base62"])))
144+
self.assertIsNotNone(_l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f1, max_depth=1)))
145+
self.assertNotEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", f2, found=["base62"], max_depth=1)))
140146
self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True,
141147
exclude=["base100"])))
142148
self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"])))
@@ -177,12 +183,18 @@ def test_guess_decode(self):
177183
b64 = codext.encode(txt, "base64")
178184
self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True,
179185
codec_categories="base")))
186+
self.assertEqual(list(codext.guess("TEST=", codec_categories="test").items())[0][1], "TEST")
187+
self.assertEqual(list(codext.guess("TEST=", codec_categories=["test", "base"]).items())[0][1], "TEST")
180188

181189
def test_rank_input(self):
190+
codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
191+
"test", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
182192
STR = "This is a test string !"
183193
ENC = codext.encode(STR, "base64")
184-
self.assertTrue(len(codext.rank(ENC)) > 100)
194+
self.assertTrue(len(codext.rank(ENC)) > 20)
185195
self.assertEqual(len(codext.rank(ENC, limit=20)), 20)
186196
self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64")
187197
self.assertEqual(codext.rank(ENC, codec_categories=["base"])[0][0][1], STR)
198+
self.assertIsNotNone(codext.rank(ENC, codec_categories=["base"], exclude=["does_not_exist"])[0][0][1], STR)
199+
self.assertIsNotNone(codext.rank("TEST=", codec_categories=["test", "base"])[0][0][1], "TEST")
188200

tests/test_generated.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def _template(self):
108108
with codecs.open(tfile, 'wb', encoding=ename) as f:
109109
f.write(b(s1))
110110
with codecs.open(tfile, 'rb', encoding=ename) as f:
111-
s = f.read().strip(b("\x00"))
111+
s = f.read() if PY3 else f.read().rstrip("\x00")
112112
self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s)))
113113
os.remove(tfile)
114114
return _template

0 commit comments

Comments
 (0)