3333 "list_encodings" , "lookup" , "maketrans" , "rank" , "re" , "register" , "remove" , "reset" , "s2i" , "search" ,
3434 "stopfunc" , "BytesIO" , "MASKS" , "PY3" ]
3535CODECS_REGISTRY = None
36+ CODECS_CATEGORIES = ["native" , "custom" ]
3637MASKS = {
3738 'a' : printable ,
3839 'b' : "" .join (chr (i ) for i in range (256 )),
@@ -70,6 +71,7 @@ def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=
7071 NB: this will make the codec available in the built-in open(...) but will make it impossible
7172 to remove the codec later
7273 """
74+ remove (ename )
7375 if encode and not isinstance (encode , FunctionType ):
7476 raise ValueError ("Bad 'encode' function" )
7577 if decode and not isinstance (decode , FunctionType ):
@@ -153,18 +155,23 @@ class StreamReader(Codec, codecs.StreamReader):
153155 ci .parameters ['pattern' ] = pattern
154156 ci .parameters ['text' ] = text
155157 f = glob .get ('__file__' , os .path .join ("custom" , "_" ))
156- ci .parameters ['category' ] = kwargs .get ('category' , f .split (os .path .sep )[- 2 ].rstrip ("s" ))
158+ cat = f .split (os .path .sep )[- 2 ].rstrip ("s" )
159+ if cat not in CODECS_CATEGORIES :
160+ CODECS_CATEGORIES .append (cat )
161+ ci .parameters ['category' ] = kwargs .get ('category' , cat )
157162 ci .parameters ['examples' ] = kwargs .get ('examples' , glob .get ('__examples__' ))
158163 ci .parameters ['guess' ] = kwargs .get ('guess' , glob .get ('__guess__' , [ename ]))
159164 ci .parameters ['module' ] = kwargs .get ('module' , glob .get ('__name__' ))
160165 ci .parameters .setdefault ("scoring" , {})
161- for attr in ["entropy" , "len_charset" , "printables_rate" , "padding_char" ]:
162- a = kwargs .get (attr )
166+ for attr in ["bonus_func" , " entropy" , "len_charset" , "penalty " , "printables_rate" , "padding_char" ]:
167+ a = kwargs .pop (attr , None )
163168 if a is not None :
164169 ci .parameters ['scoring' ][attr ] = a
165170 return ci
166171
167172 getregentry .__name__ = re .sub (r"[\s\-]" , "_" , ename )
173+ if kwargs .get ('aliases' ):
174+ getregentry .__aliases__ = list (map (lambda n : re .sub (r"[\s\-]" , "_" , n ), kwargs ['aliases' ]))
168175 getregentry .__pattern__ = pattern
169176 register (getregentry , add_to_codecs )
170177
@@ -446,7 +453,7 @@ def is_native(encoding):
446453
447454def list_categories ():
448455 """ Get a list of all codec categories. """
449- c = [ "native" ]
456+ c = CODECS_CATEGORIES
450457 root = os .path .dirname (__file__ )
451458 for d in os .listdir (root ):
452459 if os .path .isdir (os .path .join (root , d )) and not d .startswith ("__" ):
@@ -657,7 +664,8 @@ def lookup(encoding):
657664 return codecinfo
658665 # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo
659666 for search_function in __codecs_registry :
660- if search_function .__name__ .replace ("_" , "-" ) == encoding :
667+ if search_function .__name__ .replace ("_" , "-" ) == encoding or \
668+ encoding in getattr (search_function , "__aliases__" , []):
661669 codecinfo = search_function (generate_string_from_regex (search_function .__pattern__ ))
662670 if codecinfo is not None :
663671 return codecinfo
@@ -962,6 +970,7 @@ def __init__(self, text, pad_char=None):
962970def __score (prev_input , input , codec , heuristic = False , extended = False ):
963971 """ Score relevant encodings given an input. """
964972 obj , ci = None , lookup (codec ) # NB: lookup(...) won't fail as the codec value comes from list_encodings(...)
973+ sc = ci .parameters .get ('scoring' , {})
965974 for encoding in ci .parameters .get ('guess' , [codec ]):
966975 # ignore encodings that fail to decode with their default errors handling value
967976 try :
@@ -972,16 +981,16 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
972981 if prev_input is not None and b (input ) == b (new_input ) or b (prev_input ) == b (new_input ):
973982 continue
974983 # compute input's characteristics only once and only if the control flow reaches this point
975- pad = ci . parameters . get ( 'scoring' , {}) .get ('padding_char' )
984+ pad = sc .get ('padding_char' )
976985 if obj is None :
977986 obj = _Text (input , pad )
978987 if heuristic :
979988 # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base
980989 # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates
981- s = - ci . parameters .get ('penalty' , .0 )
990+ s = - sc .get ('penalty' , .0 )
982991 # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ;
983992 # on the contrary, if the length of input text's charset is strictly greater, give a penalty
984- lcs = ci . parameters . get ( 'scoring' , {}) .get ('len_charset' , 256 )
993+ lcs = sc .get ('len_charset' , 256 )
985994 if isinstance (lcs , type (lambda : None )):
986995 lcs = int (lcs (encoding ))
987996 if (pad and obj .padding and lcs + 1 >= obj .lcharset ) or lcs >= obj .lcharset :
@@ -997,14 +1006,14 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
9971006 # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when
9981007 # lower only for codecs that tolerate errors (otherwise, the printables rate can be biased)
9991008 if not ci .parameters .get ('no_error' , False ):
1000- pr = ci . parameters . get ( 'scoring' , {}) .get ('printables_rate' , 0 )
1009+ pr = sc .get ('printables_rate' , 0 )
10011010 if isinstance (pr , type (lambda : None )):
10021011 pr = float (pr (obj .printables ))
10031012 if obj .printables - pr <= .05 :
10041013 s += .1
10051014 # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the
10061015 # number of input characters to take bad entropies of shorter strings into account
1007- entr = ci . parameters .get ('entropy' , {})
1016+ entr = sc .get ('entropy' , {})
10081017 entr = entr .get (encoding , entr .get ('default' )) if isinstance (entr , dict ) else entr
10091018 if isinstance (entr , type (lambda : None )):
10101019 try : # this case allows to consider the current encoding name from the current codec
@@ -1017,9 +1026,9 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
10171026 if d_entr <= .5 :
10181027 s += .5 - d_entr
10191028 # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
1020- bonus = ci . parameters . get ( 'scoring' , {}) .get ('bonus_func' )
1029+ bonus = sc .get ('bonus_func' )
10211030 if bonus is not None :
1022- if isinstance (bon , type (lambda : None )):
1031+ if isinstance (bonus , type (lambda : None )):
10231032 bonus = bonus (obj , ci , encoding )
10241033 if bonus :
10251034 s += .2
@@ -1063,6 +1072,7 @@ def rank(input, extended=False, limit=-1, codec_categories=None, exclude=None):
10631072 codecs .remove (e )
10641073 except ValueError :
10651074 pass
1066- return list (__rank (None , input , codecs , True , extended , True ))[:limit ]
1075+ r = list (__rank (None , input , codecs , True , extended , True ))
1076+ return r [:limit ] if len (r ) > 1 else r
10671077codecs .rank = rank
10681078
0 commit comments