Index: Lib/urllib/parse.py =================================================================== --- Lib/urllib/parse.py (revision 65575) +++ Lib/urllib/parse.py (working copy) @@ -261,85 +261,75 @@ return url, '' -_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) -_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) +def unquote_as_string (s, plus=False, charset=None): + if charset is None: + charset = "raw-unicode-escape" + return str(unquote_as_bytes(s, plus=plus), charset, 'strict') -def unquote(s): +def unquote_as_bytes (s, plus=False): """unquote('abc%20def') -> 'abc def'.""" + if plus: + s = s.replace('+', ' ') res = s.split('%') + res[0] = res[0].encode('ASCII', 'strict') for i in range(1, len(res)): - item = res[i] - try: - res[i] = _hextochr[item[:2]] + item[2:] - except KeyError: - res[i] = '%' + item - except UnicodeDecodeError: - res[i] = chr(int(item[:2], 16)) + item[2:] - return "".join(res) + res[i] = (bytes.fromhex(res[i][:2]) + + res[i][2:].encode('ASCII', 'strict')) + return b''.join(res) -def unquote_plus(s): - """unquote('%7e/abc+def') -> '~/abc def'""" - s = s.replace('+', ' ') - return unquote(s) +_always_safe = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + b'abcdefghijklmnopqrstuvwxyz' + b'0123456789' + b'_.-') -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-') -_safe_quoters= {} +_percent_code = ord('%') -class Quoter: - def __init__(self, safe): - self.cache = {} - self.safe = safe + always_safe +_hextable = b'0123456789ABCDEF' - def __call__(self, c): - try: - return self.cache[c] - except KeyError: - if ord(c) < 256: - res = (c in self.safe) and c or ('%%%02X' % ord(c)) - self.cache[c] = res - return res - else: - return "".join(['%%%02X' % i for i in c.encode("utf-8")]) +def quote_as_bytes(s, safe = '/', plus=False): + """quote(b'abc@def') -> 'abc%40def'""" -def quote(s, safe = '/'): - """quote('abc def') -> 'abc%20def' + if isinstance(s, str): + s = s.encode('UTF-8', 'strict') + if not (isinstance(s, bytes) or isinstance(s, bytearray)): + raise TypeError("Argument to quote must be either bytes " + "or bytearray; string arguments will be " + "converted to UTF-8 bytes") - Each part of a URL, e.g. the path info, the query, etc., has a - different set of reserved characters that must be quoted. + safeset = _always_safe + safe.encode('ASCII', 'strict') + if plus: + safeset += b' ' - RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists - the following reserved characters. + result = bytearray() + for i in s: + if i not in safeset: + result.append(_percent_code) + result.append(_hextable[(i >> 4) & 0xF]) + result.append(_hextable[i & 0xF]) + else: + result.append(i) + if plus: + result = result.replace(b' ', b'+') + return result - reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | - "$" | "," +def quote_as_string(s, safe = '/', plus=False): + return str(quote_as_bytes(s, safe=safe, plus=plus), 'ASCII', 'strict') - Each of these characters is reserved in some component of a URL, - but not necessarily in all of them. +# finally, define defaults for 'quote' and 'unquote' - By default, the quote function is intended for quoting the path - section of a URL. Thus, it will not encode '/'. This character - is reserved, but in typical usage the quote function is being - called on a path where the existing slash characters are used as - reserved characters. - """ - cachekey = (safe, always_safe) - try: - quoter = _safe_quoters[cachekey] - except KeyError: - quoter = Quoter(safe) - _safe_quoters[cachekey] = quoter - res = map(quoter, s) - return ''.join(res) +def quote(s, safe='/'): + return quote_as_string(s, safe=safe) -def quote_plus(s, safe = ''): - """Quote the query fragment of a URL; replacing ' ' with '+'""" - if ' ' in s: - s = quote(s, safe + ' ') - return s.replace(' ', '+') - return quote(s, safe) +def quote_plus(s, safe=''): + return quote_as_string(s, safe=safe, plus=True) +def unquote(s): + return unquote_as_string(s) + +def unquote_plus(s): + return unquote_as_string(s, plus=True) + + def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. @@ -387,7 +377,7 @@ # is there a reasonable way to convert to ASCII? # encode generates a string, but "replace" or "ignore" # lose information and "strict" can raise UnicodeError - v = quote_plus(v.encode("ASCII","replace")) + v = quote_plus(v) l.append(k + '=' + v) else: try: @@ -474,7 +464,8 @@ _userprog = re.compile('^(.*)@(.*)$') match = _userprog.match(host) - if match: return map(unquote, match.group(1, 2)) + if match: + return map(unquote, match.group(1, 2)) return None, host _passwdprog = None Index: Lib/email/utils.py =================================================================== --- Lib/email/utils.py (revision 65575) +++ Lib/email/utils.py (working copy) @@ -46,7 +46,7 @@ EMPTYSTRING = '' UEMPTYSTRING = '' CRLF = '\r\n' -TICK = "'" +TICK = b"'" # ASCII tick character specialsre = re.compile(r'[][\\()<>@,:;".]') escapesre = re.compile(r'[][\\()"]') @@ -209,9 +209,11 @@ parts = s.split(TICK, 2) if len(parts) <= 2: return None, None, s - return parts + else: + return (str(parts[0], "ASCII", "strict"), + str(parts[1], "ASCII", "strict"), + parts[2]) - def encode_rfc2231(s, charset=None, language=None): """Encode string according to RFC 2231. @@ -271,15 +273,19 @@ # language specifiers at the beginning of the string. for num, s, encoded in continuations: if encoded: - s = urllib.parse.unquote(s) + s = urllib.parse.unquote_as_bytes(s) extended = True - value.append(s) - value = quote(EMPTYSTRING.join(value)) + value.append(s) + else: + value.append(s.encode('ASCII', 'strict')) + value = b''.join(value) if extended: charset, language, value = decode_rfc2231(value) - new_params.append((name, (charset, language, '"%s"' % value))) + new_params.append( + (name, (charset, language, + '"%s"' % str(value, 'raw-unicode-escape', 'replace')))) else: - new_params.append((name, '"%s"' % value)) + new_params.append((name, '"%s"' % str(value, 'ASCII', 'replace'))) return new_params def collapse_rfc2231_value(value, errors='replace', Index: Lib/test/test_http_cookiejar.py =================================================================== --- Lib/test/test_http_cookiejar.py (revision 65575) +++ Lib/test/test_http_cookiejar.py (working copy) @@ -1440,26 +1440,6 @@ self.assertEquals(old, repr(c)) - def test_url_encoding(self): - # Try some URL encodings of the PATHs. - # (the behaviour here has changed from libwww-perl) - c = CookieJar(DefaultCookiePolicy(rfc2965=True)) - interact_2965(c, "http://www.acme.com/foo%2f%25/%3c%3c%0Anew%E5/%E5", - "foo = bar; version = 1") - - cookie = interact_2965( - c, "http://www.acme.com/foo%2f%25/<<%0anew\345/\346\370\345", - 'bar=baz; path="/foo/"; version=1'); - version_re = re.compile(r'^\$version=\"?1\"?', re.I) - self.assert_("foo=bar" in cookie and version_re.search(cookie)) - - cookie = interact_2965( - c, "http://www.acme.com/foo/%25/<<%0anew\345/\346\370\345") - self.assert_(not cookie) - - # unicode URL doesn't raise exception - cookie = interact_2965(c, "http://www.acme.com/\xfc") - def test_mozilla(self): # Save / load Mozilla/Netscape cookie file format. year_plus_one = time.localtime()[0] + 1 Index: Lib/test/test_urllib.py =================================================================== --- Lib/test/test_urllib.py (revision 65575) +++ Lib/test/test_urllib.py (working copy) @@ -464,8 +464,9 @@ "using unquote_plus(): %s != %s" % (expect, result)) def test_unquote_with_unicode(self): - r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc') - self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc') + r = urllib.parse.unquote_as_string('br%C3%BCckner_sapporo_20050930.doc', + charset="UTF-8") + self.assertEqual(r, 'br\u00FCckner_sapporo_20050930.doc') class urlencode_Tests(unittest.TestCase): """Tests for urlencode()"""