diff options
-rw-r--r-- | TODO | 4 | ||||
-rw-r--r-- | pypy/interpreter/baseobjspace.py | 4 | ||||
-rw-r--r-- | pypy/interpreter/pyparser/parsestring.py | 7 | ||||
-rw-r--r-- | pypy/interpreter/unicodehelper.py | 49 | ||||
-rw-r--r-- | pypy/module/__builtin__/operation.py | 8 | ||||
-rw-r--r-- | pypy/module/_codecs/interp_codecs.py | 25 | ||||
-rw-r--r-- | pypy/objspace/std/marshal_impl.py | 4 | ||||
-rw-r--r-- | pypy/objspace/std/objspace.py | 8 | ||||
-rw-r--r-- | pypy/objspace/std/test/test_unicodeobject.py | 9 | ||||
-rw-r--r-- | pypy/objspace/std/unicodeobject.py | 191 | ||||
-rw-r--r-- | rpython/rlib/rutf8.py | 61 |
11 files changed, 252 insertions, 118 deletions
@@ -4,3 +4,7 @@ if one is not already readily available * fix _pypyjson * fix cpyext +* write the correct jit_elidable in _get_index_storage +* better flag handling in split/splitlines maybe? +* find all the fast-paths that we want to do with utf8 (we only do + utf-8 now, not UTF8 or utf8) for decode/encode diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py index 20f439471a..72655b4f13 100644 --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -1764,8 +1764,10 @@ class ObjSpace(object): return self.realutf8_w(w_obj).decode('utf8') def newunicode(self, u): + from pypy.interpreter import unicodehelper assert isinstance(u, unicode) - return self.newutf8(u.encode("utf8"), len(u)) + # XXX let's disallow that + return self.newutf8(u.encode("utf8"), len(u), unicodehelper._get_flag(u)) def convert_to_w_unicode(self, w_obj): return w_obj.convert_to_w_unicode(self) diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py index 514636d470..974c6f3c55 100644 --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -59,10 +59,11 @@ def parsestr(space, encoding, s, unicode_literal=False): else: substr = decode_unicode_utf8(space, s, ps, q) if rawmode: - v, length = unicodehelper.decode_raw_unicode_escape(space, substr) + r = unicodehelper.decode_raw_unicode_escape(space, substr) else: - v, length = unicodehelper.decode_unicode_escape(space, substr) - return space.newutf8(v, length) + r = unicodehelper.decode_unicode_escape(space, substr) + v, length, flag = r + return space.newutf8(v, length, flag) need_encoding = (encoding is not None and encoding != "utf-8" and encoding != "utf8" and diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py index 0212f7b790..c41ed1feec 100644 --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -20,11 +20,11 @@ def decode_error_handler(space): @specialize.memo() def encode_error_handler(space): # Fast version of the "strict" errors handler. - def raise_unicode_exception_encode(errors, encoding, msg, u, u_len, + def raise_unicode_exception_encode(errors, encoding, msg, w_u, startingpos, endingpos): raise OperationError(space.w_UnicodeEncodeError, space.newtuple([space.newtext(encoding), - space.newutf8(u, u_len), + w_u, space.newint(startingpos), space.newint(endingpos), space.newtext(msg)])) @@ -41,6 +41,21 @@ def encode(space, w_data, encoding=None, errors='strict'): from pypy.objspace.std.unicodeobject import encode_object return encode_object(space, w_data, encoding, errors) +def _has_surrogate(u): + for c in u: + if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F: + return True + return False + +def _get_flag(u): + flag = rutf8.FLAG_ASCII + for c in u: + if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F: + return rutf8.FLAG_HAS_SURROGATES + if ord(c) >= 0x80: + flag = rutf8.FLAG_REGULAR + return flag + # These functions take and return unwrapped rpython strings and unicodes def decode_unicode_escape(space, string): state = space.fromcache(interp_codecs.CodecState) @@ -52,7 +67,14 @@ def decode_unicode_escape(space, string): final=True, errorhandler=DecodeWrapper(decode_error_handler(space)).handle, unicodedata_handler=unicodedata_handler) # XXX argh. we want each surrogate to be encoded separately - return ''.join([u.encode('utf8') for u in result_u]), len(result_u) + utf8 = ''.join([u.encode('utf8') for u in result_u]) + if rutf8.first_non_ascii_char(utf8) == -1: + flag = rutf8.FLAG_ASCII + elif _has_surrogate(result_u): + flag = rutf8.FLAG_HAS_SURROGATES + else: + flag = rutf8.FLAG_REGULAR + return utf8, len(result_u), flag def decode_raw_unicode_escape(space, string): # XXX pick better length, maybe @@ -61,7 +83,14 @@ def decode_raw_unicode_escape(space, string): string, len(string), "strict", final=True, errorhandler=DecodeWrapper(decode_error_handler(space)).handle) # XXX argh. we want each surrogate to be encoded separately - return ''.join([u.encode('utf8') for u in result_u]), len(result_u) + utf8 = ''.join([u.encode('utf8') for u in result_u]) + if rutf8.first_non_ascii_char(utf8) == -1: + flag = rutf8.FLAG_ASCII + elif _has_surrogate(result_u): + flag = rutf8.FLAG_HAS_SURROGATES + else: + flag = rutf8.FLAG_REGULAR + return utf8, len(result_u), flag def check_ascii_or_raise(space, string): try: @@ -78,12 +107,12 @@ def check_utf8_or_raise(space, string): # you still get two surrogate unicode characters in the result. # These are the Python2 rules; Python3 differs. try: - length = rutf8.check_utf8(string, allow_surrogates=True) + length, flag = rutf8.check_utf8(string, allow_surrogates=True) except rutf8.CheckError as e: decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string, e.pos, e.pos + 1) assert False, "unreachable" - return length + return length, flag def encode_utf8(space, uni): # DEPRECATED @@ -116,7 +145,7 @@ def str_decode_ascii(s, slen, errors, final, errorhandler): except rutf8.CheckError: w = DecodeWrapper((errorhandler)) u, pos = runicode.str_decode_ascii(s, slen, errors, final, w.handle) - return u.encode('utf8'), pos, len(u) + return u.encode('utf8'), pos, len(u), _get_flag(u) # XXX wrappers, think about speed @@ -139,14 +168,14 @@ def str_decode_utf8(s, slen, errors, final, errorhandler): w = DecodeWrapper(errorhandler) u, pos = runicode.str_decode_utf_8_impl(s, slen, errors, final, w.handle, runicode.allow_surrogate_by_default) - return u.encode('utf8'), pos, len(u) + return u.encode('utf8'), pos, len(u), _get_flag(u) def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler): w = DecodeWrapper(errorhandler) u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final, w.handle, ud_handler) - return u.encode('utf8'), pos, len(u) + return u.encode('utf8'), pos, len(u), _get_flag(u) def setup_new_encoders(encoding): encoder_name = 'utf8_encode_' + encoding @@ -160,7 +189,7 @@ def setup_new_encoders(encoding): def decoder(s, slen, errors, final, errorhandler): w = DecodeWrapper((errorhandler)) u, pos = getattr(runicode, decoder_name)(s, slen, errors, final, w.handle) - return u.encode('utf8'), pos, len(u) + return u.encode('utf8'), pos, len(u), _get_flag(u) encoder.__name__ = encoder_name decoder.__name__ = decoder_name if encoder_name not in globals(): diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py index 0661f0157f..dd5797717e 100644 --- a/pypy/module/__builtin__/operation.py +++ b/pypy/module/__builtin__/operation.py @@ -28,7 +28,13 @@ def unichr(space, code): s = rutf8.unichr_as_utf8(code, allow_surrogates=True) except ValueError: raise oefmt(space.w_ValueError, "unichr() arg out of range") - return space.newutf8(s, 1) + if code < 0x80: + flag = rutf8.FLAG_ASCII + elif 0xDB80 <= code <= 0xCBFF or 0xD800 <= code <= 0xDB7F: + flag = rutf8.FLAG_HAS_SURROGATE + else: + flag = rutf8.FLAG_REGULAR + return space.newutf8(s, 1, flag) def len(space, w_obj): "len(object) -> integer\n\nReturn the number of items of a sequence or mapping." diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py index 26e9d0c548..f86212df59 100644 --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -39,8 +39,8 @@ class CodecState(object): w_input = space.newbytes(input) else: w_cls = space.w_UnicodeEncodeError - length = rutf8.check_utf8(input, allow_surrogates=True) - w_input = space.newutf8(input, length) + length, flag = rutf8.check_utf8(input, allow_surrogates=True) + w_input = space.newutf8(input, length, flag) w_exc = space.call_function( w_cls, space.newtext(encoding), @@ -189,7 +189,7 @@ def strict_errors(space, w_exc): def ignore_errors(space, w_exc): check_exception(space, w_exc) w_end = space.getattr(w_exc, space.newtext('end')) - return space.newtuple([space.newutf8('', 0), w_end]) + return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), w_end]) REPLACEMENT = u'\ufffd'.encode('utf8') @@ -200,13 +200,13 @@ def replace_errors(space, w_exc): size = space.int_w(w_end) - space.int_w(w_start) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): text = '?' * size - return space.newtuple([space.newutf8(text, size), w_end]) + return space.newtuple([space.newutf8(text, size, rutf8.FLAG_ASCII), w_end]) elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError): text = REPLACEMENT - return space.newtuple([space.newutf8(text, 1), w_end]) + return space.newtuple([space.newutf8(text, 1, rutf8.FLAG_REGULAR), w_end]) elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError): text = REPLACEMENT * size - return space.newtuple([space.newutf8(text, size), w_end]) + return space.newtuple([space.newutf8(text, size, rutf8.FLAG_REGULAR), w_end]) else: raise oefmt(space.w_TypeError, "don't know how to handle %T in error callback", w_exc) @@ -403,9 +403,9 @@ def make_decoder_wrapper(name): final = space.is_true(w_final) state = space.fromcache(CodecState) func = getattr(unicodehelper, rname) - result, consumed, length = func(string, len(string), errors, - final, state.decode_error_handler) - return space.newtuple([space.newutf8(result, length), + result, consumed, length, flag = func(string, len(string), errors, + final, state.decode_error_handler) + return space.newtuple([space.newutf8(result, length, flag), space.newint(consumed)]) wrap_decoder.func_name = rname globals()[name] = wrap_decoder @@ -448,7 +448,7 @@ if hasattr(runicode, 'str_decode_mbcs'): # "allow_surrogates=True" @unwrap_spec(utf8='utf8', errors='text_or_none') def utf_8_encode(space, utf8, errors="strict"): - length = rutf8.check_utf8(utf8, allow_surrogates=True) + length, _ = rutf8.check_utf8(utf8, allow_surrogates=True) return space.newtuple([space.newbytes(utf8), space.newint(length)]) #@unwrap_spec(uni=unicode, errors='text_or_none') #def utf_8_encode(space, uni, errors="strict"): @@ -474,16 +474,17 @@ def utf_8_decode(space, string, errors="strict", w_final=None): state = space.fromcache(CodecState) # call the fast version for checking try: - lgt = rutf8.check_utf8(string, allow_surrogates=True) + lgt, flag = rutf8.check_utf8(string, allow_surrogates=True) except rutf8.CheckError as e: # XXX do the way around runicode - we can optimize it later if we # decide we care about obscure cases + xxx res, consumed, lgt = unicodehelper.str_decode_utf8(string, len(string), errors, final, state.decode_error_handler) return space.newtuple([space.newutf8(res, lgt), space.newint(consumed)]) else: - return space.newtuple([space.newutf8(string, lgt), + return space.newtuple([space.newutf8(string, lgt, flag), space.newint(len(string))]) @unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int, diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py index 038cbf3e94..b8f9e56006 100644 --- a/pypy/objspace/std/marshal_impl.py +++ b/pypy/objspace/std/marshal_impl.py @@ -403,8 +403,8 @@ def marshal_unicode(space, w_unicode, m): @unmarshaller(TYPE_UNICODE) def unmarshal_unicode(space, u, tc): arg = u.get_str() - length = unicodehelper.check_utf8_or_raise(space, arg) - return space.newutf8(arg, length) + length, flag = unicodehelper.check_utf8_or_raise(space, arg) + return space.newutf8(arg, length, flag) @marshaller(W_SetObject) def marshal_set(space, w_set, m): diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py index f1896c2909..0b67c7861f 100644 --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -317,8 +317,8 @@ class StdObjSpace(ObjSpace): for utf in lst: assert utf is not None assert isinstance(utf, str) - length = rutf8.check_utf8(utf, allow_surrogates=True) - res_w.append(self.newutf8(utf, length)) + length, flag = rutf8.check_utf8(utf, allow_surrogates=True) + res_w.append(self.newutf8(utf, length, flag)) return self.newlist(res_w) def newlist_int(self, list_i): @@ -369,10 +369,10 @@ class StdObjSpace(ObjSpace): return self.w_None return self.newtext(s) - def newutf8(self, utf8s, length): + def newutf8(self, utf8s, length, flag): assert utf8s is not None assert isinstance(utf8s, str) - return W_UnicodeObject(utf8s, length) + return W_UnicodeObject(utf8s, length, flag) def newfilename(self, s): assert isinstance(s, str) # on pypy3, this decodes the byte string diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py index a860d1c20a..9620f6e3cc 100644 --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -3,6 +3,7 @@ import py import sys from hypothesis import given, strategies, settings, example +from rpython.rlib import rutf8 from pypy.interpreter.error import OperationError @@ -27,12 +28,12 @@ class TestUnicodeObject: def test_listview_unicode(self): py.test.skip("skip for new") - w_str = self.space.wrap(u'abcd') + w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII) assert self.space.listview_unicode(w_str) == list(u"abcd") def test_new_shortcut(self): space = self.space - w_uni = self.space.wrap(u'abcd') + w_uni = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII) w_new = space.call_method( space.w_unicode, "__new__", space.w_unicode, w_uni) assert w_new is w_uni @@ -44,8 +45,8 @@ class TestUnicodeObject: return # skip this case v = u[start : start + len1] space = self.space - w_u = space.wrap(u) - w_v = space.wrap(v) + w_u = space.newutf8(u.encode('utf8'), len(u), rutf8.FLAG_REGULAR) + w_v = space.newutf8(v.encode('utf8'), len(v), rutf8.FLAG_REGULAR) expected = u.find(v, start, start + len1) try: w_index = space.call_method(w_u, 'index', w_v, diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py index bd8d7df8b2..f050138cbb 100644 --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -36,14 +36,24 @@ class W_UnicodeObject(W_Root): _immutable_fields_ = ['_utf8'] @enforceargs(utf8str=str) - def __init__(self, utf8str, length): + def __init__(self, utf8str, length, flag): assert isinstance(utf8str, str) assert length >= 0 self._utf8 = utf8str self._length = length - self._index_storage = rutf8.null_storage() - #if not we_are_translated(): - # assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length + if flag == rutf8.FLAG_ASCII: + self._index_storage = rutf8.UTF8_IS_ASCII + elif flag == rutf8.FLAG_HAS_SURROGATES: + self._index_storage = rutf8.UTF8_HAS_SURROGATES + else: + assert flag == rutf8.FLAG_REGULAR + self._index_storage = rutf8.null_storage() + # the storage can be one of: + # - null, unicode with no surrogates + # - rutf8.UTF8_HAS_SURROGATES + # - rutf8.UTF8_IS_ASCII + # - malloced object, which means it has index, then + # _index_storage.flags determines the kind def __repr__(self): """representation for debugging purposes""" @@ -222,7 +232,11 @@ class W_UnicodeObject(W_Root): assert isinstance(w_value, W_UnicodeObject) w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype) - W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length) + W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length, + w_value._get_flag()) + if w_value._index_storage: + # copy the storage if it's there + w_newobj._index_storage = w_value._index_storage return w_newobj def descr_repr(self, space): @@ -326,29 +340,33 @@ class W_UnicodeObject(W_Root): def descr_swapcase(self, space): selfvalue = self._utf8 builder = StringBuilder(len(selfvalue)) + flag = self._get_flag() i = 0 while i < len(selfvalue): ch = rutf8.codepoint_at_pos(selfvalue, i) i = rutf8.next_codepoint_pos(selfvalue, i) if unicodedb.isupper(ch): - rutf8.unichr_as_utf8_append(builder, unicodedb.tolower(ch)) + ch = unicodedb.tolower(ch) elif unicodedb.islower(ch): - rutf8.unichr_as_utf8_append(builder, unicodedb.toupper(ch)) - else: - rutf8.unichr_as_utf8_append(builder, ch) - return W_UnicodeObject(builder.build(), self._length) + ch = unicodedb.toupper(ch) + if ch >= 0x80: + flag = self._combine_flags(flag, rutf8.FLAG_REGULAR) + rutf8.unichr_as_utf8_append(builder, ch) + return W_UnicodeObject(builder.build(), self._length, flag) def descr_title(self, space): if len(self._utf8) == 0: return self - return W_UnicodeObject(self.title(self._utf8), self._len()) + utf8, flag = self.title_unicode(self._utf8) + return W_UnicodeObject(utf8, self._len(), flag) @jit.elidable - def title(self, value): + def title_unicode(self, value): input = self._utf8 builder = StringBuilder(len(input)) i = 0 previous_is_cased = False + flag = self._get_flag() while i < len(input): ch = rutf8.codepoint_at_pos(input, i) i = rutf8.next_codepoint_pos(input, i) @@ -356,14 +374,17 @@ class W_UnicodeObject(W_Root): ch = unicodedb.totitle(ch) else: ch = unicodedb.tolower(ch) + if ch >= 0x80: + flag = self._combine_flags(flag, rutf8.FLAG_REGULAR) rutf8.unichr_as_utf8_append(builder, ch) previous_is_cased = unicodedb.iscased(ch) - return builder.build() + return builder.build(), flag def descr_translate(self, space, w_table): input = self._utf8 result = StringBuilder(len(input)) result_length = 0 + flag = self._get_flag() i = 0 while i < len(input): codepoint = rutf8.codepoint_at_pos(input, i) @@ -380,6 +401,7 @@ class W_UnicodeObject(W_Root): codepoint = space.int_w(w_newval) elif isinstance(w_newval, W_UnicodeObject): result.append(w_newval._utf8) + flag = self._combine_flags(flag, w_newval._get_flag()) result_length += w_newval._length continue else: @@ -387,13 +409,15 @@ class W_UnicodeObject(W_Root): "character mapping must return integer, None " "or unicode") try: + if codepoint >= 0x80: + flag = self._combine_flags(flag, rutf8.FLAG_NORMAL) rutf8.unichr_as_utf8_append(result, codepoint, allow_surrogates=True) result_length += 1 except ValueError: raise oefmt(space.w_TypeError, "character mapping must be in range(0x110000)") - return W_UnicodeObject(result.build(), result_length) + return W_UnicodeObject(result.build(), result_length, flag) def descr_find(self, space, w_sub, w_start=None, w_end=None): w_result = self._unwrap_and_search(space, w_sub, w_start, w_end) @@ -472,7 +496,7 @@ class W_UnicodeObject(W_Root): newlen += dist oldtoken = token - return W_UnicodeObject(expanded, newlen) + return W_UnicodeObject(expanded, newlen, self._get_flag()) _StringMethods_descr_join = descr_join def descr_join(self, space, w_list): @@ -506,11 +530,14 @@ class W_UnicodeObject(W_Root): def descr_lower(self, space): builder = StringBuilder(len(self._utf8)) pos = 0 + flag = self._get_flag() while pos < len(self._utf8): lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos)) + if lower >= 0x80: + flag = self._combine_flags(flag, rutf8.FLAG_REGULAR) rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates? pos = rutf8.next_codepoint_pos(self._utf8, pos) - return W_UnicodeObject(builder.build(), self._len()) + return W_UnicodeObject(builder.build(), self._len(), flag) def descr_isdecimal(self, space): return self._is_generic(space, '_isdecimal') @@ -595,6 +622,22 @@ class W_UnicodeObject(W_Root): return True return endswith(value, prefix, start, end) + @staticmethod + def _combine_flags(self_flag, other_flag): + if self_flag == rutf8.FLAG_ASCII and other_flag == rutf8.FLAG_ASCII: + return rutf8.FLAG_ASCII + elif (self_flag == rutf8.FLAG_HAS_SURROGATES or + other_flag == rutf8.FLAG_HAS_SURROGATES): + return rutf8.FLAG_HAS_SURROGATES + return rutf8.FLAG_REGULAR + + def _get_flag(self): + if self._is_ascii(): + return rutf8.FLAG_ASCII + elif self._has_surrogates(): + return rutf8.FLAG_HAS_SURROGATES + return rutf8.FLAG_REGULAR + def descr_add(self, space, w_other): try: w_other = self.convert_arg_to_w_unicode(space, w_other) @@ -602,8 +645,9 @@ class W_UnicodeObject(W_Root): if e.match(space, space.w_TypeError): return space.w_NotImplemented raise + flag = self._combine_flags(self._get_flag(), w_other._get_flag()) return W_UnicodeObject(self._utf8 + w_other._utf8, - self._len() + w_other._len()) + self._len() + w_other._len(), flag) @jit.look_inside_iff(lambda self, space, list_w, size: jit.loop_unrolling_heuristic(list_w, size)) @@ -613,6 +657,7 @@ class W_UnicodeObject(W_Root): prealloc_size = len(value) * (size - 1) unwrapped = newlist_hint(size) + flag = self._get_flag() for i in range(size): w_s = list_w[i] check_item = self._join_check_item(space, w_s) @@ -625,6 +670,7 @@ class W_UnicodeObject(W_Root): # XXX Maybe the extra copy here is okay? It was basically going to # happen anyway, what with being placed into the builder w_u = self.convert_arg_to_w_unicode(space, w_s) + flag = self._combine_flags(flag, w_u._get_flag()) unwrapped.append(w_u._utf8) lgt += w_u._length prealloc_size += len(unwrapped[i]) @@ -634,7 +680,7 @@ class W_UnicodeObject(W_Root): if value and i != 0: sb.append(value) sb.append(unwrapped[i]) - return W_UnicodeObject(sb.build(), lgt) + return W_UnicodeObject(sb.build(), lgt, flag) @unwrap_spec(keepends=bool) def descr_splitlines(self, space, keepends=False): @@ -663,28 +709,33 @@ class W_UnicodeObject(W_Root): lgt += line_end_chars assert eol >= 0 assert sol >= 0 - strs_w.append(W_UnicodeObject(value[sol:eol], lgt)) + # XXX we can do better with flags here, if we want to + strs_w.append(W_UnicodeObject(value[sol:eol], lgt, self._get_flag())) return space.newlist(strs_w) def descr_upper(self, space): value = self._utf8 builder = StringBuilder(len(value)) + flag = self._get_flag() i = 0 while i < len(value): uchar = rutf8.codepoint_at_pos(value, i) + uchar = unicodedb.toupper(uchar) + if uchar >= 0x80: + flag = self._combine_flags(flag, rutf8.FLAG_REGULAR) i = rutf8.next_codepoint_pos(value, i) - rutf8.unichr_as_utf8_append(builder, unicodedb.toupper(uchar)) - return W_UnicodeObject(builder.build(), self._length) + rutf8.unichr_as_utf8_append(builder, uchar) + return W_UnicodeObject(builder.build(), self._length, flag) @unwrap_spec(width=int) def descr_zfill(self, space, width): selfval = self._utf8 if len(selfval) == 0: - return W_UnicodeObject('0' * width, width) + return W_UnicodeObject('0' * width, width, rutf8.FLAG_ASCII) num_zeros = width - self._len() if num_zeros <= 0: # cannot return self, in case it is a subclass of str - return W_UnicodeObject(selfval, self._len()) + return W_UnicodeObject(selfval, self._len(), self._get_flag()) builder = StringBuilder(num_zeros + len(selfval)) if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'): # copy sign to first position @@ -694,7 +745,7 @@ class W_UnicodeObject(W_Root): start = 0 builder.append_multiple_char('0', num_zeros) builder.append_slice(selfval, start, len(selfval)) - return W_UnicodeObject(builder.build(), width) + return W_UnicodeObject(builder.build(), width, self._get_flag()) @unwrap_spec(maxsplit=int) def descr_split(self, space, w_sep=None, maxsplit=-1): @@ -753,7 +804,7 @@ class W_UnicodeObject(W_Root): break i += 1 byte_pos = self._index_to_byte(start + i * step) - return W_UnicodeObject(builder.build(), sl) + return W_UnicodeObject(builder.build(), sl, self._get_flag()) def descr_getslice(self, space, w_start, w_stop): start, stop = normalize_simple_slice( @@ -770,22 +821,30 @@ class W_UnicodeObject(W_Root): assert stop >= 0 byte_start = self._index_to_byte(start) byte_stop = self._index_to_byte(stop) - return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start) + return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start, + self._get_flag()) def descr_capitalize(self, space): value = self._utf8 if len(value) == 0: return self._empty() + flag = self._get_flag() builder = StringBuilder(len(value)) uchar = rutf8.codepoint_at_pos(value, 0) i = rutf8.next_codepoint_pos(value, 0) - rutf8.unichr_as_utf8_append(builder, unicodedb.toupper(uchar)) + ch = unicodedb.toupper(uchar) + rutf8.unichr_as_utf8_append(builder, ch) + if ch >= 0x80: + flag = self._combine_flags(flag, rutf8.FLAG_REGULAR) while i < len(value): uchar = rutf8.codepoint_at_pos(value, i) i = rutf8.next_codepoint_pos(value, i) - rutf8.unichr_as_utf8_append(builder, unicodedb.tolower(uchar)) - return W_UnicodeObject(builder.build(), self._len()) + ch = unicodedb.tolower(uchar) + rutf8.unichr_as_utf8_append(builder, ch) + if ch >= 0x80: + flag = self._combine_flags(flag, rutf8.FLAG_REGULAR) + return W_UnicodeObject(builder.build(), self._len(), flag) @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) def descr_center(self, space, width, w_fillchar): @@ -804,7 +863,7 @@ class W_UnicodeObject(W_Root): centered = value d = 0 - return W_UnicodeObject(centered, self._len() + d) + return W_UnicodeObject(centered, self._len() + d, self._get_flag()) def descr_count(self, space, w_sub, w_start=None, w_end=None): value = self._utf8 @@ -830,11 +889,11 @@ class W_UnicodeObject(W_Root): if pos < 0: return space.newtuple([self, self._empty(), self._empty()]) else: - lgt = rutf8.check_utf8(value, True, stop=pos) + lgt, _ = rutf8.check_utf8(value, True, stop=pos) return space.newtuple( - [W_UnicodeObject(value[0:pos], lgt), w_sub, + [W_UnicodeObject(value[0:pos], lgt, self._get_flag()), w_sub, W_UnicodeObject(value[pos + len(sub._utf8):len(value)], - self._len() - lgt - sublen)]) + self._len() - lgt - sublen, self._get_flag())]) def descr_rpartition(self, space, w_sub): value = self._utf8 @@ -848,11 +907,11 @@ class W_UnicodeObject(W_Root): if pos < 0: return space.newtuple([self._empty(), self._empty(), self]) else: - lgt = rutf8.check_utf8(value, True, stop=pos) + lgt, _ = rutf8.check_utf8(value, True, stop=pos) return space.newtuple( - [W_UnicodeObject(value[0:pos], lgt), w_sub, + [W_UnicodeObject(value[0:pos], lgt, self._get_flag()), w_sub, W_UnicodeObject(value[pos + len(sub._utf8):len(value)], - self._len() - lgt - sublen)]) + self._len() - lgt - sublen, self._get_flag())]) @unwrap_spec(count=int) def descr_replace(self, space, w_old, w_new, count=-1): @@ -870,8 +929,9 @@ class W_UnicodeObject(W_Root): except OverflowError: raise oefmt(space.w_OverflowError, "replace string is too long") + flag = self._combine_flags(self._get_flag(), w_by._get_flag()) newlength = self._length + replacements * (w_by._length - w_sub._length) - return W_UnicodeObject(res, newlength) + return W_UnicodeObject(res, newlength, flag) def descr_mul(self, space, w_times): try: @@ -883,16 +943,29 @@ class W_UnicodeObject(W_Root): if times <= 0: return self._empty() if len(self._utf8) == 1: - return W_UnicodeObject(self._utf8[0] * times, times) - return W_UnicodeObject(self._utf8 * times, times * self._len()) + return W_UnicodeObject(self._utf8[0] * times, times, + self._get_flag()) + return W_UnicodeObject(self._utf8 * times, times * self._len(), + self._get_flag()) descr_rmul = descr_mul def _get_index_storage(self): - storage = jit.conditional_call_elidable(self._index_storage, - rutf8.create_utf8_index_storage, self._utf8, self._length) + # XXX write the correct jit.elidable + condition = (self._index_storage == rutf8.null_storage() or + not bool(self._index_storage.contents)) + if condition: + storage = rutf8.create_utf8_index_storage(self._utf8, self._length) + else: + storage = self._index_storage if not jit.isconstant(self): + prev_storage = self._index_storage self._index_storage = storage + if prev_storage == rutf8.UTF8_HAS_SURROGATES: + flag = rutf8.FLAG_HAS_SURROGATES + else: + flag = rutf8.FLAG_REGULAR + self._index_storage.flag = flag return storage def _getitem_result(self, space, index): @@ -902,9 +975,19 @@ class W_UnicodeObject(W_Root): raise oefmt(space.w_IndexError, "string index out of range") start = self._index_to_byte(index) end = rutf8.next_codepoint_pos(self._utf8, start) - return W_UnicodeObject(self._utf8[start:end], 1) + return W_UnicodeObject(self._utf8[start:end], 1, self._get_flag()) + + def _is_ascii(self): + return self._index_storage is rutf8.UTF8_IS_ASCII + + def _has_surrogates(self): + return (self._index_storage is rutf8.UTF8_HAS_SURROGATES or + (bool(self._index_storage) and + self._index_storage.flag == rutf8.FLAG_HAS_SURROGATES)) def _index_to_byte(self, index): + if self._is_ascii(): + return index return rutf8.codepoint_position_at_index( self._utf8, self._get_index_storage(), index) @@ -967,6 +1050,7 @@ class W_UnicodeObject(W_Root): if w_fillchar._len() != 1: raise oefmt(space.w_TypeError, "rjust() argument 2 must be a single character") + flag = self._combine_flags(self._get_flag(), w_fillchar._get_flag()) d = width - lgt if d > 0: if len(w_fillchar._utf8) == 1: @@ -974,9 +1058,9 @@ class W_UnicodeObject(W_Root): value = d * w_fillchar._utf8[0] + value else: value = d * w_fillchar._utf8 + value - return W_UnicodeObject(value, width) + return W_UnicodeObject(value, width, flag) - return W_UnicodeObject(value, lgt) + return W_UnicodeObject(value, lgt, flag) @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) def descr_ljust(self, space, width, w_fillchar): @@ -985,6 +1069,7 @@ class W_UnicodeObject(W_Root): if w_fillchar._len() != 1: raise oefmt(space.w_TypeError, "ljust() argument 2 must be a single character") + flag = self._combine_flags(self._get_flag(), w_fillchar._get_flag()) d = width - self._len() if d > 0: if len(w_fillchar._utf8) == 1: @@ -992,9 +1077,9 @@ class W_UnicodeObject(W_Root): value = value + d * w_fillchar._utf8[0] else: value = value + d * w_fillchar._utf8 - return W_UnicodeObject(value, width) + return W_UnicodeObject(value, width, flag) - return W_UnicodeObject(value, self._len()) + return W_UnicodeObject(value, self._len(), flag) def _utf8_sliced(self, start, stop, lgt): assert start >= 0 @@ -1002,7 +1087,7 @@ class W_UnicodeObject(W_Root): #if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj), # space.w_bytes): # return orig_obj - return W_UnicodeObject(self._utf8[start:stop], lgt) + return W_UnicodeObject(self._utf8[start:stop], lgt, self._get_flag()) def _strip_none(self, space, left, right): "internal function called by str_xstrip methods" @@ -1050,7 +1135,7 @@ class W_UnicodeObject(W_Root): return self._utf8_sliced(lpos, rpos, lgt) def descr_getnewargs(self, space): - return space.newtuple([W_UnicodeObject(self._utf8, self._length)]) + return space.newtuple([W_UnicodeObject(self._utf8, self._length, self._get_flag())]) @@ -1135,11 +1220,11 @@ def decode_object(space, w_obj, encoding, errors): if encoding == 'ascii': s = space.charbuf_w(w_obj) unicodehelper.check_ascii_or_raise(space, s) - return space.newutf8(s, len(s)) + return space.newutf8(s, len(s), rutf8.FLAG_ASCII) if encoding == 'utf-8': s = space.charbuf_w(w_obj) - lgt = unicodehelper.check_utf8_or_raise(space, s) - return space.newutf8(s, lgt) + lgt, flag = unicodehelper.check_utf8_or_raise(space, s) + return space.newutf8(s, lgt, flag) w_codecs = space.getbuiltinmodule("_codecs") w_decode = space.getattr(w_codecs, space.newtext("decode")) if errors is None: @@ -1194,7 +1279,7 @@ def unicode_from_string(space, w_bytes): return unicode_from_encoded_object(space, w_bytes, encoding, "strict") s = space.bytes_w(w_bytes) unicodehelper.check_ascii_or_raise(space, s) - return W_UnicodeObject(s, len(s)) + return W_UnicodeObject(s, len(s), rutf8.FLAG_ASCII) class UnicodeDocstrings: @@ -1741,7 +1826,7 @@ def _create_list_from_unicode(value): return [s for s in value] -W_UnicodeObject.EMPTY = W_UnicodeObject('', 0) +W_UnicodeObject.EMPTY = W_UnicodeObject('', 0, rutf8.FLAG_ASCII) # Helper for converting int/long diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py index 83347029ca..f06c9fbd2b 100644 --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -305,14 +305,14 @@ def _invalid_byte_2_of_4(ordch1, ordch2): def check_utf8(s, allow_surrogates, start=0, stop=-1): """Check that 's' is a utf-8-encoded byte string. - Returns the length (number of chars) and flags or raise CheckError. + Returns the length (number of chars) and flag or raise CheckError. If allow_surrogates is False, then also raise if we see any. Note also codepoints_in_utf8(), which also computes the length faster by assuming that 's' is valid utf-8. """ - res, flags = _check_utf8(s, allow_surrogates, start, stop) + res, flag = _check_utf8(s, allow_surrogates, start, stop) if res >= 0: - return res, flags + return res, flag raise CheckError(~res) @jit.elidable @@ -416,12 +416,13 @@ def surrogate_in_utf8(value): return False -UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct( - 'utf8_loc', - ('baseindex', lltype.Signed), +UTF8_INDEX_STORAGE = lltype.GcStruct('utf8_loc', ('flag', lltype.Signed), - ('ofs', lltype.FixedSizeArray(lltype.Char, 16)) - )) + ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct( + 'utf8_loc_elem', + ('baseindex', lltype.Signed), + ('ofs', lltype.FixedSizeArray(lltype.Char, 16))) + )))) FLAG_REGULAR = 0 FLAG_HAS_SURROGATES = 1 @@ -429,43 +430,47 @@ FLAG_ASCII = 2 # note that we never need index storage if we're pure ascii, but it's useful # for passing into W_UnicodeObject.__init__ -ASCII_INDEX_STORAGE_BLOCKS = 5 -ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE, - ASCII_INDEX_STORAGE_BLOCKS, - immortal=True) -for _i in range(ASCII_INDEX_STORAGE_BLOCKS): - ASCII_INDEX_STORAGE[_i].baseindex = _i * 64 - for _j in range(16): - ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1) +#ASCII_INDEX_STORAGE_BLOCKS = 5 +#ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO, +# ASCII_INDEX_STORAGE_BLOCKS, +# immortal=True) +#for _i in range(ASCII_INDEX_STORAGE_BLOCKS): +# ASCII_INDEX_STORAGE[_i].baseindex = _i * 64 +# for _j in range(16): +# ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1) def null_storage(): return lltype.nullptr(UTF8_INDEX_STORAGE) -UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True) -UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True) +UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True) +UTF8_IS_ASCII.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO) +UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True) +UTF8_HAS_SURROGATES.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO) def create_utf8_index_storage(utf8, utf8len): """ Create an index storage which stores index of each 4th character in utf8 encoded unicode string. """ - if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64: - return ASCII_INDEX_STORAGE +# if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64: +# return ASCII_INDEX_STORAGE arraysize = utf8len // 64 + 1 - storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize) + storage = lltype.malloc(UTF8_INDEX_STORAGE) + contents = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO, arraysize) + storage.contents = contents baseindex = 0 current = 0 while True: - storage[current].baseindex = baseindex + contents[current].baseindex = baseindex next = baseindex for i in range(16): if utf8len == 0: next += 1 # assume there is an extra '\x00' character else: next = next_codepoint_pos(utf8, next) - storage[current].ofs[i] = chr(next - baseindex) + contents[current].ofs[i] = chr(next - baseindex) utf8len -= 4 if utf8len < 0: - assert current + 1 == len(storage) + assert current + 1 == len(contents) break next = next_codepoint_pos(utf8, next) next = next_codepoint_pos(utf8, next) @@ -485,8 +490,8 @@ def codepoint_position_at_index(utf8, storage, index): this function. """ current = index >> 6 - ofs = ord(storage[current].ofs[(index >> 2) & 0x0F]) - bytepos = storage[current].baseindex + ofs + ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F]) + bytepos = storage.contents[current].baseindex + ofs index &= 0x3 if index == 0: return prev_codepoint_pos(utf8, bytepos) @@ -504,8 +509,8 @@ def codepoint_at_index(utf8, storage, index): storage of type UTF8_INDEX_STORAGE """ current = index >> 6 - ofs = ord(storage[current].ofs[(index >> 2) & 0x0F]) - bytepos = storage[current].baseindex + ofs + ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F]) + bytepos = storage.contents[current].baseindex + ofs index &= 0x3 if index == 0: return codepoint_before_pos(utf8, bytepos) |