author     fijal <unknown>                            2017-11-04 14:38:37 +0100
committer  fijal <unknown>                            2017-11-04 14:38:37 +0100
commit     d0b8d0c46b66559a51b0d0c0fe79b2cd3d1e26c4 (patch)
tree       749d690a0a1790aeda5311225f8e32f0b9d77230
parent     * Return a flag from check_utf8. (diff)
download   pypy-d0b8d0c46b66559a51b0d0c0fe79b2cd3d1e26c4.tar.gz
           pypy-d0b8d0c46b66559a51b0d0c0fe79b2cd3d1e26c4.tar.bz2
           pypy-d0b8d0c46b66559a51b0d0c0fe79b2cd3d1e26c4.zip
progress on having flags correctly propagated, almost there
-rw-r--r--  TODO                                             4
-rw-r--r--  pypy/interpreter/baseobjspace.py                 4
-rw-r--r--  pypy/interpreter/pyparser/parsestring.py         7
-rw-r--r--  pypy/interpreter/unicodehelper.py               49
-rw-r--r--  pypy/module/__builtin__/operation.py             8
-rw-r--r--  pypy/module/_codecs/interp_codecs.py            25
-rw-r--r--  pypy/objspace/std/marshal_impl.py                4
-rw-r--r--  pypy/objspace/std/objspace.py                    8
-rw-r--r--  pypy/objspace/std/test/test_unicodeobject.py     9
-rw-r--r--  pypy/objspace/std/unicodeobject.py             191
-rw-r--r--  rpython/rlib/rutf8.py                           61
11 files changed, 252 insertions, 118 deletions
diff --git a/TODO b/TODO
index 40fbe69731..85d32f43b5 100644
--- a/TODO
+++ b/TODO
@@ -4,3 +4,7 @@
if one is not already readily available
* fix _pypyjson
* fix cpyext
+* write the correct jit_elidable in _get_index_storage
+* better flag handling in split/splitlines maybe?
+* find all the fast-paths that we want to do with utf8 (we only do
+ utf-8 now, not UTF8 or utf8) for decode/encode
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
index 20f439471a..72655b4f13 100644
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1764,8 +1764,10 @@ class ObjSpace(object):
return self.realutf8_w(w_obj).decode('utf8')
def newunicode(self, u):
+ from pypy.interpreter import unicodehelper
assert isinstance(u, unicode)
- return self.newutf8(u.encode("utf8"), len(u))
+ # XXX let's disallow that
+ return self.newutf8(u.encode("utf8"), len(u), unicodehelper._get_flag(u))
def convert_to_w_unicode(self, w_obj):
return w_obj.convert_to_w_unicode(self)
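The change above makes the extra flag part of the space-level constructor signature. A minimal sketch of the new calling convention, assuming the rutf8 flag constants introduced later in this patch; the two helper function names are illustrative, not part of the patch:

    from rpython.rlib import rutf8

    def wrap_ascii_text(space, text):
        # ASCII bytes are already valid UTF-8 and each byte is one codepoint,
        # so the length equals len(text) and the flag is FLAG_ASCII.
        return space.newutf8(text, len(text), rutf8.FLAG_ASCII)

    def wrap_checked_utf8(space, utf8_bytes):
        # For arbitrary bytes, check_utf8() now returns (length, flag); the
        # flag is forwarded unchanged into the wrapped W_UnicodeObject.
        length, flag = rutf8.check_utf8(utf8_bytes, allow_surrogates=True)
        return space.newutf8(utf8_bytes, length, flag)
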
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
index 514636d470..974c6f3c55 100644
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -59,10 +59,11 @@ def parsestr(space, encoding, s, unicode_literal=False):
else:
substr = decode_unicode_utf8(space, s, ps, q)
if rawmode:
- v, length = unicodehelper.decode_raw_unicode_escape(space, substr)
+ r = unicodehelper.decode_raw_unicode_escape(space, substr)
else:
- v, length = unicodehelper.decode_unicode_escape(space, substr)
- return space.newutf8(v, length)
+ r = unicodehelper.decode_unicode_escape(space, substr)
+ v, length, flag = r
+ return space.newutf8(v, length, flag)
need_encoding = (encoding is not None and
encoding != "utf-8" and encoding != "utf8" and
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
index 0212f7b790..c41ed1feec 100644
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -20,11 +20,11 @@ def decode_error_handler(space):
@specialize.memo()
def encode_error_handler(space):
# Fast version of the "strict" errors handler.
- def raise_unicode_exception_encode(errors, encoding, msg, u, u_len,
+ def raise_unicode_exception_encode(errors, encoding, msg, w_u,
startingpos, endingpos):
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([space.newtext(encoding),
- space.newutf8(u, u_len),
+ w_u,
space.newint(startingpos),
space.newint(endingpos),
space.newtext(msg)]))
@@ -41,6 +41,21 @@ def encode(space, w_data, encoding=None, errors='strict'):
from pypy.objspace.std.unicodeobject import encode_object
return encode_object(space, w_data, encoding, errors)
+def _has_surrogate(u):
+ for c in u:
if 0xD800 <= ord(c) <= 0xDFFF:
+ return True
+ return False
+
+def _get_flag(u):
+ flag = rutf8.FLAG_ASCII
+ for c in u:
if 0xD800 <= ord(c) <= 0xDFFF:
+ return rutf8.FLAG_HAS_SURROGATES
+ if ord(c) >= 0x80:
+ flag = rutf8.FLAG_REGULAR
+ return flag
+
# These functions take and return unwrapped rpython strings and unicodes
def decode_unicode_escape(space, string):
state = space.fromcache(interp_codecs.CodecState)
@@ -52,7 +67,14 @@ def decode_unicode_escape(space, string):
final=True, errorhandler=DecodeWrapper(decode_error_handler(space)).handle,
unicodedata_handler=unicodedata_handler)
# XXX argh. we want each surrogate to be encoded separately
- return ''.join([u.encode('utf8') for u in result_u]), len(result_u)
+ utf8 = ''.join([u.encode('utf8') for u in result_u])
+ if rutf8.first_non_ascii_char(utf8) == -1:
+ flag = rutf8.FLAG_ASCII
+ elif _has_surrogate(result_u):
+ flag = rutf8.FLAG_HAS_SURROGATES
+ else:
+ flag = rutf8.FLAG_REGULAR
+ return utf8, len(result_u), flag
def decode_raw_unicode_escape(space, string):
# XXX pick better length, maybe
@@ -61,7 +83,14 @@ def decode_raw_unicode_escape(space, string):
string, len(string), "strict",
final=True, errorhandler=DecodeWrapper(decode_error_handler(space)).handle)
# XXX argh. we want each surrogate to be encoded separately
- return ''.join([u.encode('utf8') for u in result_u]), len(result_u)
+ utf8 = ''.join([u.encode('utf8') for u in result_u])
+ if rutf8.first_non_ascii_char(utf8) == -1:
+ flag = rutf8.FLAG_ASCII
+ elif _has_surrogate(result_u):
+ flag = rutf8.FLAG_HAS_SURROGATES
+ else:
+ flag = rutf8.FLAG_REGULAR
+ return utf8, len(result_u), flag
def check_ascii_or_raise(space, string):
try:
@@ -78,12 +107,12 @@ def check_utf8_or_raise(space, string):
# you still get two surrogate unicode characters in the result.
# These are the Python2 rules; Python3 differs.
try:
- length = rutf8.check_utf8(string, allow_surrogates=True)
+ length, flag = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError as e:
decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
e.pos, e.pos + 1)
assert False, "unreachable"
- return length
+ return length, flag
def encode_utf8(space, uni):
# DEPRECATED
@@ -116,7 +145,7 @@ def str_decode_ascii(s, slen, errors, final, errorhandler):
except rutf8.CheckError:
w = DecodeWrapper((errorhandler))
u, pos = runicode.str_decode_ascii(s, slen, errors, final, w.handle)
- return u.encode('utf8'), pos, len(u)
+ return u.encode('utf8'), pos, len(u), _get_flag(u)
# XXX wrappers, think about speed
@@ -139,14 +168,14 @@ def str_decode_utf8(s, slen, errors, final, errorhandler):
w = DecodeWrapper(errorhandler)
u, pos = runicode.str_decode_utf_8_impl(s, slen, errors, final, w.handle,
runicode.allow_surrogate_by_default)
- return u.encode('utf8'), pos, len(u)
+ return u.encode('utf8'), pos, len(u), _get_flag(u)
def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler):
w = DecodeWrapper(errorhandler)
u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final,
w.handle,
ud_handler)
- return u.encode('utf8'), pos, len(u)
+ return u.encode('utf8'), pos, len(u), _get_flag(u)
def setup_new_encoders(encoding):
encoder_name = 'utf8_encode_' + encoding
@@ -160,7 +189,7 @@ def setup_new_encoders(encoding):
def decoder(s, slen, errors, final, errorhandler):
w = DecodeWrapper((errorhandler))
u, pos = getattr(runicode, decoder_name)(s, slen, errors, final, w.handle)
- return u.encode('utf8'), pos, len(u)
+ return u.encode('utf8'), pos, len(u), _get_flag(u)
encoder.__name__ = encoder_name
decoder.__name__ = decoder_name
if encoder_name not in globals():
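The decoders above all reduce to a three-way classification of the decoded text. A plain-Python sketch of that mapping, assuming the conventional surrogate range U+D800..U+DFFF and the flag values defined in rutf8.py further down; classify() is an illustrative stand-in for _get_flag():

    FLAG_REGULAR, FLAG_HAS_SURROGATES, FLAG_ASCII = 0, 1, 2  # values from rutf8.py

    def classify(codepoints):
        flag = FLAG_ASCII
        for cp in codepoints:
            if 0xD800 <= cp <= 0xDFFF:        # assumed surrogate range
                return FLAG_HAS_SURROGATES    # strongest flag, stop early
            if cp >= 0x80:
                flag = FLAG_REGULAR
        return flag

    assert classify([0x41, 0x42]) == FLAG_ASCII             # "AB"
    assert classify([0x41, 0xE9]) == FLAG_REGULAR           # "A" + e-acute
    assert classify([0x41, 0xD800]) == FLAG_HAS_SURROGATES  # lone surrogate
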
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
index 0661f0157f..dd5797717e 100644
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -28,7 +28,13 @@ def unichr(space, code):
s = rutf8.unichr_as_utf8(code, allow_surrogates=True)
except ValueError:
raise oefmt(space.w_ValueError, "unichr() arg out of range")
- return space.newutf8(s, 1)
+ if code < 0x80:
+ flag = rutf8.FLAG_ASCII
+ elif 0xD800 <= code <= 0xDFFF:
+ flag = rutf8.FLAG_HAS_SURROGATES
+ else:
+ flag = rutf8.FLAG_REGULAR
+ return space.newutf8(s, 1, flag)
def len(space, w_obj):
"len(object) -> integer\n\nReturn the number of items of a sequence or mapping."
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
index 26e9d0c548..f86212df59 100644
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -39,8 +39,8 @@ class CodecState(object):
w_input = space.newbytes(input)
else:
w_cls = space.w_UnicodeEncodeError
- length = rutf8.check_utf8(input, allow_surrogates=True)
- w_input = space.newutf8(input, length)
+ length, flag = rutf8.check_utf8(input, allow_surrogates=True)
+ w_input = space.newutf8(input, length, flag)
w_exc = space.call_function(
w_cls,
space.newtext(encoding),
@@ -189,7 +189,7 @@ def strict_errors(space, w_exc):
def ignore_errors(space, w_exc):
check_exception(space, w_exc)
w_end = space.getattr(w_exc, space.newtext('end'))
- return space.newtuple([space.newutf8('', 0), w_end])
+ return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), w_end])
REPLACEMENT = u'\ufffd'.encode('utf8')
@@ -200,13 +200,13 @@ def replace_errors(space, w_exc):
size = space.int_w(w_end) - space.int_w(w_start)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
text = '?' * size
- return space.newtuple([space.newutf8(text, size), w_end])
+ return space.newtuple([space.newutf8(text, size, rutf8.FLAG_ASCII), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
text = REPLACEMENT
- return space.newtuple([space.newutf8(text, 1), w_end])
+ return space.newtuple([space.newutf8(text, 1, rutf8.FLAG_REGULAR), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
text = REPLACEMENT * size
- return space.newtuple([space.newutf8(text, size), w_end])
+ return space.newtuple([space.newutf8(text, size, rutf8.FLAG_REGULAR), w_end])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
@@ -403,9 +403,9 @@ def make_decoder_wrapper(name):
final = space.is_true(w_final)
state = space.fromcache(CodecState)
func = getattr(unicodehelper, rname)
- result, consumed, length = func(string, len(string), errors,
- final, state.decode_error_handler)
- return space.newtuple([space.newutf8(result, length),
+ result, consumed, length, flag = func(string, len(string), errors,
+ final, state.decode_error_handler)
+ return space.newtuple([space.newutf8(result, length, flag),
space.newint(consumed)])
wrap_decoder.func_name = rname
globals()[name] = wrap_decoder
@@ -448,7 +448,7 @@ if hasattr(runicode, 'str_decode_mbcs'):
# "allow_surrogates=True"
@unwrap_spec(utf8='utf8', errors='text_or_none')
def utf_8_encode(space, utf8, errors="strict"):
- length = rutf8.check_utf8(utf8, allow_surrogates=True)
+ length, _ = rutf8.check_utf8(utf8, allow_surrogates=True)
return space.newtuple([space.newbytes(utf8), space.newint(length)])
#@unwrap_spec(uni=unicode, errors='text_or_none')
#def utf_8_encode(space, uni, errors="strict"):
@@ -474,16 +474,17 @@ def utf_8_decode(space, string, errors="strict", w_final=None):
state = space.fromcache(CodecState)
# call the fast version for checking
try:
- lgt = rutf8.check_utf8(string, allow_surrogates=True)
+ lgt, flag = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError as e:
# XXX do the way around runicode - we can optimize it later if we
# decide we care about obscure cases
+ xxx
res, consumed, lgt = unicodehelper.str_decode_utf8(string, len(string),
errors, final, state.decode_error_handler)
return space.newtuple([space.newutf8(res, lgt),
space.newint(consumed)])
else:
- return space.newtuple([space.newutf8(string, lgt),
+ return space.newtuple([space.newutf8(string, lgt, flag),
space.newint(len(string))])
@unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int,
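utf_8_decode() above now gets both the codepoint count and the flag from a single check_utf8() pass over the input, so well-formed bytes are wrapped without re-encoding. A hedged sketch of that fast path (the function name is illustrative; the malformed-input fallback is left out, since the patch itself still marks it unfinished with the xxx placeholder):

    from rpython.rlib import rutf8

    def utf8_decode_fast_path(space, string):
        # Raises rutf8.CheckError for malformed input, which the real code
        # routes to the slower runicode-based decoder.
        lgt, flag = rutf8.check_utf8(string, allow_surrogates=True)
        # The checked bytes are reused as-is; only the codepoint count and
        # the classification flag are newly computed.
        return space.newtuple([space.newutf8(string, lgt, flag),
                               space.newint(len(string))])
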
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
index 038cbf3e94..b8f9e56006 100644
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -403,8 +403,8 @@ def marshal_unicode(space, w_unicode, m):
@unmarshaller(TYPE_UNICODE)
def unmarshal_unicode(space, u, tc):
arg = u.get_str()
- length = unicodehelper.check_utf8_or_raise(space, arg)
- return space.newutf8(arg, length)
+ length, flag = unicodehelper.check_utf8_or_raise(space, arg)
+ return space.newutf8(arg, length, flag)
@marshaller(W_SetObject)
def marshal_set(space, w_set, m):
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
index f1896c2909..0b67c7861f 100644
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -317,8 +317,8 @@ class StdObjSpace(ObjSpace):
for utf in lst:
assert utf is not None
assert isinstance(utf, str)
- length = rutf8.check_utf8(utf, allow_surrogates=True)
- res_w.append(self.newutf8(utf, length))
+ length, flag = rutf8.check_utf8(utf, allow_surrogates=True)
+ res_w.append(self.newutf8(utf, length, flag))
return self.newlist(res_w)
def newlist_int(self, list_i):
@@ -369,10 +369,10 @@ class StdObjSpace(ObjSpace):
return self.w_None
return self.newtext(s)
- def newutf8(self, utf8s, length):
+ def newutf8(self, utf8s, length, flag):
assert utf8s is not None
assert isinstance(utf8s, str)
- return W_UnicodeObject(utf8s, length)
+ return W_UnicodeObject(utf8s, length, flag)
def newfilename(self, s):
assert isinstance(s, str) # on pypy3, this decodes the byte string
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
index a860d1c20a..9620f6e3cc 100644
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -3,6 +3,7 @@
import py
import sys
from hypothesis import given, strategies, settings, example
+from rpython.rlib import rutf8
from pypy.interpreter.error import OperationError
@@ -27,12 +28,12 @@ class TestUnicodeObject:
def test_listview_unicode(self):
py.test.skip("skip for new")
- w_str = self.space.wrap(u'abcd')
+ w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
assert self.space.listview_unicode(w_str) == list(u"abcd")
def test_new_shortcut(self):
space = self.space
- w_uni = self.space.wrap(u'abcd')
+ w_uni = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
w_new = space.call_method(
space.w_unicode, "__new__", space.w_unicode, w_uni)
assert w_new is w_uni
@@ -44,8 +45,8 @@ class TestUnicodeObject:
return # skip this case
v = u[start : start + len1]
space = self.space
- w_u = space.wrap(u)
- w_v = space.wrap(v)
+ w_u = space.newutf8(u.encode('utf8'), len(u), rutf8.FLAG_REGULAR)
+ w_v = space.newutf8(v.encode('utf8'), len(v), rutf8.FLAG_REGULAR)
expected = u.find(v, start, start + len1)
try:
w_index = space.call_method(w_u, 'index', w_v,
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
index bd8d7df8b2..f050138cbb 100644
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -36,14 +36,24 @@ class W_UnicodeObject(W_Root):
_immutable_fields_ = ['_utf8']
@enforceargs(utf8str=str)
- def __init__(self, utf8str, length):
+ def __init__(self, utf8str, length, flag):
assert isinstance(utf8str, str)
assert length >= 0
self._utf8 = utf8str
self._length = length
- self._index_storage = rutf8.null_storage()
- #if not we_are_translated():
- # assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length
+ if flag == rutf8.FLAG_ASCII:
+ self._index_storage = rutf8.UTF8_IS_ASCII
+ elif flag == rutf8.FLAG_HAS_SURROGATES:
+ self._index_storage = rutf8.UTF8_HAS_SURROGATES
+ else:
+ assert flag == rutf8.FLAG_REGULAR
+ self._index_storage = rutf8.null_storage()
+ # the storage can be one of:
+ # - null, unicode with no surrogates
+ # - rutf8.UTF8_HAS_SURROGATES
+ # - rutf8.UTF8_IS_ASCII
+ # - malloced object, which means it has index, then
+ # _index_storage.flags determines the kind
def __repr__(self):
"""representation for debugging purposes"""
@@ -222,7 +232,11 @@ class W_UnicodeObject(W_Root):
assert isinstance(w_value, W_UnicodeObject)
w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
- W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length)
+ W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length,
+ w_value._get_flag())
+ if w_value._index_storage:
+ # copy the storage if it's there
+ w_newobj._index_storage = w_value._index_storage
return w_newobj
def descr_repr(self, space):
@@ -326,29 +340,33 @@ class W_UnicodeObject(W_Root):
def descr_swapcase(self, space):
selfvalue = self._utf8
builder = StringBuilder(len(selfvalue))
+ flag = self._get_flag()
i = 0
while i < len(selfvalue):
ch = rutf8.codepoint_at_pos(selfvalue, i)
i = rutf8.next_codepoint_pos(selfvalue, i)
if unicodedb.isupper(ch):
- rutf8.unichr_as_utf8_append(builder, unicodedb.tolower(ch))
+ ch = unicodedb.tolower(ch)
elif unicodedb.islower(ch):
- rutf8.unichr_as_utf8_append(builder, unicodedb.toupper(ch))
- else:
- rutf8.unichr_as_utf8_append(builder, ch)
- return W_UnicodeObject(builder.build(), self._length)
+ ch = unicodedb.toupper(ch)
+ if ch >= 0x80:
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+ rutf8.unichr_as_utf8_append(builder, ch)
+ return W_UnicodeObject(builder.build(), self._length, flag)
def descr_title(self, space):
if len(self._utf8) == 0:
return self
- return W_UnicodeObject(self.title(self._utf8), self._len())
+ utf8, flag = self.title_unicode(self._utf8)
+ return W_UnicodeObject(utf8, self._len(), flag)
@jit.elidable
- def title(self, value):
+ def title_unicode(self, value):
input = self._utf8
builder = StringBuilder(len(input))
i = 0
previous_is_cased = False
+ flag = self._get_flag()
while i < len(input):
ch = rutf8.codepoint_at_pos(input, i)
i = rutf8.next_codepoint_pos(input, i)
@@ -356,14 +374,17 @@ class W_UnicodeObject(W_Root):
ch = unicodedb.totitle(ch)
else:
ch = unicodedb.tolower(ch)
+ if ch >= 0x80:
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(builder, ch)
previous_is_cased = unicodedb.iscased(ch)
- return builder.build()
+ return builder.build(), flag
def descr_translate(self, space, w_table):
input = self._utf8
result = StringBuilder(len(input))
result_length = 0
+ flag = self._get_flag()
i = 0
while i < len(input):
codepoint = rutf8.codepoint_at_pos(input, i)
@@ -380,6 +401,7 @@ class W_UnicodeObject(W_Root):
codepoint = space.int_w(w_newval)
elif isinstance(w_newval, W_UnicodeObject):
result.append(w_newval._utf8)
+ flag = self._combine_flags(flag, w_newval._get_flag())
result_length += w_newval._length
continue
else:
@@ -387,13 +409,15 @@ class W_UnicodeObject(W_Root):
"character mapping must return integer, None "
"or unicode")
try:
+ if codepoint >= 0x80:
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(result, codepoint,
allow_surrogates=True)
result_length += 1
except ValueError:
raise oefmt(space.w_TypeError,
"character mapping must be in range(0x110000)")
- return W_UnicodeObject(result.build(), result_length)
+ return W_UnicodeObject(result.build(), result_length, flag)
def descr_find(self, space, w_sub, w_start=None, w_end=None):
w_result = self._unwrap_and_search(space, w_sub, w_start, w_end)
@@ -472,7 +496,7 @@ class W_UnicodeObject(W_Root):
newlen += dist
oldtoken = token
- return W_UnicodeObject(expanded, newlen)
+ return W_UnicodeObject(expanded, newlen, self._get_flag())
_StringMethods_descr_join = descr_join
def descr_join(self, space, w_list):
@@ -506,11 +530,14 @@ class W_UnicodeObject(W_Root):
def descr_lower(self, space):
builder = StringBuilder(len(self._utf8))
pos = 0
+ flag = self._get_flag()
while pos < len(self._utf8):
lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
+ if lower >= 0x80:
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates?
pos = rutf8.next_codepoint_pos(self._utf8, pos)
- return W_UnicodeObject(builder.build(), self._len())
+ return W_UnicodeObject(builder.build(), self._len(), flag)
def descr_isdecimal(self, space):
return self._is_generic(space, '_isdecimal')
@@ -595,6 +622,22 @@ class W_UnicodeObject(W_Root):
return True
return endswith(value, prefix, start, end)
+ @staticmethod
+ def _combine_flags(self_flag, other_flag):
+ if self_flag == rutf8.FLAG_ASCII and other_flag == rutf8.FLAG_ASCII:
+ return rutf8.FLAG_ASCII
+ elif (self_flag == rutf8.FLAG_HAS_SURROGATES or
+ other_flag == rutf8.FLAG_HAS_SURROGATES):
+ return rutf8.FLAG_HAS_SURROGATES
+ return rutf8.FLAG_REGULAR
+
+ def _get_flag(self):
+ if self._is_ascii():
+ return rutf8.FLAG_ASCII
+ elif self._has_surrogates():
+ return rutf8.FLAG_HAS_SURROGATES
+ return rutf8.FLAG_REGULAR
+
def descr_add(self, space, w_other):
try:
w_other = self.convert_arg_to_w_unicode(space, w_other)
@@ -602,8 +645,9 @@ class W_UnicodeObject(W_Root):
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
raise
+ flag = self._combine_flags(self._get_flag(), w_other._get_flag())
return W_UnicodeObject(self._utf8 + w_other._utf8,
- self._len() + w_other._len())
+ self._len() + w_other._len(), flag)
@jit.look_inside_iff(lambda self, space, list_w, size:
jit.loop_unrolling_heuristic(list_w, size))
@@ -613,6 +657,7 @@ class W_UnicodeObject(W_Root):
prealloc_size = len(value) * (size - 1)
unwrapped = newlist_hint(size)
+ flag = self._get_flag()
for i in range(size):
w_s = list_w[i]
check_item = self._join_check_item(space, w_s)
@@ -625,6 +670,7 @@ class W_UnicodeObject(W_Root):
# XXX Maybe the extra copy here is okay? It was basically going to
# happen anyway, what with being placed into the builder
w_u = self.convert_arg_to_w_unicode(space, w_s)
+ flag = self._combine_flags(flag, w_u._get_flag())
unwrapped.append(w_u._utf8)
lgt += w_u._length
prealloc_size += len(unwrapped[i])
@@ -634,7 +680,7 @@ class W_UnicodeObject(W_Root):
if value and i != 0:
sb.append(value)
sb.append(unwrapped[i])
- return W_UnicodeObject(sb.build(), lgt)
+ return W_UnicodeObject(sb.build(), lgt, flag)
@unwrap_spec(keepends=bool)
def descr_splitlines(self, space, keepends=False):
@@ -663,28 +709,33 @@ class W_UnicodeObject(W_Root):
lgt += line_end_chars
assert eol >= 0
assert sol >= 0
- strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
+ # XXX we can do better with flags here, if we want to
+ strs_w.append(W_UnicodeObject(value[sol:eol], lgt, self._get_flag()))
return space.newlist(strs_w)
def descr_upper(self, space):
value = self._utf8
builder = StringBuilder(len(value))
+ flag = self._get_flag()
i = 0
while i < len(value):
uchar = rutf8.codepoint_at_pos(value, i)
+ uchar = unicodedb.toupper(uchar)
+ if uchar >= 0x80:
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
i = rutf8.next_codepoint_pos(value, i)
- rutf8.unichr_as_utf8_append(builder, unicodedb.toupper(uchar))
- return W_UnicodeObject(builder.build(), self._length)
+ rutf8.unichr_as_utf8_append(builder, uchar)
+ return W_UnicodeObject(builder.build(), self._length, flag)
@unwrap_spec(width=int)
def descr_zfill(self, space, width):
selfval = self._utf8
if len(selfval) == 0:
- return W_UnicodeObject('0' * width, width)
+ return W_UnicodeObject('0' * width, width, rutf8.FLAG_ASCII)
num_zeros = width - self._len()
if num_zeros <= 0:
# cannot return self, in case it is a subclass of str
- return W_UnicodeObject(selfval, self._len())
+ return W_UnicodeObject(selfval, self._len(), self._get_flag())
builder = StringBuilder(num_zeros + len(selfval))
if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'):
# copy sign to first position
@@ -694,7 +745,7 @@ class W_UnicodeObject(W_Root):
start = 0
builder.append_multiple_char('0', num_zeros)
builder.append_slice(selfval, start, len(selfval))
- return W_UnicodeObject(builder.build(), width)
+ return W_UnicodeObject(builder.build(), width, self._get_flag())
@unwrap_spec(maxsplit=int)
def descr_split(self, space, w_sep=None, maxsplit=-1):
@@ -753,7 +804,7 @@ class W_UnicodeObject(W_Root):
break
i += 1
byte_pos = self._index_to_byte(start + i * step)
- return W_UnicodeObject(builder.build(), sl)
+ return W_UnicodeObject(builder.build(), sl, self._get_flag())
def descr_getslice(self, space, w_start, w_stop):
start, stop = normalize_simple_slice(
@@ -770,22 +821,30 @@ class W_UnicodeObject(W_Root):
assert stop >= 0
byte_start = self._index_to_byte(start)
byte_stop = self._index_to_byte(stop)
- return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+ return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start,
+ self._get_flag())
def descr_capitalize(self, space):
value = self._utf8
if len(value) == 0:
return self._empty()
+ flag = self._get_flag()
builder = StringBuilder(len(value))
uchar = rutf8.codepoint_at_pos(value, 0)
i = rutf8.next_codepoint_pos(value, 0)
- rutf8.unichr_as_utf8_append(builder, unicodedb.toupper(uchar))
+ ch = unicodedb.toupper(uchar)
+ rutf8.unichr_as_utf8_append(builder, ch)
+ if ch >= 0x80:
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
while i < len(value):
uchar = rutf8.codepoint_at_pos(value, i)
i = rutf8.next_codepoint_pos(value, i)
- rutf8.unichr_as_utf8_append(builder, unicodedb.tolower(uchar))
- return W_UnicodeObject(builder.build(), self._len())
+ ch = unicodedb.tolower(uchar)
+ rutf8.unichr_as_utf8_append(builder, ch)
+ if ch >= 0x80:
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+ return W_UnicodeObject(builder.build(), self._len(), flag)
@unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
def descr_center(self, space, width, w_fillchar):
@@ -804,7 +863,7 @@ class W_UnicodeObject(W_Root):
centered = value
d = 0
- return W_UnicodeObject(centered, self._len() + d)
+ return W_UnicodeObject(centered, self._len() + d, self._get_flag())
def descr_count(self, space, w_sub, w_start=None, w_end=None):
value = self._utf8
@@ -830,11 +889,11 @@ class W_UnicodeObject(W_Root):
if pos < 0:
return space.newtuple([self, self._empty(), self._empty()])
else:
- lgt = rutf8.check_utf8(value, True, stop=pos)
+ lgt, _ = rutf8.check_utf8(value, True, stop=pos)
return space.newtuple(
- [W_UnicodeObject(value[0:pos], lgt), w_sub,
+ [W_UnicodeObject(value[0:pos], lgt, self._get_flag()), w_sub,
W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
- self._len() - lgt - sublen)])
+ self._len() - lgt - sublen, self._get_flag())])
def descr_rpartition(self, space, w_sub):
value = self._utf8
@@ -848,11 +907,11 @@ class W_UnicodeObject(W_Root):
if pos < 0:
return space.newtuple([self._empty(), self._empty(), self])
else:
- lgt = rutf8.check_utf8(value, True, stop=pos)
+ lgt, _ = rutf8.check_utf8(value, True, stop=pos)
return space.newtuple(
- [W_UnicodeObject(value[0:pos], lgt), w_sub,
+ [W_UnicodeObject(value[0:pos], lgt, self._get_flag()), w_sub,
W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
- self._len() - lgt - sublen)])
+ self._len() - lgt - sublen, self._get_flag())])
@unwrap_spec(count=int)
def descr_replace(self, space, w_old, w_new, count=-1):
@@ -870,8 +929,9 @@ class W_UnicodeObject(W_Root):
except OverflowError:
raise oefmt(space.w_OverflowError, "replace string is too long")
+ flag = self._combine_flags(self._get_flag(), w_by._get_flag())
newlength = self._length + replacements * (w_by._length - w_sub._length)
- return W_UnicodeObject(res, newlength)
+ return W_UnicodeObject(res, newlength, flag)
def descr_mul(self, space, w_times):
try:
@@ -883,16 +943,29 @@ class W_UnicodeObject(W_Root):
if times <= 0:
return self._empty()
if len(self._utf8) == 1:
- return W_UnicodeObject(self._utf8[0] * times, times)
- return W_UnicodeObject(self._utf8 * times, times * self._len())
+ return W_UnicodeObject(self._utf8[0] * times, times,
+ self._get_flag())
+ return W_UnicodeObject(self._utf8 * times, times * self._len(),
+ self._get_flag())
descr_rmul = descr_mul
def _get_index_storage(self):
- storage = jit.conditional_call_elidable(self._index_storage,
- rutf8.create_utf8_index_storage, self._utf8, self._length)
+ # XXX write the correct jit.elidable
+ condition = (self._index_storage == rutf8.null_storage() or
+ not bool(self._index_storage.contents))
+ if condition:
+ storage = rutf8.create_utf8_index_storage(self._utf8, self._length)
+ else:
+ storage = self._index_storage
if not jit.isconstant(self):
+ prev_storage = self._index_storage
self._index_storage = storage
+ if prev_storage == rutf8.UTF8_HAS_SURROGATES:
+ flag = rutf8.FLAG_HAS_SURROGATES
+ else:
+ flag = rutf8.FLAG_REGULAR
+ self._index_storage.flag = flag
return storage
def _getitem_result(self, space, index):
@@ -902,9 +975,19 @@ class W_UnicodeObject(W_Root):
raise oefmt(space.w_IndexError, "string index out of range")
start = self._index_to_byte(index)
end = rutf8.next_codepoint_pos(self._utf8, start)
- return W_UnicodeObject(self._utf8[start:end], 1)
+ return W_UnicodeObject(self._utf8[start:end], 1, self._get_flag())
+
+ def _is_ascii(self):
+ return self._index_storage is rutf8.UTF8_IS_ASCII
+
+ def _has_surrogates(self):
+ return (self._index_storage is rutf8.UTF8_HAS_SURROGATES or
+ (bool(self._index_storage) and
+ self._index_storage.flag == rutf8.FLAG_HAS_SURROGATES))
def _index_to_byte(self, index):
+ if self._is_ascii():
+ return index
return rutf8.codepoint_position_at_index(
self._utf8, self._get_index_storage(), index)
@@ -967,6 +1050,7 @@ class W_UnicodeObject(W_Root):
if w_fillchar._len() != 1:
raise oefmt(space.w_TypeError,
"rjust() argument 2 must be a single character")
+ flag = self._combine_flags(self._get_flag(), w_fillchar._get_flag())
d = width - lgt
if d > 0:
if len(w_fillchar._utf8) == 1:
@@ -974,9 +1058,9 @@ class W_UnicodeObject(W_Root):
value = d * w_fillchar._utf8[0] + value
else:
value = d * w_fillchar._utf8 + value
- return W_UnicodeObject(value, width)
+ return W_UnicodeObject(value, width, flag)
- return W_UnicodeObject(value, lgt)
+ return W_UnicodeObject(value, lgt, flag)
@unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
def descr_ljust(self, space, width, w_fillchar):
@@ -985,6 +1069,7 @@ class W_UnicodeObject(W_Root):
if w_fillchar._len() != 1:
raise oefmt(space.w_TypeError,
"ljust() argument 2 must be a single character")
+ flag = self._combine_flags(self._get_flag(), w_fillchar._get_flag())
d = width - self._len()
if d > 0:
if len(w_fillchar._utf8) == 1:
@@ -992,9 +1077,9 @@ class W_UnicodeObject(W_Root):
value = value + d * w_fillchar._utf8[0]
else:
value = value + d * w_fillchar._utf8
- return W_UnicodeObject(value, width)
+ return W_UnicodeObject(value, width, flag)
- return W_UnicodeObject(value, self._len())
+ return W_UnicodeObject(value, self._len(), flag)
def _utf8_sliced(self, start, stop, lgt):
assert start >= 0
@@ -1002,7 +1087,7 @@ class W_UnicodeObject(W_Root):
#if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj),
# space.w_bytes):
# return orig_obj
- return W_UnicodeObject(self._utf8[start:stop], lgt)
+ return W_UnicodeObject(self._utf8[start:stop], lgt, self._get_flag())
def _strip_none(self, space, left, right):
"internal function called by str_xstrip methods"
@@ -1050,7 +1135,7 @@ class W_UnicodeObject(W_Root):
return self._utf8_sliced(lpos, rpos, lgt)
def descr_getnewargs(self, space):
- return space.newtuple([W_UnicodeObject(self._utf8, self._length)])
+ return space.newtuple([W_UnicodeObject(self._utf8, self._length, self._get_flag())])
@@ -1135,11 +1220,11 @@ def decode_object(space, w_obj, encoding, errors):
if encoding == 'ascii':
s = space.charbuf_w(w_obj)
unicodehelper.check_ascii_or_raise(space, s)
- return space.newutf8(s, len(s))
+ return space.newutf8(s, len(s), rutf8.FLAG_ASCII)
if encoding == 'utf-8':
s = space.charbuf_w(w_obj)
- lgt = unicodehelper.check_utf8_or_raise(space, s)
- return space.newutf8(s, lgt)
+ lgt, flag = unicodehelper.check_utf8_or_raise(space, s)
+ return space.newutf8(s, lgt, flag)
w_codecs = space.getbuiltinmodule("_codecs")
w_decode = space.getattr(w_codecs, space.newtext("decode"))
if errors is None:
@@ -1194,7 +1279,7 @@ def unicode_from_string(space, w_bytes):
return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
s = space.bytes_w(w_bytes)
unicodehelper.check_ascii_or_raise(space, s)
- return W_UnicodeObject(s, len(s))
+ return W_UnicodeObject(s, len(s), rutf8.FLAG_ASCII)
class UnicodeDocstrings:
@@ -1741,7 +1826,7 @@ def _create_list_from_unicode(value):
return [s for s in value]
-W_UnicodeObject.EMPTY = W_UnicodeObject('', 0)
+W_UnicodeObject.EMPTY = W_UnicodeObject('', 0, rutf8.FLAG_ASCII)
# Helper for converting int/long
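Throughout unicodeobject.py the flag of a derived string is the combination of its inputs' flags, with surrogates being "sticky" and ASCII surviving only when every input is ASCII. A small standalone sketch of that lattice, mirroring _combine_flags() above (combine() is an illustrative name):

    from rpython.rlib import rutf8

    def combine(a, b):
        # ASCII only if both sides are ASCII ...
        if a == rutf8.FLAG_ASCII and b == rutf8.FLAG_ASCII:
            return rutf8.FLAG_ASCII
        # ... surrogates dominate everything else ...
        if rutf8.FLAG_HAS_SURROGATES in (a, b):
            return rutf8.FLAG_HAS_SURROGATES
        # ... and any other mix collapses to the regular case.
        return rutf8.FLAG_REGULAR

This is the rule applied by descr_add(), descr_join() and descr_replace(), and the reason the case-mapping methods only upgrade the flag when a mapped codepoint is >= 0x80.
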
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
index 83347029ca..f06c9fbd2b 100644
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -305,14 +305,14 @@ def _invalid_byte_2_of_4(ordch1, ordch2):
def check_utf8(s, allow_surrogates, start=0, stop=-1):
"""Check that 's' is a utf-8-encoded byte string.
- Returns the length (number of chars) and flags or raise CheckError.
+ Returns the length (number of chars) and flag or raise CheckError.
If allow_surrogates is False, then also raise if we see any.
Note also codepoints_in_utf8(), which also computes the length
faster by assuming that 's' is valid utf-8.
"""
- res, flags = _check_utf8(s, allow_surrogates, start, stop)
+ res, flag = _check_utf8(s, allow_surrogates, start, stop)
if res >= 0:
- return res, flags
+ return res, flag
raise CheckError(~res)
@jit.elidable
@@ -416,12 +416,13 @@ def surrogate_in_utf8(value):
return False
-UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
- 'utf8_loc',
- ('baseindex', lltype.Signed),
+UTF8_INDEX_STORAGE = lltype.GcStruct('utf8_loc',
('flag', lltype.Signed),
- ('ofs', lltype.FixedSizeArray(lltype.Char, 16))
- ))
+ ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct(
+ 'utf8_loc_elem',
+ ('baseindex', lltype.Signed),
+ ('ofs', lltype.FixedSizeArray(lltype.Char, 16)))
+ ))))
FLAG_REGULAR = 0
FLAG_HAS_SURROGATES = 1
@@ -429,43 +430,47 @@ FLAG_ASCII = 2
# note that we never need index storage if we're pure ascii, but it's useful
# for passing into W_UnicodeObject.__init__
-ASCII_INDEX_STORAGE_BLOCKS = 5
-ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE,
- ASCII_INDEX_STORAGE_BLOCKS,
- immortal=True)
-for _i in range(ASCII_INDEX_STORAGE_BLOCKS):
- ASCII_INDEX_STORAGE[_i].baseindex = _i * 64
- for _j in range(16):
- ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
+#ASCII_INDEX_STORAGE_BLOCKS = 5
+#ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO,
+# ASCII_INDEX_STORAGE_BLOCKS,
+# immortal=True)
+#for _i in range(ASCII_INDEX_STORAGE_BLOCKS):
+# ASCII_INDEX_STORAGE[_i].baseindex = _i * 64
+# for _j in range(16):
+# ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
def null_storage():
return lltype.nullptr(UTF8_INDEX_STORAGE)
-UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True)
-UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True)
+UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True)
+UTF8_IS_ASCII.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO)
+UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True)
+UTF8_HAS_SURROGATES.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO)
def create_utf8_index_storage(utf8, utf8len):
""" Create an index storage which stores index of each 4th character
in utf8 encoded unicode string.
"""
- if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
- return ASCII_INDEX_STORAGE
+# if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
+# return ASCII_INDEX_STORAGE
arraysize = utf8len // 64 + 1
- storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
+ storage = lltype.malloc(UTF8_INDEX_STORAGE)
+ contents = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO, arraysize)
+ storage.contents = contents
baseindex = 0
current = 0
while True:
- storage[current].baseindex = baseindex
+ contents[current].baseindex = baseindex
next = baseindex
for i in range(16):
if utf8len == 0:
next += 1 # assume there is an extra '\x00' character
else:
next = next_codepoint_pos(utf8, next)
- storage[current].ofs[i] = chr(next - baseindex)
+ contents[current].ofs[i] = chr(next - baseindex)
utf8len -= 4
if utf8len < 0:
- assert current + 1 == len(storage)
+ assert current + 1 == len(contents)
break
next = next_codepoint_pos(utf8, next)
next = next_codepoint_pos(utf8, next)
@@ -485,8 +490,8 @@ def codepoint_position_at_index(utf8, storage, index):
this function.
"""
current = index >> 6
- ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
- bytepos = storage[current].baseindex + ofs
+ ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F])
+ bytepos = storage.contents[current].baseindex + ofs
index &= 0x3
if index == 0:
return prev_codepoint_pos(utf8, bytepos)
@@ -504,8 +509,8 @@ def codepoint_at_index(utf8, storage, index):
storage of type UTF8_INDEX_STORAGE
"""
current = index >> 6
- ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
- bytepos = storage[current].baseindex + ofs
+ ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F])
+ bytepos = storage.contents[current].baseindex + ofs
index &= 0x3
if index == 0:
return codepoint_before_pos(utf8, bytepos)
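
With the GcStruct layout above, one contents element covers 64 codepoints and records the byte offset of every 4th codepoint relative to its baseindex, so an index lookup is a couple of shifts plus at most a few codepoint steps. A plain-Python model of the arithmetic in codepoint_position_at_index(); contents stands for storage.contents, and the dict access is only a stand-in for the lltype fields:

    def approx_byte_position(contents, index):
        elem = contents[index >> 6]              # which 64-codepoint block
        ofs = elem['ofs'][(index >> 2) & 0x0F]   # every 4th codepoint is recorded
        bytepos = elem['baseindex'] + ofs
        # The remaining (index & 0x3) codepoints are then walked with
        # next_codepoint_pos()/prev_codepoint_pos() in the real code.
        return bytepos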