Skip to content

bpo-34523: Support surrogatepass in locale codecs #8995

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions Include/fileutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@
extern "C" {
#endif


#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
/* Identifiers for codec error handlers.
 *
 * _Py_GetErrorHandler() yields one of these values for an error handler name.
 * _Py_ERROR_UNKNOWN is deliberately zero: code such as the UTF-8 encoder
 * treats it as "not resolved yet" and calls _Py_GetErrorHandler() lazily the
 * first time it sees that value.
 *
 * The numeric values are spelled out so that the numbering is visible at a
 * glance; they are the same values the compiler would assign implicitly.
 */
typedef enum {
    _Py_ERROR_UNKNOWN           = 0,
    _Py_ERROR_STRICT            = 1,
    _Py_ERROR_SURROGATEESCAPE   = 2,
    _Py_ERROR_REPLACE           = 3,
    _Py_ERROR_IGNORE            = 4,
    _Py_ERROR_BACKSLASHREPLACE  = 5,
    _Py_ERROR_SURROGATEPASS     = 6,
    _Py_ERROR_XMLCHARREFREPLACE = 7,
    _Py_ERROR_OTHER             = 8
} _Py_error_handler;

/* Translate the error handler name `errors` into a _Py_error_handler value.
   NOTE(review): the name-to-value mapping itself is implemented outside this
   header; callers in _testcapimodule.c pass NULL when no name was supplied,
   so NULL is presumably accepted -- confirm against the definition. */
PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
#endif


#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
const char *arg,
Expand All @@ -26,35 +44,38 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
wchar_t **wstr,
size_t *wlen,
const char **reason,
int surrogateescape);
_Py_error_handler errors);

/* Encode the wide character string `text` to UTF-8 (per the function name),
   applying the error handler selected by `errors`.

   NOTE(review): this span comes from a rendered diff, so both the removed
   last parameter (`int surrogateescape`) and its replacement
   (`_Py_error_handler errors`) are present; only the latter belongs to the
   current signature.
   NOTE(review): result codes, the meaning of `raw_malloc` and the ownership
   of `*str` are not visible here -- presumably they mirror
   _Py_EncodeLocaleEx(); confirm against the implementation. */
PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
const wchar_t *text,
char **str,
size_t *error_pos,
const char **reason,
int raw_malloc,
int surrogateescape);
_Py_error_handler errors);

/* Decode `arglen` bytes starting at `arg` and return a wide character string;
   per the function name, the input is UTF-8 and the surrogateescape error
   handler is used.
   NOTE(review): the allocator used for the result and the value returned on
   failure are not visible here -- confirm against the implementation. */
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
const char *arg,
Py_ssize_t arglen);
#endif


#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
/* Decode the byte string `arg` into a wide character string, applying the
   error handler selected by `errors`.

   Result codes, as interpreted by decode_locale_ex() in _testcapimodule.c:
      0: success; `*wstr` holds `*wlen` wide characters and is released by
         that caller with PyMem_RawFree()
     -1: reported by that caller as a memory error
     -2: decoding error; that caller reads the error position from `*wlen`
         and a message from `*reason`
     -3: unsupported error handler

   NOTE(review): this span comes from a rendered diff, so both the removed
   last parameter (`int surrogateescape`) and its replacement
   (`_Py_error_handler errors`) are present; only the latter belongs to the
   current signature.
   NOTE(review): the exact meaning of `current_locale` is not visible here;
   the test wrapper always passes 0 -- confirm against the implementation. */
PyAPI_FUNC(int) _Py_DecodeLocaleEx(
const char *arg,
wchar_t **wstr,
size_t *wlen,
const char **reason,
int current_locale,
int surrogateescape);
_Py_error_handler errors);

/* Encode the wide character string `text` into a byte string, applying the
   error handler selected by `errors`.

   Result codes, as interpreted by encode_locale_ex() in _testcapimodule.c:
      0: success; `*str` is a NUL-terminated byte string that the caller
         releases with PyMem_RawFree()
     -1: reported by that caller as a memory error
     -2: encoding error; `*error_pos` and `*reason` describe the failure
     -3: unsupported error handler

   NOTE(review): this span comes from a rendered diff, so both the removed
   last parameter (`int surrogateescape`) and its replacement
   (`_Py_error_handler errors`) are present; only the latter belongs to the
   current signature.
   NOTE(review): the exact meaning of `current_locale` is not visible here;
   the test wrapper always passes 0 -- confirm against the implementation. */
PyAPI_FUNC(int) _Py_EncodeLocaleEx(
const wchar_t *text,
char **str,
size_t *error_pos,
const char **reason,
int current_locale,
int surrogateescape);
_Py_error_handler errors);
#endif

#ifndef Py_LIMITED_API
Expand Down
118 changes: 113 additions & 5 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@

from test import support

try:
import _testcapi
except ImportError as exc:
_testcapi = None

try:
import ctypes
except ImportError:
Expand Down Expand Up @@ -2051,13 +2056,12 @@ def test_basics(self):

@support.cpython_only
def test_basics_capi(self):
from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
s = "abc123" # all codecs should be able to encode these
for encoding in all_unicode_encodings:
if encoding not in broken_unicode_with_stateful:
# check incremental decoder/encoder (fetched via the C API)
try:
cencoder = codec_incrementalencoder(encoding)
cencoder = _testcapi.codec_incrementalencoder(encoding)
except LookupError: # no IncrementalEncoder
pass
else:
Expand All @@ -2066,7 +2070,7 @@ def test_basics_capi(self):
for c in s:
encodedresult += cencoder.encode(c)
encodedresult += cencoder.encode("", True)
cdecoder = codec_incrementaldecoder(encoding)
cdecoder = _testcapi.codec_incrementaldecoder(encoding)
decodedresult = ""
for c in encodedresult:
decodedresult += cdecoder.decode(bytes([c]))
Expand All @@ -2077,12 +2081,12 @@ def test_basics_capi(self):
if encoding not in ("idna", "mbcs"):
# check incremental decoder/encoder with errors argument
try:
cencoder = codec_incrementalencoder(encoding, "ignore")
cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
except LookupError: # no IncrementalEncoder
pass
else:
encodedresult = b"".join(cencoder.encode(c) for c in s)
cdecoder = codec_incrementaldecoder(encoding, "ignore")
cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
decodedresult = "".join(cdecoder.decode(bytes([c]))
for c in encodedresult)
self.assertEqual(decodedresult, s,
Expand Down Expand Up @@ -3263,5 +3267,109 @@ def test_decode(self):
self.assertEqual(data.decode('latin1'), expected)


@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().

    The C functions are reached through _testcapi.EncodeLocaleEx() and
    _testcapi.DecodeLocaleEx(); their results are compared with what
    str.encode() and bytes.decode() produce for the filesystem encoding.
    """
    ENCODING = sys.getfilesystemencoding()
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* through the C helper (current_locale=0)."""
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Check that the C encoder agrees with str.encode() for *errors*.

        When str.encode() rejects a string, the C helper must fail for that
        very same string and error handler with a RuntimeError whose message
        reports the error position and reason.
        """
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        # Exercise the string under test with the handler
                        # under test, not a fixed surrogate string with the
                        # default handler.
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    # The failing character may be anywhere in the string, so
                    # accept any position rather than only pos=0.
                    self.assertRegex(
                        errmsg, r"^encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string first: the helper raises ValueError when
        # the encoder does not support the surrogatepass handler.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              "surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* through the C helper (current_locale=0)."""
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Check that the C decoder agrees with bytes.decode() for *errors*.

        The inputs are BYTES_STRINGS plus every encodable form of STRINGS.
        """
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                encoded = None

            if is_utf8:
                # Also feed the decoder the surrogatepass byte form when it
                # differs from the surrogateescape one.
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "),
                                    errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Probe with empty input first: the helper raises ValueError when the
        # decoder does not support the surrogatepass handler.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              "surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")


if __name__ == "__main__":
unittest.main()
94 changes: 94 additions & 0 deletions Modules/_testcapimodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args)
}


/* _testcapi.EncodeLocaleEx(text[, current_locale[, errors]]) -> bytes
 *
 * Thin test wrapper around _Py_EncodeLocaleEx().  `text` is converted to a
 * wide character string, `errors` (NULL when omitted) is translated with
 * _Py_GetErrorHandler(), and the result code is mapped as follows:
 *     0: return the encoded bytes
 *    -1: raise MemoryError
 *    -2: raise RuntimeError("encode error: pos=..., reason=...")
 *    -3: raise ValueError("unsupported error handler")
 *    anything else: raise ValueError("unknown error code")
 */
static PyObject *
encode_locale_ex(PyObject *self, PyObject *args)
{
    PyObject *unicode;
    int current_locale = 0;
    wchar_t *wstr;
    PyObject *res = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) {
        return NULL;
    }
    wstr = PyUnicode_AsWideCharString(unicode, NULL);
    if (wstr == NULL) {
        return NULL;
    }
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    char *str = NULL;
    size_t error_pos = 0;
    const char *reason = NULL;
    int ret = _Py_EncodeLocaleEx(wstr,
                                 &str, &error_pos, &reason,
                                 current_locale, error_handler);
    /* The wide copy is no longer needed, whatever the outcome. */
    PyMem_Free(wstr);

    switch(ret) {
    case 0:
        res = PyBytes_FromString(str);
        /* The encoded buffer is released with the raw allocator. */
        PyMem_RawFree(str);
        break;
    case -1:
        PyErr_NoMemory();
        break;
    case -2:
        PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s",
                     error_pos, reason);
        break;
    case -3:
        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "unknown error code");
        break;
    }
    return res;
}


/* _testcapi.DecodeLocaleEx(data[, current_locale[, errors]]) -> str
 *
 * Thin test wrapper around _Py_DecodeLocaleEx().  `errors` (NULL when
 * omitted) is translated with _Py_GetErrorHandler(), and the result code is
 * mapped as follows:
 *     0: return the decoded string
 *    -1: raise MemoryError
 *    -2: raise RuntimeError("decode error: pos=..., reason=...")
 *    -3: raise ValueError("unsupported error handler")
 *    anything else: raise ValueError("unknown error code")
 */
static PyObject *
decode_locale_ex(PyObject *self, PyObject *args)
{
    char *str;
    int current_locale = 0;
    PyObject *res = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) {
        return NULL;
    }
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    wchar_t *wstr = NULL;
    size_t wlen = 0;
    const char *reason = NULL;
    int ret = _Py_DecodeLocaleEx(str,
                                 &wstr, &wlen, &reason,
                                 current_locale, error_handler);

    switch(ret) {
    case 0:
        res = PyUnicode_FromWideChar(wstr, wlen);
        /* The decoded buffer is released with the raw allocator. */
        PyMem_RawFree(wstr);
        break;
    case -1:
        PyErr_NoMemory();
        break;
    case -2:
        /* On a decoding error, wlen is reported as the error position. */
        PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s",
                     wlen, reason);
        break;
    case -3:
        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "unknown error code");
        break;
    }
    return res;
}


static PyMethodDef TestMethods[] = {
{"raise_exception", raise_exception, METH_VARARGS},
{"raise_memoryerror", raise_memoryerror, METH_NOARGS},
Expand Down Expand Up @@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = {
{"get_mapping_items", get_mapping_items, METH_O},
{"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS},
{"hamt", new_hamt, METH_NOARGS},
{"EncodeLocaleEx", encode_locale_ex, METH_VARARGS},
{"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
{NULL, NULL} /* sentinel */
};

Expand Down
2 changes: 1 addition & 1 deletion Objects/stringlib/codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_ssize_t startpos, endpos, newpos;
Py_ssize_t k;
if (error_handler == _Py_ERROR_UNKNOWN) {
error_handler = get_error_handler(errors);
error_handler = _Py_GetErrorHandler(errors);
}

startpos = i-1;
Expand Down
Loading