Skip to content

Commit 3d4226a

Browse files
authored
bpo-34523: Support surrogatepass in locale codecs (pythonGH-8995)
Add support for the "surrogatepass" error handler in PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() for the UTF-8 encoding.

Changes:

* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now take a _Py_error_handler enum value instead of an "int surrogateescape" flag to select the error handler. These functions now return -3 if the error handler is unknown.
* Add unit tests for _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it as a private function.
* _freeze_importlib no longer needs the config.filesystem_errors="strict" workaround.
1 parent c5989cd commit 3d4226a

File tree

7 files changed

+421
-115
lines changed

7 files changed

+421
-115
lines changed

Include/fileutils.h

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,24 @@
55
extern "C" {
66
#endif
77

8+
9+
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
10+
/* Identifier of a codec error handler.
 *
 * _Py_GetErrorHandler() maps an error handler name (a C string) to one of
 * these values.  The enumerator names mirror the Python-level handler names;
 * NOTE(review): the exact name -> value mapping lives in
 * _Py_GetErrorHandler(), which is not visible here -- confirm there.
 */
typedef enum {
    /* Not resolved yet: callers seeing this value look the handler up
       with _Py_GetErrorHandler(). */
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,            /* "strict" */
    _Py_ERROR_SURROGATEESCAPE,   /* "surrogateescape" */
    _Py_ERROR_REPLACE,           /* presumably "replace" */
    _Py_ERROR_IGNORE,            /* presumably "ignore" */
    _Py_ERROR_BACKSLASHREPLACE,  /* presumably "backslashreplace" */
    _Py_ERROR_SURROGATEPASS,     /* "surrogatepass" */
    _Py_ERROR_XMLCHARREFREPLACE, /* presumably "xmlcharrefreplace" */
    /* presumably any other handler name -- TODO confirm */
    _Py_ERROR_OTHER
} _Py_error_handler;
21+
22+
PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
23+
#endif
24+
25+
826
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
927
PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
1028
const char *arg,
@@ -26,35 +44,38 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
2644
wchar_t **wstr,
2745
size_t *wlen,
2846
const char **reason,
29-
int surrogateescape);
47+
_Py_error_handler errors);
3048

3149
PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
3250
const wchar_t *text,
3351
char **str,
3452
size_t *error_pos,
3553
const char **reason,
3654
int raw_malloc,
37-
int surrogateescape);
55+
_Py_error_handler errors);
3856

3957
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
4058
const char *arg,
4159
Py_ssize_t arglen);
60+
#endif
61+
4262

63+
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
4364
PyAPI_FUNC(int) _Py_DecodeLocaleEx(
4465
const char *arg,
4566
wchar_t **wstr,
4667
size_t *wlen,
4768
const char **reason,
4869
int current_locale,
49-
int surrogateescape);
70+
_Py_error_handler errors);
5071

5172
PyAPI_FUNC(int) _Py_EncodeLocaleEx(
5273
const wchar_t *text,
5374
char **str,
5475
size_t *error_pos,
5576
const char **reason,
5677
int current_locale,
57-
int surrogateescape);
78+
_Py_error_handler errors);
5879
#endif
5980

6081
#ifndef Py_LIMITED_API

Lib/test/test_codecs.py

Lines changed: 113 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99

1010
from test import support
1111

12+
try:
13+
import _testcapi
14+
except ImportError as exc:
15+
_testcapi = None
16+
1217
try:
1318
import ctypes
1419
except ImportError:
@@ -2051,13 +2056,12 @@ def test_basics(self):
20512056

20522057
@support.cpython_only
20532058
def test_basics_capi(self):
2054-
from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
20552059
s = "abc123" # all codecs should be able to encode these
20562060
for encoding in all_unicode_encodings:
20572061
if encoding not in broken_unicode_with_stateful:
20582062
# check incremental decoder/encoder (fetched via the C API)
20592063
try:
2060-
cencoder = codec_incrementalencoder(encoding)
2064+
cencoder = _testcapi.codec_incrementalencoder(encoding)
20612065
except LookupError: # no IncrementalEncoder
20622066
pass
20632067
else:
@@ -2066,7 +2070,7 @@ def test_basics_capi(self):
20662070
for c in s:
20672071
encodedresult += cencoder.encode(c)
20682072
encodedresult += cencoder.encode("", True)
2069-
cdecoder = codec_incrementaldecoder(encoding)
2073+
cdecoder = _testcapi.codec_incrementaldecoder(encoding)
20702074
decodedresult = ""
20712075
for c in encodedresult:
20722076
decodedresult += cdecoder.decode(bytes([c]))
@@ -2077,12 +2081,12 @@ def test_basics_capi(self):
20772081
if encoding not in ("idna", "mbcs"):
20782082
# check incremental decoder/encoder with errors argument
20792083
try:
2080-
cencoder = codec_incrementalencoder(encoding, "ignore")
2084+
cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
20812085
except LookupError: # no IncrementalEncoder
20822086
pass
20832087
else:
20842088
encodedresult = b"".join(cencoder.encode(c) for c in s)
2085-
cdecoder = codec_incrementaldecoder(encoding, "ignore")
2089+
cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
20862090
decodedresult = "".join(cdecoder.decode(bytes([c]))
20872091
for c in encodedresult)
20882092
self.assertEqual(decodedresult, s,
@@ -3263,5 +3267,109 @@ def test_decode(self):
32633267
self.assertEqual(data.decode('latin1'), expected)
32643268

32653269

3270+
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().

    The C functions are reached through the _testcapi.EncodeLocaleEx() and
    _testcapi.DecodeLocaleEx() wrappers.  Their results are compared with
    str.encode() and bytes.decode() for the filesystem encoding.
    """
    ENCODING = sys.getfilesystemencoding()
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* with _testcapi.EncodeLocaleEx(text, 0, errors)."""
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Check EncodeLocaleEx() against str.encode() for every STRINGS item.

        When str.encode() rejects a string, the C encoder must reject the
        very same string with the very same error handler.
        """
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    # Encode the string under test with the handler under
                    # test; the error position depends on the string, so
                    # accept any numeric position.
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    self.assertRegex(
                        errmsg, r"^encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string first: the wrapper raises
        # ValueError('unsupported error handler') when the encoder cannot
        # handle surrogatepass, in which case the test is skipped.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* with _testcapi.DecodeLocaleEx(encoded, 0, errors)."""
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Check DecodeLocaleEx() against bytes.decode() on a set of inputs.

        The inputs are BYTES_STRINGS plus every STRINGS item that can be
        encoded to the filesystem encoding.
        """
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                encoded = None

            if is_utf8:
                # Also exercise the surrogatepass byte form when it differs
                # from the surrogateescape one.
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Same probe as the encoder test: skip when the decoder reports
        # that surrogatepass is unsupported.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")
3372+
3373+
32663374
if __name__ == "__main__":
32673375
unittest.main()

Modules/_testcapimodule.c

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args)
45504550
}
45514551

45524552

4553+
/* Test helper exposed as _testcapi.EncodeLocaleEx(text[, current_locale[, errors]]).
 *
 * Encode the str `text` with _Py_EncodeLocaleEx() and return the result as
 * a bytes object.  `errors` is an error handler name converted with
 * _Py_GetErrorHandler(); `current_locale` is passed through unchanged.
 *
 * The return code of _Py_EncodeLocaleEx() is mapped to:
 *    0 -> bytes result
 *   -1 -> MemoryError
 *   -2 -> RuntimeError("encode error: pos=..., reason=...")
 *   -3 -> ValueError("unsupported error handler")
 *   anything else -> ValueError("unknown error code")
 */
static PyObject *
encode_locale_ex(PyObject *self, PyObject *args)
{
    PyObject *unicode;
    int current_locale = 0;
    wchar_t *wstr;
    PyObject *res = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) {
        return NULL;
    }
    wstr = PyUnicode_AsWideCharString(unicode, NULL);
    if (wstr == NULL) {
        return NULL;
    }
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    char *str = NULL;
    size_t error_pos;
    const char *reason = NULL;
    int ret = _Py_EncodeLocaleEx(wstr,
                                 &str, &error_pos, &reason,
                                 current_locale, error_handler);
    /* The wide string is only an input: release it before reporting. */
    PyMem_Free(wstr);

    switch(ret) {
    case 0:
        res = PyBytes_FromString(str);
        PyMem_RawFree(str);
        break;
    case -1:
        PyErr_NoMemory();
        break;
    case -2:
        PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s",
                     error_pos, reason);
        break;
    case -3:
        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "unknown error code");
        break;
    }
    return res;
}
4600+
4601+
4602+
/* Test helper exposed as _testcapi.DecodeLocaleEx(bytes[, current_locale[, errors]]).
 *
 * Decode the byte string with _Py_DecodeLocaleEx() and return the result as
 * a str object.  `errors` is an error handler name converted with
 * _Py_GetErrorHandler(); `current_locale` is passed through unchanged.
 *
 * The return code of _Py_DecodeLocaleEx() is mapped to:
 *    0 -> str result
 *   -1 -> MemoryError
 *   -2 -> RuntimeError("decode error: pos=..., reason=...")
 *   -3 -> ValueError("unsupported error handler")
 *   anything else -> ValueError("unknown error code")
 */
static PyObject *
decode_locale_ex(PyObject *self, PyObject *args)
{
    char *str;
    int current_locale = 0;
    PyObject *res = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) {
        return NULL;
    }
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    wchar_t *wstr = NULL;
    size_t wlen = 0;
    const char *reason = NULL;
    int ret = _Py_DecodeLocaleEx(str,
                                 &wstr, &wlen, &reason,
                                 current_locale, error_handler);

    switch(ret) {
    case 0:
        res = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_RawFree(wstr);
        break;
    case -1:
        PyErr_NoMemory();
        break;
    case -2:
        /* wlen is reported as the position in the error message.
           NOTE(review): presumably _Py_DecodeLocaleEx() stores the error
           position in wlen on failure -- confirm against its definition. */
        PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s",
                     wlen, reason);
        break;
    case -3:
        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "unknown error code");
        break;
    }
    return res;
}
4643+
4644+
45534645
static PyMethodDef TestMethods[] = {
45544646
{"raise_exception", raise_exception, METH_VARARGS},
45554647
{"raise_memoryerror", raise_memoryerror, METH_NOARGS},
@@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = {
47714863
{"get_mapping_items", get_mapping_items, METH_O},
47724864
{"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS},
47734865
{"hamt", new_hamt, METH_NOARGS},
4866+
{"EncodeLocaleEx", encode_locale_ex, METH_VARARGS},
4867+
{"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
47744868
{NULL, NULL} /* sentinel */
47754869
};
47764870

Objects/stringlib/codecs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
313313
Py_ssize_t startpos, endpos, newpos;
314314
Py_ssize_t k;
315315
if (error_handler == _Py_ERROR_UNKNOWN) {
316-
error_handler = get_error_handler(errors);
316+
error_handler = _Py_GetErrorHandler(errors);
317317
}
318318

319319
startpos = i-1;

0 commit comments

Comments
 (0)