Skip to content

gh-129173: Use _PyUnicodeError_GetParams in PyCodec_NameReplaceErrors #129135

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 8, 2025
234 changes: 123 additions & 111 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,60 @@ wrong_exception_type(PyObject *exc)
PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)


// --- codecs handlers: utilities ---------------------------------------------

/*
* Return the number of characters (including special prefixes)
* needed to represent 'ch' by codec_handler_write_unicode_hex().
*/
static inline Py_ssize_t
codec_handler_unicode_hex_width(Py_UCS4 ch)
{
if (ch >= 0x10000) {
// format: '\\' + 'U' + 8 hex digits
return 1 + 1 + 8;
}
else if (ch >= 0x100) {
// format: '\\' + 'u' + 4 hex digits
return 1 + 1 + 4;
}
else {
// format: '\\' + 'x' + 2 hex digits
return 1 + 1 + 2;
}
}


/*
* Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
* using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
*/
static inline void
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
{
*(*p)++ = '\\';
if (ch >= 0x10000) {
*(*p)++ = 'U';
*(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
*(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
*(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
*(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
*(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
*(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
}
else if (ch >= 0x100) {
*(*p)++ = 'u';
*(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
*(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
}
else {
*(*p)++ = 'x';
}
*(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
*(*p)++ = Py_hexdigits[ch & 0xf];
}


// --- handler: 'strict' ------------------------------------------------------

PyObject *PyCodec_StrictErrors(PyObject *exc)
Expand Down Expand Up @@ -942,17 +996,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)

Py_ssize_t ressize = 0;
for (Py_ssize_t i = start; i < end; ++i) {
/* object is guaranteed to be "ready" */
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
if (c >= 0x10000) {
ressize += 1 + 1 + 8;
}
else if (c >= 0x100) {
ressize += 1 + 1 + 4;
}
else {
ressize += 1 + 1 + 2;
}
ressize += codec_handler_unicode_hex_width(c);
}
PyObject *res = PyUnicode_New(ressize, 127);
if (res == NULL) {
Expand All @@ -962,122 +1007,86 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
for (Py_ssize_t i = start; i < end; ++i) {
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
*outp++ = '\\';
if (c >= 0x00010000) {
*outp++ = 'U';
*outp++ = Py_hexdigits[(c >> 28) & 0xf];
*outp++ = Py_hexdigits[(c >> 24) & 0xf];
*outp++ = Py_hexdigits[(c >> 20) & 0xf];
*outp++ = Py_hexdigits[(c >> 16) & 0xf];
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
}
else if (c >= 0x100) {
*outp++ = 'u';
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
}
else {
*outp++ = 'x';
}
*outp++ = Py_hexdigits[(c >> 4) & 0xf];
*outp++ = Py_hexdigits[c & 0xf];
codec_handler_write_unicode_hex(&outp, c);
}
assert(_PyUnicode_CheckConsistency(res, 1));
Py_DECREF(obj);
return Py_BuildValue("(Nn)", res, end);
}


// --- handler: 'namereplace' -------------------------------------------------

PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
PyObject *restuple;
PyObject *object;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
Py_UCS1 *outp;
Py_ssize_t ressize;
int replsize;
Py_UCS4 c;
char buffer[256]; /* NAME_MAXLEN */
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
_PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
if (ucnhash_capi == NULL) {
return NULL;
if (!_PyIsUnicodeEncodeError(exc)) {
wrong_exception_type(exc);
return NULL;
}

_PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
if (ucnhash_capi == NULL) {
return NULL;
}

PyObject *obj;
Py_ssize_t start, end;
if (_PyUnicodeError_GetParams(exc,
&obj, NULL,
&start, &end, NULL, false) < 0)
{
return NULL;
}

char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
Py_ssize_t imax = start, ressize = 0, replsize;
for (; imax < end; ++imax) {
Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
// If 'c' is recognized by getname(), the corresponding replacement
// is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
// characters. Failures of getname() are ignored by the handler.
replsize = 1 + 1 + 1 + strlen(buffer) + 1;
}
for (i = start, ressize = 0; i < end; ++i) {
/* object is guaranteed to be "ready" */
c = PyUnicode_READ_CHAR(object, i);
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
replsize = 1+1+1+(int)strlen(buffer)+1;
}
else if (c >= 0x10000) {
replsize = 1+1+8;
}
else if (c >= 0x100) {
replsize = 1+1+4;
}
else
replsize = 1+1+2;
if (ressize > PY_SSIZE_T_MAX - replsize)
break;
ressize += replsize;
else {
replsize = codec_handler_unicode_hex_width(c);
}
end = i;
res = PyUnicode_New(ressize, 127);
if (res==NULL)
return NULL;
for (i = start, outp = PyUnicode_1BYTE_DATA(res);
i < end; ++i) {
c = PyUnicode_READ_CHAR(object, i);
*outp++ = '\\';
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
*outp++ = 'N';
*outp++ = '{';
strcpy((char *)outp, buffer);
outp += strlen(buffer);
*outp++ = '}';
continue;
}
if (c >= 0x00010000) {
*outp++ = 'U';
*outp++ = Py_hexdigits[(c>>28)&0xf];
*outp++ = Py_hexdigits[(c>>24)&0xf];
*outp++ = Py_hexdigits[(c>>20)&0xf];
*outp++ = Py_hexdigits[(c>>16)&0xf];
*outp++ = Py_hexdigits[(c>>12)&0xf];
*outp++ = Py_hexdigits[(c>>8)&0xf];
}
else if (c >= 0x100) {
*outp++ = 'u';
*outp++ = Py_hexdigits[(c>>12)&0xf];
*outp++ = Py_hexdigits[(c>>8)&0xf];
}
else
*outp++ = 'x';
*outp++ = Py_hexdigits[(c>>4)&0xf];
*outp++ = Py_hexdigits[c&0xf];
if (ressize > PY_SSIZE_T_MAX - replsize) {
break;
}

assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
assert(_PyUnicode_CheckConsistency(res, 1));
restuple = Py_BuildValue("(Nn)", res, end);
Py_DECREF(object);
return restuple;
ressize += replsize;
}
else {
wrong_exception_type(exc);

PyObject *res = PyUnicode_New(ressize, 127);
if (res == NULL) {
Py_DECREF(obj);
return NULL;
}

Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
for (Py_ssize_t i = start; i < imax; ++i) {
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
*outp++ = '\\';
*outp++ = 'N';
*outp++ = '{';
(void)strcpy((char *)outp, buffer);
outp += strlen(buffer);
*outp++ = '}';
}
else {
codec_handler_write_unicode_hex(&outp, c);
}
}

assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
assert(_PyUnicode_CheckConsistency(res, 1));
PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
Py_DECREF(obj);
return restuple;
}


#define ENC_UNKNOWN -1
#define ENC_UTF8 0
#define ENC_UTF16BE 1
Expand Down Expand Up @@ -1421,11 +1430,14 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
return PyCodec_BackslashReplaceErrors(exc);
}

static PyObject *namereplace_errors(PyObject *self, PyObject *exc)

static inline PyObject *
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
return PyCodec_NameReplaceErrors(exc);
}


static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
{
return PyCodec_SurrogatePassErrors(exc);
Expand Down
Loading