Skip to content

gh-129173: Use _PyUnicodeError_GetParams in PyCodec_SurrogatePassErrors #129134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 14, 2025
282 changes: 162 additions & 120 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1095,7 +1095,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
#define ENC_UTF32LE 4

static int
get_standard_encoding(const char *encoding, int *bytelength)
get_standard_encoding_impl(const char *encoding, int *bytelength)
{
if (Py_TOLOWER(encoding[0]) == 'u' &&
Py_TOLOWER(encoding[1]) == 't' &&
Expand Down Expand Up @@ -1153,172 +1153,212 @@ get_standard_encoding(const char *encoding, int *bytelength)
return ENC_UNKNOWN;
}

/* This handler is declared static until someone demonstrates
a need to call it directly. */

static int
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
{
const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
if (encoding_cstr == NULL) {
return -1;
}
*code = get_standard_encoding_impl(encoding_cstr, bytelength);
return 0;
}


// --- handler: 'surrogatepass' -----------------------------------------------

static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
PyObject *encode;
const char *encoding;
int code;
int bytelength;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
if (encoding == NULL) {
return NULL;
}
int code, bytelength;
int rc = get_standard_encoding(encoding, &code, &bytelength);
Py_DECREF(encoding);
if (rc < 0) {
return NULL;
}
if (code == ENC_UNKNOWN) {
goto bail;
}

if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
unsigned char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
if (code == ENC_UNKNOWN) {
/* Not supported, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(object);
return NULL;
}
PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, false) < 0)
{
return NULL;
}

if (end - start > PY_SSIZE_T_MAX / bytelength)
end = start + PY_SSIZE_T_MAX / bytelength;
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
if (!res) {
Py_DECREF(object);
return NULL;
if (slen > PY_SSIZE_T_MAX / bytelength) {
end = start + PY_SSIZE_T_MAX / bytelength;
end = Py_MIN(end, objlen);
slen = Py_MAX(0, end - start);
}

PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
if (res == NULL) {
Py_DECREF(obj);
return NULL;
}

unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
for (Py_ssize_t i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* Not a surrogate, fail with original exception */
Py_DECREF(obj);
Py_DECREF(res);
goto bail;
}
outp = (unsigned char*)PyBytes_AsString(res);
for (i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* Not a surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(res);
Py_DECREF(object);
return NULL;
}
switch (code) {
case ENC_UTF8:
switch (code) {
case ENC_UTF8: {
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
break;
case ENC_UTF16LE:
*outp++ = (unsigned char) ch;
}
case ENC_UTF16LE: {
*outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
break;
case ENC_UTF16BE:
}
case ENC_UTF16BE: {
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)ch;
break;
case ENC_UTF32LE:
*outp++ = (unsigned char) ch;
}
case ENC_UTF32LE: {
*outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 24);
break;
case ENC_UTF32BE:
}
case ENC_UTF32BE: {
*outp++ = (unsigned char)(ch >> 24);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)ch;
break;
}
}
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
Py_DECREF(object);
return restuple;
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
const unsigned char *p;
Py_UCS4 ch = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeDecodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL;
p = (const unsigned char*)PyBytes_AS_STRING(object);
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
if (code == ENC_UNKNOWN) {
/* Not supported, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(object);
return NULL;
}

/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
p += start;
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
switch (code) {
case ENC_UTF8:
Py_DECREF(obj);
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
return restuple;

bail:
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}


static PyObject *
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
{
PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
if (encoding == NULL) {
return NULL;
}
int code, bytelength;
int rc = get_standard_encoding(encoding, &code, &bytelength);
Py_DECREF(encoding);
if (rc < 0) {
return NULL;
}
if (code == ENC_UNKNOWN) {
goto bail;
}

PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, true) < 0)
{
return NULL;
}

/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
Py_UCS4 ch = 0;
const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
p += start;

if (objlen - start >= bytelength) {
switch (code) {
case ENC_UTF8: {
if ((p[0] & 0xf0) == 0xe0 &&
(p[1] & 0xc0) == 0x80 &&
(p[2] & 0xc0) == 0x80) {
(p[2] & 0xc0) == 0x80)
{
/* it's a three-byte code */
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
ch = ((p[0] & 0x0f) << 12) +
((p[1] & 0x3f) << 6) +
(p[2] & 0x3f);
}
break;
case ENC_UTF16LE:
}
case ENC_UTF16LE: {
ch = p[1] << 8 | p[0];
break;
case ENC_UTF16BE:
}
case ENC_UTF16BE: {
ch = p[0] << 8 | p[1];
break;
case ENC_UTF32LE:
}
case ENC_UTF32LE: {
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
break;
case ENC_UTF32BE:
}
case ENC_UTF32BE: {
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
break;
}
}
}
Py_DECREF(obj);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
goto bail;
}

Py_DECREF(object);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* it's not a surrogate - fail */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}
res = PyUnicode_FromOrdinal(ch);
if (res == NULL)
return NULL;
return Py_BuildValue("(Nn)", res, start + bytelength);
PyObject *res = PyUnicode_FromOrdinal(ch);
if (res == NULL) {
return NULL;
}
return Py_BuildValue("(Nn)", res, start + bytelength);

bail:
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}


/* This handler is declared static until someone demonstrates
a need to call it directly. */
static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
{
if (_PyIsUnicodeEncodeError(exc)) {
return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
}
else if (_PyIsUnicodeDecodeError(exc)) {
return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
}
else {
wrong_exception_type(exc);
return NULL;
}
}


static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
Expand Down Expand Up @@ -1438,11 +1478,13 @@ namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
}


static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
static inline PyObject *
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
return PyCodec_SurrogatePassErrors(exc);
}


static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
{
return PyCodec_SurrogateEscapeErrors(exc);
Expand Down
Loading