Skip to content

Commit 096329f

Browse files
pganssle authored and Alexey Izbyshev committed
bpo-34454: fix .fromisoformat() methods crashing on inputs with surrogate code points (GH-8862)
The current C implementations **crash** if the input includes a surrogate Unicode code point, which is not possible to encode in UTF-8.

Important notes:

1. It is possible to pass a non-UTF-8 string as a separator to the `.isoformat()` methods.
2. The pure-Python `datetime.fromisoformat()` implementation accepts strings with a surrogate as the separator.

In `datetime.fromisoformat()`, in the special case of non-UTF-8 separators, this implementation will take a performance hit by making a copy of the input string and replacing the separator with 'T'.

Co-authored-by: Alexey Izbyshev <[email protected]>
Co-authored-by: Paul Ganssle <[email protected]>
1 parent c33bb5d commit 096329f

File tree

3 files changed

+84
-10
lines changed

3 files changed

+84
-10
lines changed

Lib/test/datetimetester.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1667,6 +1667,7 @@ def test_fromisoformat_fails(self):
16671667
# Test that fromisoformat() fails on invalid values
16681668
bad_strs = [
16691669
'', # Empty string
1670+
'\ud800', # bpo-34454: Surrogate code point
16701671
'009-03-04', # Not 10 characters
16711672
'123456789', # Not a date
16721673
'200a-12-04', # Invalid character in year
@@ -1675,6 +1676,7 @@ def test_fromisoformat_fails(self):
16751676
'2009-01-32', # Invalid day
16761677
'2009-02-29', # Invalid leap day
16771678
'20090228', # Valid ISO8601 output not from isoformat()
1679+
'2009\ud80002\ud80028', # Separators are surrogate codepoints
16781680
]
16791681

16801682
for bad_str in bad_strs:
@@ -2587,7 +2589,8 @@ def test_fromisoformat_separators(self):
25872589
' ', 'T', '\u007f', # 1-bit widths
25882590
'\u0080', 'ʁ', # 2-bit widths
25892591
'ᛇ', '時', # 3-bit widths
2590-
'🐍' # 4-bit widths
2592+
'🐍', # 4-bit widths
2593+
'\ud800', # bpo-34454: Surrogate code point
25912594
]
25922595

25932596
for sep in separators:
@@ -2639,6 +2642,7 @@ def test_fromisoformat_fails_datetime(self):
26392642
# Test that fromisoformat() fails on invalid values
26402643
bad_strs = [
26412644
'', # Empty string
2645+
'\ud800', # bpo-34454: Surrogate code point
26422646
'2009.04-19T03', # Wrong first separator
26432647
'2009-04.19T03', # Wrong second separator
26442648
'2009-04-19T0a', # Invalid hours
@@ -2652,6 +2656,8 @@ def test_fromisoformat_fails_datetime(self):
26522656
'2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset
26532657
'2009-04-19T03:15:45.123456-24:30', # Invalid negative offset
26542658
'2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators
2659+
'2009-04\ud80010T12:15', # Surrogate char in date
2660+
'2009-04-10T12\ud80015', # Surrogate char in time
26552661
'2009-04-19T1', # Incomplete hours
26562662
'2009-04-19T12:3', # Incomplete minutes
26572663
'2009-04-19T12:30:4', # Incomplete seconds
@@ -3521,6 +3527,7 @@ def test_fromisoformat_timespecs(self):
35213527
def test_fromisoformat_fails(self):
35223528
bad_strs = [
35233529
'', # Empty string
3530+
'12\ud80000', # Invalid separator - surrogate char
35243531
'12:', # Ends on a separator
35253532
'12:30:', # Ends on a separator
35263533
'12:30:15.', # Ends on a separator
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix the .fromisoformat() methods of datetime types crashing when given
2+
unicode with non-UTF-8-encodable code points. Specifically,
3+
datetime.fromisoformat() now accepts surrogate unicode code points used as
4+
the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle.

Modules/_datetimemodule.c

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
28832883
Py_ssize_t len;
28842884

28852885
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
2886+
if (dt_ptr == NULL) {
2887+
goto invalid_string_error;
2888+
}
28862889

28872890
int year = 0, month = 0, day = 0;
28882891

@@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
28942897
}
28952898

28962899
if (rv < 0) {
2897-
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s",
2898-
dt_ptr);
2899-
return NULL;
2900+
goto invalid_string_error;
29002901
}
29012902

29022903
return new_date_subclass_ex(year, month, day, cls);
2904+
2905+
invalid_string_error:
2906+
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
2907+
dtstr);
2908+
return NULL;
29032909
}
29042910

29052911

@@ -4258,15 +4264,18 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
42584264
Py_ssize_t len;
42594265
const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);
42604266

4267+
if (p == NULL) {
4268+
goto invalid_string_error;
4269+
}
4270+
42614271
int hour = 0, minute = 0, second = 0, microsecond = 0;
42624272
int tzoffset, tzimicrosecond = 0;
42634273
int rv = parse_isoformat_time(p, len,
42644274
&hour, &minute, &second, &microsecond,
42654275
&tzoffset, &tzimicrosecond);
42664276

42674277
if (rv < 0) {
4268-
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p);
4269-
return NULL;
4278+
goto invalid_string_error;
42704279
}
42714280

42724281
PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
@@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
42864295

42874296
Py_DECREF(tzinfo);
42884297
return t;
4298+
4299+
invalid_string_error:
4300+
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr);
4301+
return NULL;
42894302
}
42904303

42914304

@@ -4839,6 +4852,33 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
48394852
return result;
48404853
}
48414854

4855+
static PyObject *
4856+
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
4857+
// `fromisoformat` allows surrogate characters in exactly one position,
4858+
// the separator; to allow datetime_fromisoformat to make the simplifying
4859+
// assumption that all valid strings can be encoded in UTF-8, this function
4860+
// replaces any surrogate character separators with `T`.
4861+
Py_ssize_t len = PyUnicode_GetLength(dtstr);
4862+
*needs_decref = 0;
4863+
if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
4864+
return dtstr;
4865+
}
4866+
4867+
PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
4868+
if (str_out == NULL) {
4869+
return NULL;
4870+
}
4871+
4872+
if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
4873+
PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
4874+
Py_DECREF(str_out);
4875+
return NULL;
4876+
}
4877+
4878+
*needs_decref = 1;
4879+
return str_out;
4880+
}
4881+
48424882
static PyObject *
48434883
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
48444884
assert(dtstr != NULL);
@@ -4848,9 +4888,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
48484888
return NULL;
48494889
}
48504890

4891+
int needs_decref = 0;
4892+
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
4893+
if (dtstr == NULL) {
4894+
goto error;
4895+
}
4896+
48514897
Py_ssize_t len;
48524898
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
4853-
const char * p = dt_ptr;
4899+
4900+
if (dt_ptr == NULL) {
4901+
goto invalid_string_error;
4902+
}
4903+
4904+
const char *p = dt_ptr;
48544905

48554906
int year = 0, month = 0, day = 0;
48564907
int hour = 0, minute = 0, second = 0, microsecond = 0;
@@ -4883,20 +4934,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
48834934
&tzoffset, &tzusec);
48844935
}
48854936
if (rv < 0) {
4886-
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr);
4887-
return NULL;
4937+
goto invalid_string_error;
48884938
}
48894939

48904940
PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
48914941
if (tzinfo == NULL) {
4892-
return NULL;
4942+
goto error;
48934943
}
48944944

48954945
PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
48964946
second, microsecond, tzinfo, cls);
48974947

48984948
Py_DECREF(tzinfo);
4949+
if (needs_decref) {
4950+
Py_DECREF(dtstr);
4951+
}
48994952
return dt;
4953+
4954+
invalid_string_error:
4955+
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
4956+
4957+
error:
4958+
if (needs_decref) {
4959+
Py_DECREF(dtstr);
4960+
}
4961+
4962+
return NULL;
49004963
}
49014964

49024965

0 commit comments

Comments
 (0)