From 0dcaca6fcf407c3b9891499f1a3e314619d67165 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Fri, 28 Oct 2022 11:26:30 -0400 Subject: [PATCH 1/6] (bugfix): Improve bytes to str decoding error handling --- include/pybind11/pytypes.h | 9 +++++++++ tests/test_pytypes.cpp | 3 +++ tests/test_pytypes.py | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/include/pybind11/pytypes.h b/include/pybind11/pytypes.h index 37fc49a0e1..80a2e2228e 100644 --- a/include/pybind11/pytypes.h +++ b/include/pybind11/pytypes.h @@ -1432,6 +1432,9 @@ class str : public object { str(const char *c, const SzType &n) : object(PyUnicode_FromStringAndSize(c, ssize_t_cast(n)), stolen_t{}) { if (!m_ptr) { + if (PyErr_Occurred()) { + throw error_already_set(); + } pybind11_fail("Could not allocate string object!"); } } @@ -1441,6 +1444,9 @@ class str : public object { // NOLINTNEXTLINE(google-explicit-constructor) str(const char *c = "") : object(PyUnicode_FromString(c), stolen_t{}) { if (!m_ptr) { + if (PyErr_Occurred()) { + throw error_already_set(); + } pybind11_fail("Could not allocate string object!"); } } @@ -1598,6 +1604,9 @@ inline str::str(const bytes &b) { } auto obj = reinterpret_steal(PyUnicode_FromStringAndSize(buffer, length)); if (!obj) { + if (PyErr_Occurred()) { + throw error_already_set(); + } pybind11_fail("Could not allocate string object!"); } m_ptr = obj.release().ptr(); diff --git a/tests/test_pytypes.cpp b/tests/test_pytypes.cpp index 99237ccef7..bee40a8f27 100644 --- a/tests/test_pytypes.cpp +++ b/tests/test_pytypes.cpp @@ -219,6 +219,9 @@ TEST_SUBMODULE(pytypes, m) { return py::make_tuple(s1, s2); }); + m.def("bytes_to_str_explicit", + [](const py::bytes &encoded_str) { return py::str(encoded_str); }); + // test_bytes m.def("bytes_from_char_ssize_t", []() { return py::bytes{"green", (py::ssize_t) 5}; }); m.def("bytes_from_char_size_t", []() { return py::bytes{"purple", (py::size_t) 6}; }); diff --git a/tests/test_pytypes.py b/tests/test_pytypes.py index 3ed7b9c946..510faefa59 100644 --- a/tests/test_pytypes.py +++ b/tests/test_pytypes.py @@ -244,6 +244,12 @@ def __repr__(self): m.str_from_string_from_str(ucs_surrogates_str) +def test_surrogate_pairs_unicode_error(): + input_str = "\ud83d\ude4f".encode("utf-8", "surrogatepass") + with pytest.raises(UnicodeDecodeError): + m.bytes_to_str_explicit(input_str) + + def test_bytes(doc): assert m.bytes_from_char_ssize_t().decode() == "green" assert m.bytes_from_char_size_t().decode() == "purple" From f57c20ed1613b0c9a2ce15b367f76e0437b22585 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Fri, 28 Oct 2022 11:32:58 -0400 Subject: [PATCH 2/6] regroup test --- tests/test_pytypes.cpp | 5 ++--- tests/test_pytypes.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_pytypes.cpp b/tests/test_pytypes.cpp index bee40a8f27..216ca537cc 100644 --- a/tests/test_pytypes.cpp +++ b/tests/test_pytypes.cpp @@ -207,6 +207,8 @@ TEST_SUBMODULE(pytypes, m) { m.def("str_from_char_size_t", []() { return py::str{"blue", (py::size_t) 4}; }); m.def("str_from_string", []() { return py::str(std::string("baz")); }); m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); }); + m.def("str_from_bytes_input", + [](const py::bytes &encoded_str) { return py::str(encoded_str); }); m.def("str_from_object", [](const py::object &obj) { return py::str(obj); }); m.def("repr_from_object", [](const py::object &obj) { return py::repr(obj); }); m.def("str_from_handle", [](py::handle h) { return py::str(h); }); @@ -219,9 +221,6 @@ TEST_SUBMODULE(pytypes, m) { return py::make_tuple(s1, s2); }); - m.def("bytes_to_str_explicit", - [](const py::bytes &encoded_str) { return py::str(encoded_str); }); - // test_bytes m.def("bytes_from_char_ssize_t", []() { return py::bytes{"green", (py::ssize_t) 5}; }); m.def("bytes_from_char_size_t", []() { return py::bytes{"purple", (py::size_t) 6}; }); diff --git a/tests/test_pytypes.py b/tests/test_pytypes.py index 510faefa59..69c47c6cbf 100644 --- a/tests/test_pytypes.py +++ b/tests/test_pytypes.py @@ -247,7 +247,7 @@ def __repr__(self): def test_surrogate_pairs_unicode_error(): input_str = "\ud83d\ude4f".encode("utf-8", "surrogatepass") with pytest.raises(UnicodeDecodeError): - m.bytes_to_str_explicit(input_str) + m.str_from_bytes_input(input_str) def test_bytes(doc): From eede8f2ac28ab7d7321761c77c4ba086640d7dd3 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Fri, 28 Oct 2022 11:39:35 -0400 Subject: [PATCH 3/6] Further broaden tests --- tests/test_pytypes.cpp | 1 + tests/test_pytypes.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_pytypes.cpp b/tests/test_pytypes.cpp index 216ca537cc..bc508ac797 100644 --- a/tests/test_pytypes.cpp +++ b/tests/test_pytypes.cpp @@ -206,6 +206,7 @@ TEST_SUBMODULE(pytypes, m) { m.def("str_from_char_ssize_t", []() { return py::str{"red", (py::ssize_t) 3}; }); m.def("str_from_char_size_t", []() { return py::str{"blue", (py::size_t) 4}; }); m.def("str_from_string", []() { return py::str(std::string("baz")); }); + m.def("str_from_string_input", [](const std::string &stri) { return py::str(stri); }); m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); }); m.def("str_from_bytes_input", [](const py::bytes &encoded_str) { return py::str(encoded_str); }); diff --git a/tests/test_pytypes.py b/tests/test_pytypes.py index 69c47c6cbf..035e07312a 100644 --- a/tests/test_pytypes.py +++ b/tests/test_pytypes.py @@ -244,7 +244,10 @@ def __repr__(self): m.str_from_string_from_str(ucs_surrogates_str) -def test_surrogate_pairs_unicode_error(): +@pytest.mark.parametrize( + "func", [m.str_from_bytes_input, m.str_from_string_input, m.str_from_object] +) +def test_surrogate_pairs_unicode_error(func): input_str = "\ud83d\ude4f".encode("utf-8", "surrogatepass") with pytest.raises(UnicodeDecodeError): m.str_from_bytes_input(input_str) From f4021d842f740394912b3f34d83a0639a09ef7ce Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Fri, 28 Oct 2022 11:47:51 -0400 Subject: [PATCH 4/6] Add another decode error test --- tests/test_pytypes.cpp | 1 + tests/test_pytypes.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_pytypes.cpp b/tests/test_pytypes.cpp index bc508ac797..c6802f3a9a 100644 --- a/tests/test_pytypes.cpp +++ b/tests/test_pytypes.cpp @@ -207,6 +207,7 @@ TEST_SUBMODULE(pytypes, m) { m.def("str_from_char_size_t", []() { return py::str{"blue", (py::size_t) 4}; }); m.def("str_from_string", []() { return py::str(std::string("baz")); }); m.def("str_from_string_input", [](const std::string &stri) { return py::str(stri); }); + m.def("str_from_cstr_input", [](const std::string &stri) { return py::str(stri.c_str()); }); m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); }); m.def("str_from_bytes_input", [](const py::bytes &encoded_str) { return py::str(encoded_str); }); diff --git a/tests/test_pytypes.py b/tests/test_pytypes.py index 035e07312a..401c5db666 100644 --- a/tests/test_pytypes.py +++ b/tests/test_pytypes.py @@ -245,7 +245,13 @@ def __repr__(self): @pytest.mark.parametrize( - "func", [m.str_from_bytes_input, m.str_from_string_input, m.str_from_object] + "func", + [ + m.str_from_bytes_input, + m.str_from_string_input, + m.str_from_cstr_input, + m.str_from_object, + ], ) def test_surrogate_pairs_unicode_error(func): input_str = "\ud83d\ude4f".encode("utf-8", "surrogatepass") From bae70ef6beb8688d15f7c98d85e0c738e9d18ea3 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Fri, 28 Oct 2022 12:07:04 -0400 Subject: [PATCH 5/6] Fix bug in tests --- tests/test_pytypes.cpp | 2 +- tests/test_pytypes.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_pytypes.cpp b/tests/test_pytypes.cpp index c6802f3a9a..3f691929cb 100644 --- a/tests/test_pytypes.cpp +++ b/tests/test_pytypes.cpp @@ -207,7 +207,7 @@ TEST_SUBMODULE(pytypes, m) { m.def("str_from_char_size_t", []() { return py::str{"blue", (py::size_t) 4}; }); m.def("str_from_string", []() { return py::str(std::string("baz")); }); m.def("str_from_string_input", [](const std::string &stri) { return py::str(stri); }); - m.def("str_from_cstr_input", [](const std::string &stri) { return py::str(stri.c_str()); }); + m.def("str_from_cstr_input", [](const char *c_str) { return py::str(c_str); }); m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); }); m.def("str_from_bytes_input", [](const py::bytes &encoded_str) { return py::str(encoded_str); }); diff --git a/tests/test_pytypes.py b/tests/test_pytypes.py index 401c5db666..d3245b4e1b 100644 --- a/tests/test_pytypes.py +++ b/tests/test_pytypes.py @@ -248,15 +248,14 @@ def __repr__(self): "func", [ m.str_from_bytes_input, - m.str_from_string_input, m.str_from_cstr_input, - m.str_from_object, + m.str_from_string_input, ], ) def test_surrogate_pairs_unicode_error(func): input_str = "\ud83d\ude4f".encode("utf-8", "surrogatepass") with pytest.raises(UnicodeDecodeError): - m.str_from_bytes_input(input_str) + func(input_str) def test_bytes(doc): From b4308ccb5d153499bc2998712c3f11ff56c7e9e9 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Sat, 29 Oct 2022 10:36:50 -0400 Subject: [PATCH 6/6] Reviewer suggestions --- tests/test_pytypes.cpp | 3 ++- tests/test_pytypes.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_pytypes.cpp b/tests/test_pytypes.cpp index 3f691929cb..ea8d03958b 100644 --- a/tests/test_pytypes.cpp +++ b/tests/test_pytypes.cpp @@ -206,11 +206,12 @@ TEST_SUBMODULE(pytypes, m) { m.def("str_from_char_ssize_t", []() { return py::str{"red", (py::ssize_t) 3}; }); m.def("str_from_char_size_t", []() { return py::str{"blue", (py::size_t) 4}; }); m.def("str_from_string", []() { return py::str(std::string("baz")); }); - m.def("str_from_string_input", [](const std::string &stri) { return py::str(stri); }); + m.def("str_from_std_string_input", [](const std::string &stri) { return py::str(stri); }); m.def("str_from_cstr_input", [](const char *c_str) { return py::str(c_str); }); m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); }); m.def("str_from_bytes_input", [](const py::bytes &encoded_str) { return py::str(encoded_str); }); + m.def("str_from_object", [](const py::object &obj) { return py::str(obj); }); m.def("repr_from_object", [](const py::object &obj) { return py::repr(obj); }); m.def("str_from_handle", [](py::handle h) { return py::str(h); }); diff --git a/tests/test_pytypes.py b/tests/test_pytypes.py index d3245b4e1b..079ee7ca50 100644 --- a/tests/test_pytypes.py +++ b/tests/test_pytypes.py @@ -249,7 +249,7 @@ def __repr__(self): [ m.str_from_bytes_input, m.str_from_cstr_input, - m.str_from_string_input, + m.str_from_std_string_input, ], ) def test_surrogate_pairs_unicode_error(func):