diff --git a/docs/changelog.rst b/docs/changelog.rst index 62f0ac9b67..84f1352fb9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -6,10 +6,15 @@ Changelog Starting with version 1.8.0, pybind11 releases use a `semantic versioning `_ policy. -v2.6.3 (TBA, not yet released) +v2.7.0 (TBA, not yet released) ------------------------------ -* Details to follow here +* ``py::str`` changed to exclusively hold `PyUnicodeObject`. Previously + ``py::str`` could also hold `bytes`, which is probably surprising, was + never documented, and can mask bugs (e.g. accidental use of ``py::str`` + instead of ``py::bytes``). + `#2409 `_ + v2.6.2 (Jan 26, 2021) --------------------- diff --git a/docs/upgrade.rst b/docs/upgrade.rst index f13fbcf56d..5691879c85 100644 --- a/docs/upgrade.rst +++ b/docs/upgrade.rst @@ -10,6 +10,31 @@ modernization and other useful information. .. _upgrade-guide-2.6: +v2.7 +==== + +*Before* v2.7, ``py::str`` can hold ``PyUnicodeObject`` or ``PyBytesObject``, +and ``py::isinstance()`` is ``true`` for both ``py::str`` and +``py::bytes``. Starting with v2.7, ``py::str`` exclusively holds +``PyUnicodeObject`` (`#2409 `_), +and ``py::isinstance()`` is ``true`` only for ``py::str``. To help in +the transition of user code, the ``PYBIND11_STR_LEGACY_PERMISSIVE`` macro +is provided as an escape hatch to go back to the legacy behavior. This macro +will be removed in future releases. Two types of required fixes are expected +to be common: + +* Accidental use of ``py::str`` instead of ``py::bytes``, masked by the legacy + behavior. These are probably very easy to fix, by changing from + ``py::str`` to ``py::bytes``. + +* Reliance on py::isinstance(obj) being ``true`` for + ``py::bytes``. This is likely to be easy to fix in most cases by adding + ``|| py::isinstance(obj)``, but a fix may be more involved, e.g. if + ``py::isinstance`` appears in a template. Such situations will require + careful review and custom fixes. + + + v2.6 ==== diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 4f9f5d7f78..395ab732e4 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -1656,6 +1656,17 @@ struct pyobject_caster { template ::value, int> = 0> bool load(handle src, bool /* convert */) { +#if PY_MAJOR_VERSION < 3 && !defined(PYBIND11_STR_LEGACY_PERMISSIVE) + // For Python 2, without this implicit conversion, Python code would + // need to be cluttered with six.ensure_text() or similar, only to be + // un-cluttered later after Python 2 support is dropped. + if (std::is_same::value && isinstance(src)) { + PyObject *str_from_bytes = PyUnicode_FromEncodedObject(src.ptr(), "utf-8", nullptr); + if (!str_from_bytes) throw error_already_set(); + value = reinterpret_steal(str_from_bytes); + return true; + } +#endif if (!isinstance(src)) return false; value = reinterpret_borrow(src); diff --git a/include/pybind11/detail/common.h b/include/pybind11/detail/common.h index e314cc017c..4d33a7a8a3 100644 --- a/include/pybind11/detail/common.h +++ b/include/pybind11/detail/common.h @@ -163,6 +163,19 @@ #include #include +// #define PYBIND11_STR_LEGACY_PERMISSIVE +// If DEFINED, pybind11::str can hold PyUnicodeObject or PyBytesObject +// (probably surprising and never documented, but this was the +// legacy behavior until and including v2.6.x). As a side-effect, +// pybind11::isinstance() is true for both pybind11::str and +// pybind11::bytes. +// If UNDEFINED, pybind11::str can only hold PyUnicodeObject, and +// pybind11::isinstance() is true only for pybind11::str. +// However, for Python 2 only (!), the pybind11::str caster +// implicitly decodes bytes to PyUnicodeObject. This is to ease +// the transition from the legacy behavior to the non-permissive +// behavior. + #if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions #define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr) #define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check diff --git a/include/pybind11/pytypes.h b/include/pybind11/pytypes.h index 0a5bfef278..fce3fa2d34 100644 --- a/include/pybind11/pytypes.h +++ b/include/pybind11/pytypes.h @@ -756,7 +756,12 @@ inline bool PyIterable_Check(PyObject *obj) { inline bool PyNone_Check(PyObject *o) { return o == Py_None; } inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; } +#ifdef PYBIND11_STR_LEGACY_PERMISSIVE inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); } +#define PYBIND11_STR_CHECK_FUN detail::PyUnicode_Check_Permissive +#else +#define PYBIND11_STR_CHECK_FUN PyUnicode_Check +#endif inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; } @@ -936,7 +941,7 @@ class bytes; class str : public object { public: - PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str) + PYBIND11_OBJECT_CVT(str, object, PYBIND11_STR_CHECK_FUN, raw_str) str(const char *c, size_t n) : object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) { diff --git a/include/pybind11/stl.h b/include/pybind11/stl.h index 721bb669f0..18fbafb1e2 100644 --- a/include/pybind11/stl.h +++ b/include/pybind11/stl.h @@ -144,7 +144,7 @@ template struct list_caster { using value_conv = make_caster; bool load(handle src, bool convert) { - if (!isinstance(src) || isinstance(src)) + if (!isinstance(src) || isinstance(src) || isinstance(src)) return false; auto s = reinterpret_borrow(src); value.clear(); diff --git a/tests/test_pytypes.cpp b/tests/test_pytypes.cpp index 709611d222..d8fd77a9bb 100644 --- a/tests/test_pytypes.cpp +++ b/tests/test_pytypes.cpp @@ -413,4 +413,15 @@ TEST_SUBMODULE(pytypes, m) { // test_builtin_functions m.def("get_len", [](py::handle h) { return py::len(h); }); + +#ifdef PYBIND11_STR_LEGACY_PERMISSIVE + m.attr("PYBIND11_STR_LEGACY_PERMISSIVE") = true; +#endif + + m.def("isinstance_pybind11_bytes", [](py::object o) { return py::isinstance(o); }); + m.def("isinstance_pybind11_str", [](py::object o) { return py::isinstance(o); }); + + m.def("pass_to_pybind11_bytes", [](py::bytes b) { return py::len(b); }); + m.def("pass_to_pybind11_str", [](py::str s) { return py::len(s); }); + m.def("pass_to_std_string", [](std::string s) { return s.size(); }); } diff --git a/tests/test_pytypes.py b/tests/test_pytypes.py index b1509a0200..301015ae4d 100644 --- a/tests/test_pytypes.py +++ b/tests/test_pytypes.py @@ -120,14 +120,17 @@ def __repr__(self): assert s1 == s2 malformed_utf8 = b"\x80" - assert m.str_from_object(malformed_utf8) is malformed_utf8 # To be fixed; see #2380 + if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"): + assert m.str_from_object(malformed_utf8) is malformed_utf8 + elif env.PY2: + with pytest.raises(UnicodeDecodeError): + m.str_from_object(malformed_utf8) + else: + assert m.str_from_object(malformed_utf8) == "b'\\x80'" if env.PY2: - # with pytest.raises(UnicodeDecodeError): - # m.str_from_object(malformed_utf8) with pytest.raises(UnicodeDecodeError): m.str_from_handle(malformed_utf8) else: - # assert m.str_from_object(malformed_utf8) == "b'\\x80'" assert m.str_from_handle(malformed_utf8) == "b'\\x80'" @@ -303,13 +306,26 @@ def test_pybind11_str_raw_str(): valid_orig = u"DZ" valid_utf8 = valid_orig.encode("utf-8") valid_cvt = cvt(valid_utf8) - assert type(valid_cvt) == bytes # Probably surprising. - assert valid_cvt == b"\xc7\xb1" + if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"): + assert valid_cvt is valid_utf8 + else: + assert type(valid_cvt) is unicode if env.PY2 else str # noqa: F821 + if env.PY2: + assert valid_cvt == valid_orig + else: + assert valid_cvt == "b'\\xc7\\xb1'" malformed_utf8 = b"\x80" - malformed_cvt = cvt(malformed_utf8) - assert type(malformed_cvt) == bytes # Probably surprising. - assert malformed_cvt == b"\x80" + if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"): + assert cvt(malformed_utf8) is malformed_utf8 + else: + if env.PY2: + with pytest.raises(UnicodeDecodeError): + cvt(malformed_utf8) + else: + malformed_cvt = cvt(malformed_utf8) + assert type(malformed_cvt) is str + assert malformed_cvt == "b'\\x80'" def test_implicit_casting(): @@ -488,3 +504,40 @@ def test_builtin_functions(): "object of type 'generator' has no len()", "'generator' has no length", ] # PyPy + + +def test_isinstance_string_types(): + assert m.isinstance_pybind11_bytes(b"") + assert not m.isinstance_pybind11_bytes(u"") + + assert m.isinstance_pybind11_str(u"") + if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"): + assert m.isinstance_pybind11_str(b"") + else: + assert not m.isinstance_pybind11_str(b"") + + +def test_pass_bytes_or_unicode_to_string_types(): + assert m.pass_to_pybind11_bytes(b"Bytes") == 5 + with pytest.raises(TypeError): + m.pass_to_pybind11_bytes(u"Str") + + if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE") or env.PY2: + assert m.pass_to_pybind11_str(b"Bytes") == 5 + else: + with pytest.raises(TypeError): + m.pass_to_pybind11_str(b"Bytes") + assert m.pass_to_pybind11_str(u"Str") == 3 + + assert m.pass_to_std_string(b"Bytes") == 5 + assert m.pass_to_std_string(u"Str") == 3 + + malformed_utf8 = b"\x80" + if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"): + assert m.pass_to_pybind11_str(malformed_utf8) == 1 + elif env.PY2: + with pytest.raises(UnicodeDecodeError): + m.pass_to_pybind11_str(malformed_utf8) + else: + with pytest.raises(TypeError): + m.pass_to_pybind11_str(malformed_utf8)