Skip to content

Add C++20 char8_t/u8string support #2026

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions include/pybind11/cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@
#include <string_view>
#endif

// Detect standard-library support for C++20 char8_t (and with it std::u8string) through the
// __cpp_lib_char8_t feature-test macro. PYBIND11_HAS_U8STRING is defined only when that macro
// is present with a value of at least 201811L, and is used to guard char8_t-specific code.
#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L
# define PYBIND11_HAS_U8STRING
#endif

NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail)

Expand Down Expand Up @@ -988,6 +992,9 @@ template <typename type> class type_caster<std::reference_wrapper<type>> {

template <typename CharT> using is_std_char_type = any_of<
std::is_same<CharT, char>, /* std::string */
#if defined(PYBIND11_HAS_U8STRING)
std::is_same<CharT, char8_t>, /* std::u8string */
#endif
std::is_same<CharT, char16_t>, /* std::u16string */
std::is_same<CharT, char32_t>, /* std::u32string */
std::is_same<CharT, wchar_t> /* std::wstring */
Expand Down Expand Up @@ -1191,6 +1198,9 @@ template <typename StringType, bool IsView = false> struct string_caster {
// Simplify life by being able to assume standard char sizes (the standard only guarantees
// minimums, but Python requires exact sizes)
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
#if defined(PYBIND11_HAS_U8STRING)
static_assert(!std::is_same<CharT, char8_t>::value || sizeof(CharT) == 1, "Unsupported char8_t size != 1");
#endif
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
Expand All @@ -1209,7 +1219,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
#if PY_MAJOR_VERSION >= 3
return load_bytes(load_src);
#else
if (sizeof(CharT) == 1) {
if (std::is_same<CharT, char>::value) {
return load_bytes(load_src);
}

Expand Down Expand Up @@ -1269,7 +1279,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
// without any encoding/decoding attempt). For other C++ char sizes this is a no-op.
// which supports loading a unicode from a str, doesn't take this path.
template <typename C = CharT>
bool load_bytes(enable_if_t<sizeof(C) == 1, handle> src) {
bool load_bytes(enable_if_t<std::is_same<C, char>::value, handle> src) {
if (PYBIND11_BYTES_CHECK(src.ptr())) {
// We were passed a Python 3 raw bytes; accept it into a std::string or char*
// without any encoding attempt.
Expand All @@ -1284,7 +1294,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
}

template <typename C = CharT>
bool load_bytes(enable_if_t<sizeof(C) != 1, handle>) { return false; }
bool load_bytes(enable_if_t<!std::is_same<C, char>::value, handle>) { return false; }
};

template <typename CharT, class Traits, class Allocator>
Expand Down
22 changes: 20 additions & 2 deletions tests/test_builtin_casters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ TEST_SUBMODULE(builtin_casters, m) {
else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
wstr.push_back(0x7a); // z

m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
m.def("good_utf8_string", []() { return std::string((const char*)u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z
m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z
m.def("good_wchar_string", [=]() { return wstr; }); // a‽𝐀z
Expand Down Expand Up @@ -60,6 +60,18 @@ TEST_SUBMODULE(builtin_casters, m) {
m.def("strlen", [](char *s) { return strlen(s); });
m.def("string_length", [](std::string s) { return s.length(); });

// char8_t / std::u8string bindings; compiled only when PYBIND11_HAS_U8STRING is defined.
#ifdef PYBIND11_HAS_U8STRING
// Marker attribute: the Python side checks hasattr(m, "has_u8string") before using the
// bindings registered in this group.
m.attr("has_u8string") = true;
m.def("good_utf8_u8string", []() { return std::u8string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
// "\xd0" begins a two-byte UTF-8 sequence but is followed by 'd', so these bytes are not
// valid UTF-8; the literal is split so that "\xd0" is not parsed as a longer hex escape.
m.def("bad_utf8_u8string", []() { return std::u8string((const char8_t*)"abc\xd0" "def"); });

// Returns the single char8_t value u8'Z'.
m.def("u8_char8_Z", []() -> char8_t { return u8'Z'; });

// test_single_char_arguments
// Both return the numeric value of a single char8_t argument, taken by value and by
// lvalue reference respectively; the unsigned char cast happens before widening to int.
m.def("ord_char8", [](char8_t c) -> int { return static_cast<unsigned char>(c); });
m.def("ord_char8_lv", [](char8_t &c) -> int { return static_cast<unsigned char>(c); });
#endif

// test_string_view
#ifdef PYBIND11_HAS_STRING_VIEW
m.attr("has_string_view") = true;
Expand All @@ -69,9 +81,15 @@ TEST_SUBMODULE(builtin_casters, m) {
m.def("string_view_chars", [](std::string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
m.def("string_view16_chars", [](std::u16string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
m.def("string_view32_chars", [](std::u32string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
m.def("string_view_return", []() { return std::string_view(u8"utf8 secret \U0001f382"); });
m.def("string_view_return", []() { return std::string_view((const char*)u8"utf8 secret \U0001f382"); });
m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); });
m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); });

// std::u8string_view bindings; this group sits inside the PYBIND11_HAS_STRING_VIEW section,
// so it is compiled only when both string_view and char8_t support are available.
# ifdef PYBIND11_HAS_U8STRING
// Prints the view followed by its size().
m.def("string_view8_print", [](std::u8string_view s) { py::print(s, s.size()); });
// Returns a list holding every element of the view, each cast to std::uint8_t.
m.def("string_view8_chars", [](std::u8string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
// Returns a view over a u8 string literal.
m.def("string_view8_return", []() { return std::u8string_view(u8"utf8 secret \U0001f382"); });
# endif
#endif

// test_integer_casting
Expand Down
39 changes: 39 additions & 0 deletions tests/test_builtin_casters.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ def test_unicode_conversion():
assert m.good_utf16_string() == u"b‽🎂𝐀z"
assert m.good_utf32_string() == u"a𝐀🎂‽z"
assert m.good_wchar_string() == u"a⸘𝐀z"
if hasattr(m, "has_u8string"):
assert m.good_utf8_u8string() == u"Say utf8‽ 🎂 𝐀"

with pytest.raises(UnicodeDecodeError):
m.bad_utf8_string()
Expand All @@ -29,12 +31,17 @@ def test_unicode_conversion():
if hasattr(m, "bad_wchar_string"):
with pytest.raises(UnicodeDecodeError):
m.bad_wchar_string()
if hasattr(m, "has_u8string"):
with pytest.raises(UnicodeDecodeError):
m.bad_utf8_u8string()

assert m.u8_Z() == 'Z'
assert m.u8_eacute() == u'é'
assert m.u16_ibang() == u'‽'
assert m.u32_mathbfA() == u'𝐀'
assert m.wchar_heart() == u'♥'
if hasattr(m, "has_u8string"):
assert m.u8_char8_Z() == 'Z'


def test_single_char_arguments():
Expand Down Expand Up @@ -92,6 +99,17 @@ def toobig_message(r):
assert m.ord_wchar(u'aa')
assert str(excinfo.value) == toolong_message

if hasattr(m, "has_u8string"):
assert m.ord_char8(u'a') == 0x61 # simple ASCII
assert m.ord_char8_lv(u'b') == 0x62
assert m.ord_char8(u'é') == 0xE9 # requires 2 bytes in utf-8, but can be stuffed in a char
with pytest.raises(ValueError) as excinfo:
assert m.ord_char8(u'Ā') == 0x100 # requires 2 bytes, doesn't fit in a char
assert str(excinfo.value) == toobig_message(0x100)
with pytest.raises(ValueError) as excinfo:
assert m.ord_char8(u'ab')
assert str(excinfo.value) == toolong_message


def test_bytes_to_string():
"""Tests the ability to pass bytes to C++ string-accepting functions. Note that this is
Expand All @@ -116,10 +134,15 @@ def test_string_view(capture):
assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
assert m.string_view16_chars("Hi 🎂") == [72, 105, 32, 0xd83c, 0xdf82]
assert m.string_view32_chars("Hi 🎂") == [72, 105, 32, 127874]
if hasattr(m, "has_u8string"):
assert m.string_view8_chars("Hi") == [72, 105]
assert m.string_view8_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]

assert m.string_view_return() == "utf8 secret 🎂"
assert m.string_view16_return() == "utf16 secret 🎂"
assert m.string_view32_return() == "utf32 secret 🎂"
if hasattr(m, "has_u8string"):
assert m.string_view8_return() == "utf8 secret 🎂"

with capture:
m.string_view_print("Hi")
Expand All @@ -132,6 +155,14 @@ def test_string_view(capture):
utf16 🎂 8
utf32 🎂 7
"""
if hasattr(m, "has_u8string"):
with capture:
m.string_view8_print("Hi")
m.string_view8_print("utf8 🎂")
assert capture == """
Hi 2
utf8 🎂 9
"""

with capture:
m.string_view_print("Hi, ascii")
Expand All @@ -144,6 +175,14 @@ def test_string_view(capture):
Hi, utf16 🎂 12
Hi, utf32 🎂 11
"""
if hasattr(m, "has_u8string"):
with capture:
m.string_view8_print("Hi, ascii")
m.string_view8_print("Hi, utf8 🎂")
assert capture == """
Hi, ascii 9
Hi, utf8 🎂 13
"""


def test_integer_casting():
Expand Down