Skip to content

Commit 76e1350

Browse files
author
VemundH
committed
Add C++20 char8_t/u8string support
1 parent dada605 commit 76e1350

File tree

3 files changed

+69
-4
lines changed

3 files changed

+69
-4
lines changed

include/pybind11/cast.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
#include <string_view>
3333
#endif
3434

35+
#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L
36+
# define PYBIND11_HAS_U8STRING
37+
#endif
38+
3539
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
3640
NAMESPACE_BEGIN(detail)
3741

@@ -988,6 +992,9 @@ template <typename type> class type_caster<std::reference_wrapper<type>> {
988992

989993
template <typename CharT> using is_std_char_type = any_of<
990994
std::is_same<CharT, char>, /* std::string */
995+
#if defined(PYBIND11_HAS_U8STRING)
996+
std::is_same<CharT, char8_t>, /* std::u8string */
997+
#endif
991998
std::is_same<CharT, char16_t>, /* std::u16string */
992999
std::is_same<CharT, char32_t>, /* std::u32string */
9931000
std::is_same<CharT, wchar_t> /* std::wstring */
@@ -1191,6 +1198,9 @@ template <typename StringType, bool IsView = false> struct string_caster {
11911198
// Simplify life by being able to assume standard char sizes (the standard only guarantees
11921199
// minimums, but Python requires exact sizes)
11931200
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
1201+
#if defined(PYBIND11_HAS_U8STRING)
1202+
static_assert(!std::is_same<CharT, char8_t>::value || sizeof(CharT) == 1, "Unsupported char8_t size != 1");
1203+
#endif
11941204
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
11951205
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
11961206
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
@@ -1209,7 +1219,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
12091219
#if PY_MAJOR_VERSION >= 3
12101220
return load_bytes(load_src);
12111221
#else
1212-
if (sizeof(CharT) == 1) {
1222+
if (std::is_same<CharT, char>::value) {
12131223
return load_bytes(load_src);
12141224
}
12151225

@@ -1269,7 +1279,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
12691279
// without any encoding/decoding attempt). For other C++ char sizes this is a no-op.
12701280
// which supports loading a unicode from a str, doesn't take this path.
12711281
template <typename C = CharT>
1272-
bool load_bytes(enable_if_t<sizeof(C) == 1, handle> src) {
1282+
bool load_bytes(enable_if_t<std::is_same<C, char>::value, handle> src) {
12731283
if (PYBIND11_BYTES_CHECK(src.ptr())) {
12741284
// We were passed a Python 3 raw bytes; accept it into a std::string or char*
12751285
// without any encoding attempt.
@@ -1284,7 +1294,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
12841294
}
12851295

12861296
template <typename C = CharT>
1287-
bool load_bytes(enable_if_t<sizeof(C) != 1, handle>) { return false; }
1297+
bool load_bytes(enable_if_t<!std::is_same<C, char>::value, handle>) { return false; }
12881298
};
12891299

12901300
template <typename CharT, class Traits, class Allocator>

tests/test_builtin_casters.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@ TEST_SUBMODULE(builtin_casters, m) {
6060
m.def("strlen", [](char *s) { return strlen(s); });
6161
m.def("string_length", [](std::string s) { return s.length(); });
6262

63+
#ifdef PYBIND11_HAS_U8STRING
64+
m.attr("has_u8string") = true;
65+
m.def("good_utf8_u8string", []() { return std::u8string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
66+
m.def("bad_utf8_u8string", []() { return std::u8string((const char8_t*)"abc\xd0" "def"); });
67+
68+
m.def("u8_char8_Z", []() -> char8_t { return u8'Z'; });
69+
70+
// test_single_char_arguments
71+
m.def("ord_char8", [](char8_t c) -> int { return static_cast<unsigned char>(c); });
72+
m.def("ord_char8_lv", [](char8_t &c) -> int { return static_cast<unsigned char>(c); });
73+
#endif
74+
6375
// test_string_view
6476
#ifdef PYBIND11_HAS_STRING_VIEW
6577
m.attr("has_string_view") = true;
@@ -72,6 +84,12 @@ TEST_SUBMODULE(builtin_casters, m) {
7284
m.def("string_view_return", []() { return std::string_view((const char*)u8"utf8 secret \U0001f382"); });
7385
m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); });
7486
m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); });
87+
88+
# ifdef PYBIND11_HAS_U8STRING
89+
m.def("string_view8_print", [](std::u8string_view s) { py::print(s, s.size()); });
90+
m.def("string_view8_chars", [](std::u8string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
91+
m.def("string_view8_return", []() { return std::u8string_view(u8"utf8 secret \U0001f382"); });
92+
# endif
7593
#endif
7694

7795
// test_integer_casting

tests/test_builtin_casters.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ def test_unicode_conversion():
1515
assert m.good_utf16_string() == u"b‽🎂𝐀z"
1616
assert m.good_utf32_string() == u"a𝐀🎂‽z"
1717
assert m.good_wchar_string() == u"a⸘𝐀z"
18+
if hasattr(m, "has_u8string") :
19+
assert m.good_utf8_u8string() == u"Say utf8‽ 🎂 𝐀"
1820

1921
with pytest.raises(UnicodeDecodeError):
2022
m.bad_utf8_string()
@@ -29,13 +31,17 @@ def test_unicode_conversion():
2931
if hasattr(m, "bad_wchar_string"):
3032
with pytest.raises(UnicodeDecodeError):
3133
m.bad_wchar_string()
34+
if hasattr(m, "has_u8string") :
35+
with pytest.raises(UnicodeDecodeError):
36+
m.bad_utf8_u8string()
3237

3338
assert m.u8_Z() == 'Z'
3439
assert m.u8_eacute() == u'é'
3540
assert m.u16_ibang() == u'‽'
3641
assert m.u32_mathbfA() == u'𝐀'
3742
assert m.wchar_heart() == u'♥'
38-
43+
if hasattr(m, "has_u8string") :
44+
assert m.u8_char8_Z() == 'Z'
3945

4046
def test_single_char_arguments():
4147
"""Tests failures for passing invalid inputs to char-accepting functions"""
@@ -92,6 +98,16 @@ def toobig_message(r):
9298
assert m.ord_wchar(u'aa')
9399
assert str(excinfo.value) == toolong_message
94100

101+
if hasattr(m, "has_u8string") :
102+
assert m.ord_char8(u'a') == 0x61 # simple ASCII
103+
assert m.ord_char8_lv(u'b') == 0x62
104+
assert m.ord_char8(u'é') == 0xE9 # requires 2 bytes in utf-8, but can be stuffed in a char
105+
with pytest.raises(ValueError) as excinfo:
106+
assert m.ord_char8(u'Ā') == 0x100 # requires 2 bytes, doesn't fit in a char
107+
assert str(excinfo.value) == toobig_message(0x100)
108+
with pytest.raises(ValueError) as excinfo:
109+
assert m.ord_char8(u'ab')
110+
assert str(excinfo.value) == toolong_message
95111

96112
def test_bytes_to_string():
97113
"""Tests the ability to pass bytes to C++ string-accepting functions. Note that this is
@@ -116,10 +132,15 @@ def test_string_view(capture):
116132
assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
117133
assert m.string_view16_chars("Hi 🎂") == [72, 105, 32, 0xd83c, 0xdf82]
118134
assert m.string_view32_chars("Hi 🎂") == [72, 105, 32, 127874]
135+
if hasattr(m, "has_u8string") :
136+
assert m.string_view8_chars("Hi") == [72, 105]
137+
assert m.string_view8_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
119138

120139
assert m.string_view_return() == "utf8 secret 🎂"
121140
assert m.string_view16_return() == "utf16 secret 🎂"
122141
assert m.string_view32_return() == "utf32 secret 🎂"
142+
if hasattr(m, "has_u8string") :
143+
assert m.string_view8_return() == "utf8 secret 🎂"
123144

124145
with capture:
125146
m.string_view_print("Hi")
@@ -132,6 +153,14 @@ def test_string_view(capture):
132153
utf16 🎂 8
133154
utf32 🎂 7
134155
"""
156+
if hasattr(m, "has_u8string") :
157+
with capture:
158+
m.string_view8_print("Hi")
159+
m.string_view8_print("utf8 🎂")
160+
assert capture == """
161+
Hi 2
162+
utf8 🎂 9
163+
"""
135164

136165
with capture:
137166
m.string_view_print("Hi, ascii")
@@ -144,6 +173,14 @@ def test_string_view(capture):
144173
Hi, utf16 🎂 12
145174
Hi, utf32 🎂 11
146175
"""
176+
if hasattr(m, "has_u8string") :
177+
with capture:
178+
m.string_view8_print("Hi, ascii")
179+
m.string_view8_print("Hi, utf8 🎂")
180+
assert capture == """
181+
Hi, ascii 9
182+
Hi, utf8 🎂 13
183+
"""
147184

148185

149186
def test_integer_casting():

0 commit comments

Comments
(0)