Skip to content

Commit 6fe53d2

Browse files
committed
Add support for u16/u32strings and literals
This adds support for char{16,32}_t character literals and the associated std::u{16,32}string types. It also folds the character/string conversion into a single type_caster template, since the type casters for string and wstring were mostly the same anyway.
1 parent 245c87f commit 6fe53d2

File tree

5 files changed

+135
-88
lines changed

5 files changed

+135
-88
lines changed

docs/advanced/cast/overview.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,26 @@ as arguments and return values, refer to the section on binding :ref:`classes`.
9494
+------------------------------------+---------------------------+-------------------------------+
9595
| ``char`` | Character literal | :file:`pybind11/pybind11.h` |
9696
+------------------------------------+---------------------------+-------------------------------+
97+
| ``char16_t`` | UTF-16 character literal | :file:`pybind11/pybind11.h` |
98+
+------------------------------------+---------------------------+-------------------------------+
99+
| ``char32_t`` | UTF-32 character literal | :file:`pybind11/pybind11.h` |
100+
+------------------------------------+---------------------------+-------------------------------+
97101
| ``wchar_t`` | Wide character literal | :file:`pybind11/pybind11.h` |
98102
+------------------------------------+---------------------------+-------------------------------+
99103
| ``const char *`` | UTF-8 string literal | :file:`pybind11/pybind11.h` |
100104
+------------------------------------+---------------------------+-------------------------------+
105+
| ``const char16_t *`` | UTF-16 string literal | :file:`pybind11/pybind11.h` |
106+
+------------------------------------+---------------------------+-------------------------------+
107+
| ``const char32_t *`` | UTF-32 string literal | :file:`pybind11/pybind11.h` |
108+
+------------------------------------+---------------------------+-------------------------------+
101109
| ``const wchar_t *`` | Wide string literal | :file:`pybind11/pybind11.h` |
102110
+------------------------------------+---------------------------+-------------------------------+
103111
| ``std::string`` | STL dynamic UTF-8 string | :file:`pybind11/pybind11.h` |
104112
+------------------------------------+---------------------------+-------------------------------+
113+
| ``std::u16string`` | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h` |
114+
+------------------------------------+---------------------------+-------------------------------+
115+
| ``std::u32string`` | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h` |
116+
+------------------------------------+---------------------------+-------------------------------+
105117
| ``std::wstring`` | STL dynamic wide string | :file:`pybind11/pybind11.h` |
106118
+------------------------------------+---------------------------+-------------------------------+
107119
| ``std::pair<T1, T2>`` | Pair of two custom types | :file:`pybind11/pybind11.h` |

include/pybind11/cast.h

Lines changed: 57 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -471,8 +471,15 @@ template <typename type> class type_caster<std::reference_wrapper<type>> : publi
471471
template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>
472472

473473

474+
template <typename CharT> using is_std_char_type = any_of<
475+
std::is_same<CharT, char>, /* std::string */
476+
std::is_same<CharT, char16_t>, /* std::u16string */
477+
std::is_same<CharT, char32_t>, /* std::u32string */
478+
std::is_same<CharT, wchar_t> /* std::wstring */
479+
>;
480+
474481
template <typename T>
475-
struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value>> {
482+
struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
476483
typedef typename std::conditional<sizeof(T) <= sizeof(long), long, long long>::type _py_type_0;
477484
typedef typename std::conditional<std::is_signed<T>::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>::type _py_type_1;
478485
typedef typename std::conditional<std::is_floating_point<T>::value, double, _py_type_1>::type py_type;
@@ -613,41 +620,24 @@ template <> class type_caster<bool> {
613620
PYBIND11_TYPE_CASTER(bool, _("bool"));
614621
};
615622

616-
template <> class type_caster<std::string> {
617-
public:
618-
bool load(handle src, bool) {
619-
object temp;
620-
handle load_src = src;
621-
if (!src) {
622-
return false;
623-
} else if (PyUnicode_Check(load_src.ptr())) {
624-
temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(load_src.ptr()));
625-
if (!temp) { PyErr_Clear(); return false; } // UnicodeEncodeError
626-
load_src = temp;
627-
}
628-
char *buffer;
629-
ssize_t length;
630-
int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(load_src.ptr(), &buffer, &length);
631-
if (err == -1) { PyErr_Clear(); return false; } // TypeError
632-
value = std::string(buffer, (size_t) length);
633-
success = true;
634-
return true;
635-
}
623+
// Helper class for UTF-{8,16,32} strings:
624+
template <typename CharT, class Traits, class Allocator>
625+
struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>> {
626+
static constexpr unsigned int UTF_N =
627+
std::is_same<CharT, char>::value ? 8 :
628+
std::is_same<CharT, char16_t>::value ? 16 :
629+
std::is_same<CharT, char32_t>::value ? 32 :
630+
(sizeof(CharT) == 2 ? 16 : 32); /* std::wstring is UTF-16 on Windows, UTF-32 everywhere else */
636631

637-
static handle cast(const std::string &src, return_value_policy /* policy */, handle /* parent */) {
638-
handle s = PyUnicode_FromStringAndSize(src.c_str(), (ssize_t) src.length());
639-
if (!s)
640-
throw error_already_set();
641-
return s;
642-
}
632+
static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32";
643633

644-
PYBIND11_TYPE_CASTER(std::string, _(PYBIND11_STRING_NAME));
645-
protected:
646-
bool success = false;
647-
};
634+
// C++ only requires char/char16_t/char32_t to be at least 8/16/32 bits, but Python's encoding
635+
// assumes exactly 1/2/4 bytes:
636+
static_assert(sizeof(CharT) == UTF_N / 8,
637+
"Internal error: string type_caster requires 1/2/4-sized character types");
638+
639+
using StringType = std::basic_string<CharT, Traits, Allocator>;
648640

649-
template <> class type_caster<std::wstring> {
650-
public:
651641
bool load(handle src, bool) {
652642
object temp;
653643
handle load_src = src;
@@ -658,78 +648,60 @@ template <> class type_caster<std::wstring> {
658648
if (!temp) { PyErr_Clear(); return false; }
659649
load_src = temp;
660650
}
661-
wchar_t *buffer = nullptr;
662-
ssize_t length = -1;
663-
#if PY_MAJOR_VERSION >= 3
664-
buffer = PyUnicode_AsWideCharString(load_src.ptr(), &length);
665-
#else
666-
temp = reinterpret_steal<object>(PyUnicode_AsEncodedString(
667-
load_src.ptr(), sizeof(wchar_t) == sizeof(short)
668-
? "utf16" : "utf32", nullptr));
669-
670-
if (temp) {
671-
int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), (char **) &buffer, &length);
672-
if (err == -1) { buffer = nullptr; } // TypeError
673-
length = length / (ssize_t) sizeof(wchar_t) - 1; ++buffer; // Skip BOM
674-
}
675-
#endif
676-
if (!buffer) { PyErr_Clear(); return false; }
677-
value = std::wstring(buffer, (size_t) length);
651+
652+
object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
653+
load_src.ptr(), encoding, nullptr));
654+
if (!utfNbytes) { PyErr_Clear(); return false; }
655+
656+
const CharT *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
657+
size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
658+
if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
659+
value = StringType(buffer, length);
678660
success = true;
679661
return true;
680662
}
681663

682-
static handle cast(const std::wstring &src, return_value_policy /* policy */, handle /* parent */) {
683-
return PyUnicode_FromWideChar(src.c_str(), (ssize_t) src.length());
664+
static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) {
665+
const char *buffer = reinterpret_cast<const char *>(src.c_str());
666+
ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
667+
handle s = PyUnicode_Decode(buffer, nbytes, encoding, nullptr);
668+
if (!s) throw error_already_set();
669+
return s;
684670
}
685671

686-
PYBIND11_TYPE_CASTER(std::wstring, _(PYBIND11_STRING_NAME));
672+
PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
687673
protected:
688674
bool success = false;
689675
};
690676

691-
template <> class type_caster<char> : public type_caster<std::string> {
677+
template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>>
678+
: type_caster<std::basic_string<CharT>> {
679+
using StringType = std::basic_string<CharT>;
680+
using StringCaster = type_caster<StringType>;
681+
using StringCaster::success;
682+
using StringCaster::value;
692683
public:
693684
bool load(handle src, bool convert) {
694685
if (src.is_none()) return true;
695-
return type_caster<std::string>::load(src, convert);
686+
return StringCaster::load(src, convert);
696687
}
697688

698-
static handle cast(const char *src, return_value_policy /* policy */, handle /* parent */) {
689+
static handle cast(const CharT *src, return_value_policy policy, handle parent) {
699690
if (src == nullptr) return none().inc_ref();
700-
return PyUnicode_FromString(src);
691+
return StringCaster::cast(StringType(src), policy, parent);
701692
}
702693

703-
static handle cast(char src, return_value_policy /* policy */, handle /* parent */) {
704-
char str[2] = { src, '\0' };
705-
return PyUnicode_DecodeLatin1(str, 1, nullptr);
706-
}
707-
708-
operator char*() { return success ? (char *) value.c_str() : nullptr; }
709-
operator char&() { return value[0]; }
710-
711-
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
712-
};
713-
714-
template <> class type_caster<wchar_t> : public type_caster<std::wstring> {
715-
public:
716-
bool load(handle src, bool convert) {
717-
if (src.is_none()) return true;
718-
return type_caster<std::wstring>::load(src, convert);
719-
}
720-
721-
static handle cast(const wchar_t *src, return_value_policy /* policy */, handle /* parent */) {
722-
if (src == nullptr) return none().inc_ref();
723-
return PyUnicode_FromWideChar(src, (ssize_t) wcslen(src));
724-
}
725-
726-
static handle cast(wchar_t src, return_value_policy /* policy */, handle /* parent */) {
727-
wchar_t wstr[2] = { src, L'\0' };
728-
return PyUnicode_FromWideChar(wstr, 1);
694+
static handle cast(CharT src, return_value_policy policy, handle parent) {
695+
if (std::is_same<char, CharT>::value) {
696+
handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
697+
if (!s) throw error_already_set();
698+
return s;
699+
}
700+
return StringCaster::cast(StringType(1, src), policy, parent);
729701
}
730702

731-
operator wchar_t*() { return success ? (wchar_t *) value.c_str() : nullptr; }
732-
operator wchar_t&() { return value[0]; }
703+
operator CharT*() { return success ? (CharT *) value.c_str() : nullptr; }
704+
operator CharT&() { return value[0]; }
733705

734706
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
735707
};

include/pybind11/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@
111111
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
112112
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
113113
#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
114+
#define PYBIND11_BYTES_SIZE PyBytes_Size
114115
#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
115116
#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
116117
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) PyLong_AsUnsignedLongLong(o)
@@ -129,6 +130,7 @@
129130
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize
130131
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize
131132
#define PYBIND11_BYTES_AS_STRING PyString_AsString
133+
#define PYBIND11_BYTES_SIZE PyString_Size
132134
#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
133135
#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
134136
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) (PyInt_Check(o) ? (unsigned long long) PyLong_AsUnsignedLong(o) : PyLong_AsUnsignedLongLong(o))

tests/test_python_types.cpp

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
# include <fcntl.h>
1818
#endif
1919

20+
#if defined(_MSC_VER)
21+
# pragma warning(push)
22+
# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
23+
#endif
24+
2025
class ExamplePythonTypes {
2126
public:
2227
static ExamplePythonTypes *new_instance() {
@@ -427,6 +432,36 @@ test_initializer python_types([](py::module &m) {
427432
);
428433
});
429434

435+
// Some test characters in utf16 and utf32 encodings. The last one (the 𝐀) contains a null byte
436+
// (Note also that all of these need to be in Unicode 5.2, since that is the last Unicode version
437+
// Python 2.7 supports; sadly this means 💩 and 🎂 can't be used in Python 2.7.)
438+
char32_t a32 = 0x61 /*a*/, z32 = 0x7a /*z*/, ib32 = 0x203d /*‽*/, clef32 = 0x1d11e /*𝄞*/, mathbfA32 = 0x1d400 /*𝐀*/;
439+
char16_t b16 = 0x62 /*b*/, z16 = 0x7a, ib16 = 0x203d, clef16_1 = 0xd834, clef16_2 = 0xdd1e, mathbfA16_1 = 0xd835, mathbfA16_2 = 0xdc00;
440+
std::wstring wstr;
441+
wstr.push_back(0x61); // a
442+
wstr.push_back(0x2e18); // ⸘
443+
if (sizeof(wchar_t) == 2) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16
444+
else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
445+
wstr.push_back(0x7a); // z
446+
447+
m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001d11e \U0001d400"); }); // Say utf8‽ 𝄞 𝐀
448+
m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, clef16_1, clef16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽𝄞𝐀z
449+
m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, clef32, ib32, z32 }); }); // a𝐀𝄞‽z
450+
m.def("good_wchar_string", [=]() { return wstr; }); // a⸘𝐀z
430451
m.def("bad_utf8_string", []() { return std::string("abc\xd0" "def"); });
431-
m.def("good_utf8_string", []() { return std::string(u8"Say what‽ 🎂"); });
452+
m.def("bad_utf16_string", [=]() { return std::u16string({ b16, char16_t(0xd800), z16 }); });
453+
// Under Python 2.7, invalid UTF-32 characters don't appear to trigger UnicodeDecodeError
454+
if (PY_MAJOR_VERSION >= 3)
455+
m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); });
456+
if (PY_MAJOR_VERSION >= 3 || sizeof(wchar_t) == 2)
457+
m.def("bad_wchar_string", [=]() { return std::wstring({ wchar_t(0x61), wchar_t(0xd800) }); });
458+
m.def("u8_Z", []() -> char { return 'Z'; });
459+
m.def("u8_eacute", []() -> char { return '\xe9'; });
460+
m.def("u16_ibang", [=]() -> char16_t { return ib16; });
461+
m.def("u32_mathbfA", [=]() -> char32_t { return mathbfA32; });
462+
m.def("wchar_heart", []() -> wchar_t { return 0x2665; });
432463
});
464+
465+
#if defined(_MSC_VER)
466+
# pragma warning(pop)
467+
#endif

tests/test_python_types.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -415,8 +415,34 @@ def test_implicit_casting():
415415

416416
def test_unicode_conversion():
417417
"""Tests unicode conversion and error reporting."""
418-
from pybind11_tests import (good_utf8_string, bad_utf8_string)
418+
import pybind11_tests
419+
from pybind11_tests import (good_utf8_string, bad_utf8_string,
420+
good_utf16_string, bad_utf16_string,
421+
good_utf32_string, # bad_utf32_string,
422+
good_wchar_string, # bad_wchar_string,
423+
u8_Z, u8_eacute, u16_ibang, u32_mathbfA, wchar_heart)
424+
425+
assert good_utf8_string() == u"Say utf8‽ 𝄞 𝐀"
426+
assert good_utf16_string() == u"b‽𝄞𝐀z"
427+
assert good_utf32_string() == u"a𝐀𝄞‽z"
428+
assert good_wchar_string() == u"a⸘𝐀z"
419429

420-
assert good_utf8_string() == "Say what‽ 🎂"
421430
with pytest.raises(UnicodeDecodeError):
422431
bad_utf8_string()
432+
433+
with pytest.raises(UnicodeDecodeError):
434+
bad_utf16_string()
435+
436+
# These are provided only if they actually fail (they don't for 32-bit character types under Python 2.7)
437+
if hasattr(pybind11_tests, "bad_utf32_string"):
438+
with pytest.raises(UnicodeDecodeError):
439+
pybind11_tests.bad_utf32_string()
440+
if hasattr(pybind11_tests, "bad_wchar_string"):
441+
with pytest.raises(UnicodeDecodeError):
442+
pybind11_tests.bad_wchar_string()
443+
444+
assert u8_Z() == 'Z'
445+
assert u8_eacute() == u'é'
446+
assert u16_ibang() == u'‽'
447+
assert u32_mathbfA() == u'𝐀'
448+
assert wchar_heart() == u'♥'

0 commit comments

Comments
 (0)