Skip to content

Commit d1e691e

Browse files
author
Daniel Kroening
authored
Merge pull request #541 from smowton/string-refine-unicode
utf8 to utf16 conversion and utf16 to ascii
2 parents 9f6ca11 + 927d54a commit d1e691e

File tree

3 files changed

+85
-0
lines changed

3 files changed

+85
-0
lines changed

.travis.yml

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ matrix:
2828
packages:
2929
- libwww-perl
3030
- clang-3.7
31+
- libstdc++-5-dev
3132
- libubsan0
3233
before_install:
3334
- mkdir bin ; ln -s /usr/bin/clang-3.7 bin/gcc

src/util/unicode.cpp

+80
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ Author: Daniel Kroening, [email protected]
77
\*******************************************************************/
88

99
#include <cstring>
10+
#include <locale>
11+
#include <codecvt>
12+
#include <iomanip>
13+
#include <sstream>
1014

1115
#include "unicode.h"
1216

@@ -258,3 +262,79 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)
258262

259263
return argv_narrow;
260264
}
265+
266+
/*******************************************************************\

Function: utf8_to_utf16_big_endian

  Inputs: String in UTF-8 format

 Outputs: String in UTF-16BE format

 Purpose: Decode a UTF-8 byte string into UTF-16 code units, held
          one per wchar_t, using the standard library's
          codecvt_utf8_utf16 facet with its default template
          arguments.
          Note this requires g++-5 libstdc++ / libc++ / MSVC2010+

\*******************************************************************/

std::wstring utf8_to_utf16_big_endian(const std::string& in)
{
  // facet left at its defaults (no explicit maxcode or mode flags)
  using utf8_utf16_facett=std::codecvt_utf8_utf16<wchar_t>;

  std::wstring_convert<utf8_utf16_facett> utf16_conv;
  std::wstring utf16=utf16_conv.from_bytes(in);
  return utf16;
}
283+
284+
/*******************************************************************\

Function: utf8_to_utf16_little_endian

  Inputs: String in UTF-8 format

 Outputs: String in UTF-16LE format

 Purpose: Decode a UTF-8 byte string into UTF-16 code units, held
          one per wchar_t, using the standard library's
          codecvt_utf8_utf16 facet with the little_endian mode flag.
          Note this requires g++-5 libstdc++ / libc++ / MSVC2010+

\*******************************************************************/

std::wstring utf8_to_utf16_little_endian(const std::string& in)
{
  // 0x10ffff is the default largest value codecvt_utf8_utf16 reads
  // without error; it is spelled out only because the mode argument
  // comes after it in the template parameter list
  // see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
  constexpr unsigned long largest_code_point=0x10ffff;
  constexpr std::codecvt_mode byte_order=std::codecvt_mode::little_endian;

  using utf8_utf16_le_facett=
    std::codecvt_utf8_utf16<wchar_t, largest_code_point, byte_order>;

  std::wstring_convert<utf8_utf16_le_facett> utf16_conv;
  const char *const first=in.data();
  return utf16_conv.from_bytes(first, first+in.size());
}
308+
309+
/*******************************************************************\

Function: utf16_little_endian_to_ascii

  Inputs: String in UTF-16LE format

 Outputs: String in US-ASCII format, with \uxxxx escapes for other
          characters

 Purpose: Render a UTF-16 string using 7-bit ASCII only. Code units
          that are printable ASCII are copied through unchanged;
          every other code unit is written as a backslash-u escape
          with at least four lowercase hex digits. Classification
          uses the classic "C" locale, so the result does not depend
          on the global locale. A backslash in the input is printable
          and is therefore copied through unescaped.

\*******************************************************************/

std::string utf16_little_endian_to_ascii(const std::wstring& in)
{
  std::ostringstream result;

  // hex and the fill character are sticky stream state: set them once
  // (the field width is not sticky and is set per escape below)
  result << std::hex << std::setfill('0');

  // classify with the "C" locale rather than the global one, so that
  // a locale which considers 0x80-0xff printable cannot cause
  // non-ASCII bytes to be emitted
  const std::locale &ascii_locale=std::locale::classic();

  for(const auto c : in)
  {
    // go through unsigned so that a negative value of a signed
    // wchar_t can never pass the range check
    const unsigned int code_unit=static_cast<unsigned int>(c);

    if(code_unit<=0x7f &&
       std::isprint(static_cast<char>(code_unit), ascii_locale))
    {
      result << static_cast<char>(code_unit);
    }
    else
    {
      result << "\\u" << std::setw(4) << code_unit;
    }
  }

  return result.str();
}

src/util/unicode.h

+4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ std::wstring widen(const std::string &s);
2222
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
2323
std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);
2424

25+
std::wstring utf8_to_utf16_big_endian(const std::string &);
26+
std::wstring utf8_to_utf16_little_endian(const std::string &);
27+
std::string utf16_little_endian_to_ascii(const std::wstring &in);
28+
2529
const char **narrow_argv(int argc, const wchar_t **argv_wide);
2630

2731
#endif // CPROVER_UTIL_UNICODE_H

0 commit comments

Comments
 (0)