diff --git a/src/java_bytecode/expr2java.cpp b/src/java_bytecode/expr2java.cpp index d39b56145d2..206718b2c81 100644 --- a/src/java_bytecode/expr2java.cpp +++ b/src/java_bytecode/expr2java.cpp @@ -15,6 +15,7 @@ Author: Daniel Kroening, kroening@cs.cmu.edu #include #include #include +#include #include #include @@ -191,20 +192,7 @@ std::string expr2javat::convert_constant( if(to_integer(src, int_value)) UNREACHABLE; - dest+="(char)'"; - - if(int_value>=' ' && int_value<127) - dest+=static_cast(int_value.to_long()); - else - { - std::string hex=integer2string(int_value, 16); - while(hex.size()<4) hex='0'+hex; - dest+='\\'; - dest+='u'; - dest+=hex; - } - - dest+='\''; + dest += "(char)'" + utf16_little_endian_to_java(int_value.to_long()) + '\''; return dest; } else if(src.type()==java_byte_type()) diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp index 4603d6d8023..6f329e2203c 100644 --- a/src/util/unicode.cpp +++ b/src/util/unicode.cpp @@ -284,39 +284,64 @@ std::wstring utf8_to_utf16_little_endian(const std::string &in) return utf8_to_utf16(in, swap_bytes); } -/// \par parameters: String in UTF-16LE format -/// \return String in US-ASCII format, with \uxxxx escapes for other characters +/// \param ch: UTF-16LE character +/// \param result: stream to receive string in US-ASCII format, with \\uxxxx +/// escapes for other characters +/// \param loc: locale to check for printable characters +static void utf16_little_endian_to_java( + const wchar_t ch, + std::ostringstream &result, + const std::locale &loc) +{ + // \u unicode characters are translated very early by the Java compiler and so + // \u000a or \u000d would become a newline character in a char constant, which + // is illegal. Instead use \n or \r. + if(ch == '\n') + result << "\\n"; + else if(ch == '\r') + result << "\\r"; + // \f, \b and \t do not need to be escaped, but this will improve readability + // of generated tests. + else if(ch == '\f') + result << "\\f"; + else if(ch == '\b') + result << "\\b"; + else if(ch == '\t') + result << "\\t"; + else if(ch <= 255 && isprint(ch, loc)) + { + const auto uch = static_cast(ch); + // ", \ and ' need to be escaped. + if(uch == '"' || uch == '\\' || uch == '\'') + result << '\\'; + result << uch; + } + else + { + // Format ch as a hexadecimal unicode character padded to four digits with + // zeros. + result << "\\u" << std::hex << std::setw(4) << std::setfill('0') + << static_cast(ch); + } +} + +/// \param ch: UTF-16LE character +/// \return String in US-ASCII format, with \\uxxxx escapes for other characters +std::string utf16_little_endian_to_java(const wchar_t ch) +{ + std::ostringstream result; + const std::locale loc; + utf16_little_endian_to_java(ch, result, loc); + return result.str(); +} + +/// \param in: String in UTF-16LE format +/// \return String in US-ASCII format, with \\uxxxx escapes for other characters std::string utf16_little_endian_to_java(const std::wstring &in) { std::ostringstream result; const std::locale loc; for(const auto ch : in) - { - if(ch=='\n') - result << "\\n"; - else if(ch=='\r') - result << "\\r"; - else if(ch=='\f') - result << "\\f"; - else if(ch=='\b') - result << "\\b"; - else if(ch=='\t') - result << "\\t"; - else if(ch<=255 && isprint(ch, loc)) - { - const auto uch=static_cast(ch); - if(uch=='"' || uch=='\\') - result << '\\'; - result << uch; - } - else - { - result << "\\u" - << std::hex - << std::setw(4) - << std::setfill('0') - << static_cast(ch); - } - } + utf16_little_endian_to_java(ch, result, loc); return result.str(); } diff --git a/src/util/unicode.h b/src/util/unicode.h index 2133123f50c..b8329ae1ba0 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -26,6 +26,7 @@ std::string utf32_to_utf8(const std::basic_string &s); std::wstring utf8_to_utf16_big_endian(const std::string &); std::wstring utf8_to_utf16_little_endian(const std::string &); +std::string utf16_little_endian_to_java(const wchar_t ch); std::string utf16_little_endian_to_java(const std::wstring &in); std::vector narrow_argv(int argc, const wchar_t **argv_wide);