diff --git a/.travis.yml b/.travis.yml index fbd3915b515..23c4417f536 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,7 @@ matrix: packages: - libwww-perl - clang-3.7 + - libstdc++-5-dev - libubsan0 before_install: - mkdir bin ; ln -s /usr/bin/clang-3.7 bin/gcc diff --git a/src/java_bytecode/java_bytecode_convert_class.cpp b/src/java_bytecode/java_bytecode_convert_class.cpp index 8d42611bfe0..f222c668627 100644 --- a/src/java_bytecode/java_bytecode_convert_class.cpp +++ b/src/java_bytecode/java_bytecode_convert_class.cpp @@ -27,11 +27,13 @@ class java_bytecode_convert_classt:public messaget symbol_tablet &_symbol_table, message_handlert &_message_handler, bool _disable_runtime_checks, - size_t _max_array_length): + size_t _max_array_length, + bool _string_refinement_enabled): messaget(_message_handler), symbol_table(_symbol_table), disable_runtime_checks(_disable_runtime_checks), - max_array_length(_max_array_length) + max_array_length(_max_array_length), + string_refinement_enabled(_string_refinement_enabled) { } @@ -41,6 +43,9 @@ class java_bytecode_convert_classt:public messaget if(parse_tree.loading_successful) convert(parse_tree.parsed_class); + else if(string_refinement_enabled && + parse_tree.parsed_class.name=="java.lang.String") + add_string_type(); else generate_class_stub(parse_tree.parsed_class.name); } @@ -52,6 +57,7 @@ class java_bytecode_convert_classt:public messaget symbol_tablet &symbol_table; const bool disable_runtime_checks; const size_t max_array_length; + bool string_refinement_enabled; // conversion void convert(const classt &c); @@ -59,6 +65,7 @@ class java_bytecode_convert_classt:public messaget void generate_class_stub(const irep_idt &class_name); void add_array_types(); + void add_string_type(); }; /*******************************************************************\ @@ -322,13 +329,15 @@ bool java_bytecode_convert_class( symbol_tablet &symbol_table, message_handlert &message_handler, bool disable_runtime_checks, - size_t max_array_length) + size_t max_array_length, + bool string_refinement_enabled) { java_bytecode_convert_classt java_bytecode_convert_class( symbol_table, message_handler, disable_runtime_checks, - max_array_length); + max_array_length, + string_refinement_enabled); try { @@ -352,3 +361,63 @@ bool java_bytecode_convert_class( return true; } + +/*******************************************************************\ + +Function: java_bytecode_convert_classt::add_string_type + + Purpose: Implements the java.lang.String type in the case that + we provide an internal implementation. + +\*******************************************************************/ + +void java_bytecode_convert_classt::add_string_type() +{ + class_typet string_type; + string_type.set_tag("java.lang.String"); + string_type.components().resize(3); + string_type.components()[0].set_name("@java.lang.Object"); + string_type.components()[0].type()=symbol_typet("java::java.lang.Object"); + string_type.components()[1].set_name("length"); + string_type.components()[1].set_pretty_name("length"); + string_type.components()[1].type()=java_int_type(); + string_type.components()[2].set_name("data"); + string_type.components()[2].set_pretty_name("data"); + // Use a pointer-to-unbounded-array instead of a pointer-to-char. + // Saves some casting in the string refinement algorithm but may + // be unnecessary. + string_type.components()[2].type()=pointer_typet( + array_typet(java_char_type(), infinity_exprt(java_int_type()))); + string_type.add_base(symbol_typet("java::java.lang.Object")); + + symbolt string_symbol; + string_symbol.name="java::java.lang.String"; + string_symbol.base_name="java.lang.String"; + string_symbol.type=string_type; + string_symbol.is_type=true; + + symbol_table.add(string_symbol); + + // Also add a stub of `String.equals` so that remove-virtual-functions + // generates a check for Object.equals vs. String.equals. + // No need to fill it in, as pass_preprocess will replace the calls again. + symbolt string_equals_symbol; + string_equals_symbol.name= + "java::java.lang.String.equals:(Ljava/lang/Object;)Z"; + string_equals_symbol.base_name="java.lang.String.equals"; + string_equals_symbol.pretty_name="java.lang.String.equals"; + string_equals_symbol.mode=ID_java; + + code_typet string_equals_type; + string_equals_type.return_type()=java_boolean_type(); + code_typet::parametert thisparam; + thisparam.set_this(); + thisparam.type()=pointer_typet(symbol_typet(string_symbol.name)); + code_typet::parametert otherparam; + otherparam.type()=pointer_typet(symbol_typet("java::java.lang.Object")); + string_equals_type.parameters().push_back(thisparam); + string_equals_type.parameters().push_back(otherparam); + string_equals_symbol.type=std::move(string_equals_type); + + symbol_table.add(string_equals_symbol); +} diff --git a/src/java_bytecode/java_bytecode_convert_class.h b/src/java_bytecode/java_bytecode_convert_class.h index ffaaf6e34da..babf5655220 100644 --- a/src/java_bytecode/java_bytecode_convert_class.h +++ b/src/java_bytecode/java_bytecode_convert_class.h @@ -19,6 +19,7 @@ bool java_bytecode_convert_class( symbol_tablet &symbol_table, message_handlert &message_handler, bool disable_runtime_checks, - size_t max_array_length); + size_t max_array_length, + bool string_refinement_enabled); #endif // CPROVER_JAVA_BYTECODE_JAVA_BYTECODE_CONVERT_CLASS_H diff --git a/src/java_bytecode/java_bytecode_language.cpp b/src/java_bytecode/java_bytecode_language.cpp index 1d9d9f57b41..7822f281dac 100644 --- a/src/java_bytecode/java_bytecode_language.cpp +++ b/src/java_bytecode/java_bytecode_language.cpp @@ -40,6 +40,7 @@ void java_bytecode_languaget::get_language_options(const cmdlinet &cmd) { disable_runtime_checks=cmd.isset("disable-runtime-check"); assume_inputs_non_null=cmd.isset("java-assume-inputs-non-null"); + string_refinement_enabled=cmd.isset("string-refine"); if(cmd.isset("java-max-input-array-length")) max_nondet_array_length= std::stoi(cmd.get_value("java-max-input-array-length")); @@ -200,13 +201,14 @@ bool java_bytecode_languaget::typecheck( symbol_table, get_message_handler(), disable_runtime_checks, - max_user_array_length)) + max_user_array_length, + string_refinement_enabled)) return true; } // now typecheck all if(java_bytecode_typecheck( - symbol_table, get_message_handler())) + symbol_table, get_message_handler(), string_refinement_enabled)) return true; return false; diff --git a/src/java_bytecode/java_bytecode_language.h b/src/java_bytecode/java_bytecode_language.h index 5937f6f4fd4..a13c27d27be 100644 --- a/src/java_bytecode/java_bytecode_language.h +++ b/src/java_bytecode/java_bytecode_language.h @@ -82,6 +82,7 @@ class java_bytecode_languaget:public languaget // - array size for newarray size_t max_nondet_array_length; // maximal length for non-det array creation size_t max_user_array_length; // max size for user code created arrays + bool string_refinement_enabled; }; languaget *new_java_bytecode_language(); diff --git a/src/java_bytecode/java_bytecode_typecheck.cpp b/src/java_bytecode/java_bytecode_typecheck.cpp index 9e1c5f2a732..25d197b19b9 100644 --- a/src/java_bytecode/java_bytecode_typecheck.cpp +++ b/src/java_bytecode/java_bytecode_typecheck.cpp @@ -126,10 +126,11 @@ Function: java_bytecode_typecheck bool java_bytecode_typecheck( symbol_tablet &symbol_table, - message_handlert &message_handler) + message_handlert &message_handler, + bool string_refinement_enabled) { java_bytecode_typecheckt java_bytecode_typecheck( - symbol_table, message_handler); + symbol_table, message_handler, string_refinement_enabled); return java_bytecode_typecheck.typecheck_main(); } diff --git a/src/java_bytecode/java_bytecode_typecheck.h b/src/java_bytecode/java_bytecode_typecheck.h index 7b12a4ffbb5..dcbb9192187 100644 --- a/src/java_bytecode/java_bytecode_typecheck.h +++ b/src/java_bytecode/java_bytecode_typecheck.h @@ -21,7 +21,8 @@ Author: Daniel Kroening, kroening@kroening.com bool java_bytecode_typecheck( symbol_tablet &symbol_table, - message_handlert &message_handler); + message_handlert &message_handler, + bool string_refinement_enabled); bool java_bytecode_typecheck( exprt &expr, @@ -33,10 +34,12 @@ class java_bytecode_typecheckt:public typecheckt public: java_bytecode_typecheckt( symbol_tablet &_symbol_table, - message_handlert &_message_handler): + message_handlert &_message_handler, + bool _string_refinement_enabled): typecheckt(_message_handler), symbol_table(_symbol_table), - ns(symbol_table) + ns(symbol_table), + string_refinement_enabled(_string_refinement_enabled) { } @@ -48,6 +51,7 @@ class java_bytecode_typecheckt:public typecheckt protected: symbol_tablet &symbol_table; const namespacet ns; + bool string_refinement_enabled; void typecheck_type_symbol(symbolt &); void typecheck_non_type_symbol(symbolt &); diff --git a/src/java_bytecode/java_bytecode_typecheck_expr.cpp b/src/java_bytecode/java_bytecode_typecheck_expr.cpp index 129a60f4005..3f6dd9ab30c 100644 --- a/src/java_bytecode/java_bytecode_typecheck_expr.cpp +++ b/src/java_bytecode/java_bytecode_typecheck_expr.cpp @@ -8,9 +8,12 @@ Author: Daniel Kroening, kroening@kroening.com #include #include +#include +#include #include "java_bytecode_typecheck.h" #include "java_pointer_casts.h" +#include "java_types.h" /*******************************************************************\ @@ -98,6 +101,15 @@ static void escape_non_alnum(std::string &toescape) ch='_'; } +static array_exprt utf16_to_array(const std::wstring& in) +{ + const auto jchar=java_char_type(); + array_exprt ret(array_typet(jchar, infinity_exprt(java_int_type()))); + for(const auto c : in) + ret.copy_to_operands(from_integer(c, jchar)); + return ret; +} + /*******************************************************************\ Function: java_bytecode_typecheckt::typecheck_expr_java_string_literal @@ -118,7 +130,7 @@ void java_bytecode_typecheckt::typecheck_expr_java_string_literal(exprt &expr) auto findit=string_literal_to_symbol_name.find(value); if(findit!=string_literal_to_symbol_name.end()) { - expr=symbol_exprt(findit->second, pointer_typet(string_type)); + expr=address_of_exprt(symbol_exprt(findit->second, string_type)); return; } @@ -138,7 +150,7 @@ void java_bytecode_typecheckt::typecheck_expr_java_string_literal(exprt &expr) symbolt new_symbol; new_symbol.name=identifier_id; - new_symbol.type=pointer_typet(string_type); + new_symbol.type=string_type; new_symbol.base_name="Literal"; new_symbol.pretty_name=value; new_symbol.mode=ID_java; @@ -146,13 +158,56 @@ void java_bytecode_typecheckt::typecheck_expr_java_string_literal(exprt &expr) new_symbol.is_lvalue=true; new_symbol.is_static_lifetime=true; // These are basically const global data. + if(string_refinement_enabled) + { + // Initialise the string with a constant utf-16 array: + symbolt array_symbol; + array_symbol.name=identifier_str.str()+"_constarray"; + array_symbol.type=array_typet( + java_char_type(), infinity_exprt(java_int_type())); + array_symbol.base_name="Literal_constarray"; + array_symbol.pretty_name=value; + array_symbol.mode=ID_java; + array_symbol.is_type=false; + array_symbol.is_lvalue=true; + // These are basically const global data: + array_symbol.is_static_lifetime=true; + array_symbol.is_state_var=true; + auto literal_array=utf16_to_array( + utf8_to_utf16_little_endian(id2string(value))); + array_symbol.value=literal_array; + + if(symbol_table.add(array_symbol)) + throw "failed to add constarray symbol to symtab"; + + symbol_typet jlo_symbol("java::java.lang.Object"); + const auto& jlo_struct=to_struct_type(ns.follow(jlo_symbol)); + const auto& jls_struct=to_struct_type(ns.follow(string_type)); + + struct_exprt literal_init(new_symbol.type); + struct_exprt jlo_init(jlo_symbol); + jlo_init.copy_to_operands( + constant_exprt("java::java.lang.String", + jlo_struct.components()[0].type())); + jlo_init.copy_to_operands( + from_integer(0, jlo_struct.components()[1].type())); + literal_init.move_to_operands(jlo_init); + literal_init.copy_to_operands( + from_integer(literal_array.operands().size(), + jls_struct.components()[1].type())); + literal_init.copy_to_operands( + address_of_exprt(array_symbol.symbol_expr())); + + new_symbol.value=literal_init; + } + if(symbol_table.add(new_symbol)) { error() << "failed to add string literal symbol to symbol table" << eom; throw 0; } - expr=new_symbol.symbol_expr(); + expr=address_of_exprt(new_symbol.symbol_expr()); } /*******************************************************************\ diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp index 4502605c600..dc1a70e95f7 100644 --- a/src/util/unicode.cpp +++ b/src/util/unicode.cpp @@ -7,6 +7,10 @@ Author: Daniel Kroening, kroening@kroening.com \*******************************************************************/ #include +#include +#include +#include +#include #include "unicode.h" @@ -253,3 +257,42 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide) return argv_narrow; } + +std::wstring utf8_to_utf16_big_endian(const std::string& in) +{ + std::wstring_convert > converter; + return converter.from_bytes(in); +} + +std::wstring utf8_to_utf16_little_endian(const std::string& in) +{ + const std::codecvt_mode mode=std::codecvt_mode::little_endian; + + // default largest value codecvt_utf8_utf16 reads without error is 0x10ffff + // see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16 + const unsigned long maxcode=0x10ffff; + + typedef std::codecvt_utf8_utf16 codecvt_utf8_utf16t; + std::wstring_convert converter; + return converter.from_bytes(in); +} + +std::string utf16_little_endian_to_ascii(const std::wstring& in) +{ + std::ostringstream result; + std::locale loc; + for(const auto c : in) + { + if(c<=255 && isprint(c, loc)) + result << (unsigned char)c; + else + { + result << "\\u" + << std::hex + << std::setw(4) + << std::setfill('0') + << (unsigned int)c; + } + } + return result.str(); +} diff --git a/src/util/unicode.h b/src/util/unicode.h index edad95039f0..1e5040344d0 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -22,6 +22,10 @@ std::wstring widen(const std::string &s); std::string utf32_to_utf8(const std::basic_string &s); std::string utf16_to_utf8(const std::basic_string &s); +std::wstring utf8_to_utf16_big_endian(const std::string&); +std::wstring utf8_to_utf16_little_endian(const std::string&); +std::string utf16_little_endian_to_ascii(const std::wstring& in); + const char **narrow_argv(int argc, const wchar_t **argv_wide); #endif // CPROVER_UTIL_UNICODE_H