Skip to content

Introduce String type for java when string-refinement is activated #491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ matrix:
packages:
- libwww-perl
- clang-3.7
- libstdc++-5-dev
- libubsan0
before_install:
- mkdir bin ; ln -s /usr/bin/clang-3.7 bin/gcc
Expand Down
77 changes: 73 additions & 4 deletions src/java_bytecode/java_bytecode_convert_class.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ class java_bytecode_convert_classt:public messaget
symbol_tablet &_symbol_table,
message_handlert &_message_handler,
bool _disable_runtime_checks,
size_t _max_array_length):
size_t _max_array_length,
bool _string_refinement_enabled):
messaget(_message_handler),
symbol_table(_symbol_table),
disable_runtime_checks(_disable_runtime_checks),
max_array_length(_max_array_length)
max_array_length(_max_array_length),
string_refinement_enabled(_string_refinement_enabled)
{
}

Expand All @@ -41,6 +43,9 @@ class java_bytecode_convert_classt:public messaget

if(parse_tree.loading_successful)
convert(parse_tree.parsed_class);
else if(string_refinement_enabled &&
parse_tree.parsed_class.name=="java.lang.String")
add_string_type();
else
generate_class_stub(parse_tree.parsed_class.name);
}
Expand All @@ -52,13 +57,15 @@ class java_bytecode_convert_classt:public messaget
symbol_tablet &symbol_table;
const bool disable_runtime_checks;
const size_t max_array_length;
bool string_refinement_enabled;

// conversion
void convert(const classt &c);
void convert(symbolt &class_symbol, const fieldt &f);

void generate_class_stub(const irep_idt &class_name);
void add_array_types();
void add_string_type();
};

/*******************************************************************\
Expand Down Expand Up @@ -322,13 +329,15 @@ bool java_bytecode_convert_class(
symbol_tablet &symbol_table,
message_handlert &message_handler,
bool disable_runtime_checks,
size_t max_array_length)
size_t max_array_length,
bool string_refinement_enabled)
{
java_bytecode_convert_classt java_bytecode_convert_class(
symbol_table,
message_handler,
disable_runtime_checks,
max_array_length);
max_array_length,
string_refinement_enabled);

try
{
Expand All @@ -352,3 +361,63 @@ bool java_bytecode_convert_class(

return true;
}

/*******************************************************************\

Function: java_bytecode_convert_classt::add_string_type

Purpose: Implements the java.lang.String type in the case that
we provide an internal implementation.

\*******************************************************************/

void java_bytecode_convert_classt::add_string_type()
{
class_typet string_type;
string_type.set_tag("java.lang.String");
string_type.components().resize(3);
string_type.components()[0].set_name("@java.lang.Object");
string_type.components()[0].type()=symbol_typet("java::java.lang.Object");
string_type.components()[1].set_name("length");
string_type.components()[1].set_pretty_name("length");
string_type.components()[1].type()=java_int_type();
string_type.components()[2].set_name("data");
string_type.components()[2].set_pretty_name("data");
// Use a pointer-to-unbounded-array instead of a pointer-to-char.
// Saves some casting in the string refinement algorithm but may
// be unnecessary.
string_type.components()[2].type()=pointer_typet(
array_typet(java_char_type(), infinity_exprt(java_int_type())));
string_type.add_base(symbol_typet("java::java.lang.Object"));

symbolt string_symbol;
string_symbol.name="java::java.lang.String";
string_symbol.base_name="java.lang.String";
string_symbol.type=string_type;
string_symbol.is_type=true;

symbol_table.add(string_symbol);

// Also add a stub of `String.equals` so that remove-virtual-functions
// generates a check for Object.equals vs. String.equals.
// No need to fill it in, as pass_preprocess will replace the calls again.
symbolt string_equals_symbol;
string_equals_symbol.name=
"java::java.lang.String.equals:(Ljava/lang/Object;)Z";
string_equals_symbol.base_name="java.lang.String.equals";
string_equals_symbol.pretty_name="java.lang.String.equals";
string_equals_symbol.mode=ID_java;

code_typet string_equals_type;
string_equals_type.return_type()=java_boolean_type();
code_typet::parametert thisparam;
thisparam.set_this();
thisparam.type()=pointer_typet(symbol_typet(string_symbol.name));
code_typet::parametert otherparam;
otherparam.type()=pointer_typet(symbol_typet("java::java.lang.Object"));
string_equals_type.parameters().push_back(thisparam);
string_equals_type.parameters().push_back(otherparam);
string_equals_symbol.type=std::move(string_equals_type);

symbol_table.add(string_equals_symbol);
}
3 changes: 2 additions & 1 deletion src/java_bytecode/java_bytecode_convert_class.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ bool java_bytecode_convert_class(
symbol_tablet &symbol_table,
message_handlert &message_handler,
bool disable_runtime_checks,
size_t max_array_length);
size_t max_array_length,
bool string_refinement_enabled);

#endif // CPROVER_JAVA_BYTECODE_JAVA_BYTECODE_CONVERT_CLASS_H
6 changes: 4 additions & 2 deletions src/java_bytecode/java_bytecode_language.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ void java_bytecode_languaget::get_language_options(const cmdlinet &cmd)
{
disable_runtime_checks=cmd.isset("disable-runtime-check");
assume_inputs_non_null=cmd.isset("java-assume-inputs-non-null");
string_refinement_enabled=cmd.isset("string-refine");
if(cmd.isset("java-max-input-array-length"))
max_nondet_array_length=
std::stoi(cmd.get_value("java-max-input-array-length"));
Expand Down Expand Up @@ -200,13 +201,14 @@ bool java_bytecode_languaget::typecheck(
symbol_table,
get_message_handler(),
disable_runtime_checks,
max_user_array_length))
max_user_array_length,
string_refinement_enabled))
return true;
}

// now typecheck all
if(java_bytecode_typecheck(
symbol_table, get_message_handler()))
symbol_table, get_message_handler(), string_refinement_enabled))
return true;

return false;
Expand Down
1 change: 1 addition & 0 deletions src/java_bytecode/java_bytecode_language.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class java_bytecode_languaget:public languaget
// - array size for newarray
size_t max_nondet_array_length; // maximal length for non-det array creation
size_t max_user_array_length; // max size for user code created arrays
bool string_refinement_enabled;
};

languaget *new_java_bytecode_language();
Expand Down
5 changes: 3 additions & 2 deletions src/java_bytecode/java_bytecode_typecheck.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,11 @@ Function: java_bytecode_typecheck

bool java_bytecode_typecheck(
symbol_tablet &symbol_table,
message_handlert &message_handler)
message_handlert &message_handler,
bool string_refinement_enabled)
{
java_bytecode_typecheckt java_bytecode_typecheck(
symbol_table, message_handler);
symbol_table, message_handler, string_refinement_enabled);
return java_bytecode_typecheck.typecheck_main();
}

Expand Down
10 changes: 7 additions & 3 deletions src/java_bytecode/java_bytecode_typecheck.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ Author: Daniel Kroening, [email protected]

bool java_bytecode_typecheck(
symbol_tablet &symbol_table,
message_handlert &message_handler);
message_handlert &message_handler,
bool string_refinement_enabled);

bool java_bytecode_typecheck(
exprt &expr,
Expand All @@ -33,10 +34,12 @@ class java_bytecode_typecheckt:public typecheckt
public:
java_bytecode_typecheckt(
symbol_tablet &_symbol_table,
message_handlert &_message_handler):
message_handlert &_message_handler,
bool _string_refinement_enabled):
typecheckt(_message_handler),
symbol_table(_symbol_table),
ns(symbol_table)
ns(symbol_table),
string_refinement_enabled(_string_refinement_enabled)
{
}

Expand All @@ -48,6 +51,7 @@ class java_bytecode_typecheckt:public typecheckt
protected:
symbol_tablet &symbol_table;
const namespacet ns;
bool string_refinement_enabled;

void typecheck_type_symbol(symbolt &);
void typecheck_non_type_symbol(symbolt &);
Expand Down
61 changes: 58 additions & 3 deletions src/java_bytecode/java_bytecode_typecheck_expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@ Author: Daniel Kroening, [email protected]

#include <util/std_expr.h>
#include <util/prefix.h>
#include <util/arith_tools.h>
#include <util/unicode.h>

#include "java_bytecode_typecheck.h"
#include "java_pointer_casts.h"
#include "java_types.h"

/*******************************************************************\

Expand Down Expand Up @@ -98,6 +101,15 @@ static void escape_non_alnum(std::string &toescape)
ch='_';
}

static array_exprt utf16_to_array(const std::wstring& in)
{
const auto jchar=java_char_type();
array_exprt ret(array_typet(jchar, infinity_exprt(java_int_type())));
for(const auto c : in)
ret.copy_to_operands(from_integer(c, jchar));
return ret;
}

/*******************************************************************\

Function: java_bytecode_typecheckt::typecheck_expr_java_string_literal
Expand All @@ -118,7 +130,7 @@ void java_bytecode_typecheckt::typecheck_expr_java_string_literal(exprt &expr)
auto findit=string_literal_to_symbol_name.find(value);
if(findit!=string_literal_to_symbol_name.end())
{
expr=symbol_exprt(findit->second, pointer_typet(string_type));
expr=address_of_exprt(symbol_exprt(findit->second, string_type));
return;
}

Expand All @@ -138,21 +150,64 @@ void java_bytecode_typecheckt::typecheck_expr_java_string_literal(exprt &expr)

symbolt new_symbol;
new_symbol.name=identifier_id;
new_symbol.type=pointer_typet(string_type);
new_symbol.type=string_type;
new_symbol.base_name="Literal";
new_symbol.pretty_name=value;
new_symbol.mode=ID_java;
new_symbol.is_type=false;
new_symbol.is_lvalue=true;
new_symbol.is_static_lifetime=true; // These are basically const global data.

if(string_refinement_enabled)
{
// Initialise the string with a constant utf-16 array:
symbolt array_symbol;
array_symbol.name=identifier_str.str()+"_constarray";
array_symbol.type=array_typet(
java_char_type(), infinity_exprt(java_int_type()));
array_symbol.base_name="Literal_constarray";
array_symbol.pretty_name=value;
array_symbol.mode=ID_java;
array_symbol.is_type=false;
array_symbol.is_lvalue=true;
// These are basically const global data:
array_symbol.is_static_lifetime=true;
array_symbol.is_state_var=true;
auto literal_array=utf16_to_array(
utf8_to_utf16_little_endian(id2string(value)));
array_symbol.value=literal_array;

if(symbol_table.add(array_symbol))
throw "failed to add constarray symbol to symtab";

symbol_typet jlo_symbol("java::java.lang.Object");
const auto& jlo_struct=to_struct_type(ns.follow(jlo_symbol));
const auto& jls_struct=to_struct_type(ns.follow(string_type));

struct_exprt literal_init(new_symbol.type);
struct_exprt jlo_init(jlo_symbol);
jlo_init.copy_to_operands(
constant_exprt("java::java.lang.String",
jlo_struct.components()[0].type()));
jlo_init.copy_to_operands(
from_integer(0, jlo_struct.components()[1].type()));
literal_init.move_to_operands(jlo_init);
literal_init.copy_to_operands(
from_integer(literal_array.operands().size(),
jls_struct.components()[1].type()));
literal_init.copy_to_operands(
address_of_exprt(array_symbol.symbol_expr()));

new_symbol.value=literal_init;
}

if(symbol_table.add(new_symbol))
{
error() << "failed to add string literal symbol to symbol table" << eom;
throw 0;
}

expr=new_symbol.symbol_expr();
expr=address_of_exprt(new_symbol.symbol_expr());
}

/*******************************************************************\
Expand Down
43 changes: 43 additions & 0 deletions src/util/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ Author: Daniel Kroening, [email protected]
\*******************************************************************/

#include <cstring>
#include <locale>
#include <codecvt>
#include <sstream>
#include <iomanip>

#include "unicode.h"

Expand Down Expand Up @@ -253,3 +257,42 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)

return argv_narrow;
}

std::wstring utf8_to_utf16_big_endian(const std::string& in)
{
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t> > converter;
return converter.from_bytes(in);
}

std::wstring utf8_to_utf16_little_endian(const std::string& in)
{
const std::codecvt_mode mode=std::codecvt_mode::little_endian;

// default largest value codecvt_utf8_utf16 reads without error is 0x10ffff
// see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
const unsigned long maxcode=0x10ffff;

typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
std::wstring_convert<codecvt_utf8_utf16t> converter;
return converter.from_bytes(in);
}

std::string utf16_little_endian_to_ascii(const std::wstring& in)
{
std::ostringstream result;
std::locale loc;
for(const auto c : in)
{
if(c<=255 && isprint(c, loc))
result << (unsigned char)c;
else
{
result << "\\u"
<< std::hex
<< std::setw(4)
<< std::setfill('0')
<< (unsigned int)c;
}
}
return result.str();
}
4 changes: 4 additions & 0 deletions src/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ std::wstring widen(const std::string &s);
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);

std::wstring utf8_to_utf16_big_endian(const std::string&);
std::wstring utf8_to_utf16_little_endian(const std::string&);
std::string utf16_little_endian_to_ascii(const std::wstring& in);

const char **narrow_argv(int argc, const wchar_t **argv_wide);

#endif // CPROVER_UTIL_UNICODE_H