From fcfaf09d1dfdbe01093e1f776aa046da8cb7db61 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 26 Aug 2024 16:45:33 -0400 Subject: [PATCH] fix Rust through CMake --- Dockerfile | 2 +- benchmarks/CMakeLists.txt | 35 ++++ .../competitors/serde-benchmark/Cargo.lock | 103 +++++++++++ .../competitors/serde-benchmark/Cargo.toml | 17 ++ .../competitors/serde-benchmark/README.md | 18 ++ .../competitors/serde-benchmark/cbindgen.toml | 12 ++ benchmarks/competitors/serde-benchmark/lib.rs | 170 ++++++++++++++++++ .../serde-benchmark/serde_benchmark.h | 32 ++++ benchmarks/src/CMakeLists.txt | 6 +- .../src/benchmark_serialization_twitter.cpp | 30 +++- benchmarks/src/nlohmann_twitter_data.hpp | 53 +++++- benchmarks/src/twitter_data.hpp | 1 + 12 files changed, 473 insertions(+), 6 deletions(-) create mode 100644 benchmarks/competitors/serde-benchmark/Cargo.lock create mode 100644 benchmarks/competitors/serde-benchmark/Cargo.toml create mode 100644 benchmarks/competitors/serde-benchmark/README.md create mode 100644 benchmarks/competitors/serde-benchmark/cbindgen.toml create mode 100644 benchmarks/competitors/serde-benchmark/lib.rs create mode 100644 benchmarks/competitors/serde-benchmark/serde_benchmark.h diff --git a/Dockerfile b/Dockerfile index ce711c3..69c9705 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ FROM debian:12 LABEL maintainer "LLVM Developers" # Install packages for minimal useful image. RUN apt-get update && \ - apt-get install -y --no-install-recommends build-essential ca-certificates libcurl4-openssl-dev cmake make wget python3 python3-dev sudo curl ninja-build vim git binutils && \ + apt-get install -y --no-install-recommends build-essential ca-certificates rust-all libcurl4-openssl-dev cmake make wget python3 python3-dev sudo curl ninja-build vim git binutils && \ rm -rf /var/lib/apt/lists/* # Copy build results of stage 1 to /usr/local. 
COPY --from=builder /tmp/clang-install/ /usr/local/ diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 96e037e..c60e891 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -7,5 +7,40 @@ CPMAddPackage( PATCHES reflect-cpp.patch GIT_TAG v0.13.0 ) + + + +if(NOT WIN32) +# We want to check whether Rust is available before trying to build a crate. +CPMAddPackage( + NAME corrosion + GITHUB_REPOSITORY corrosion-rs/corrosion + VERSION 0.4.4 + DOWNLOAD_ONLY ON + OPTIONS "Rust_FIND_QUIETLY OFF" +) +include("${corrosion_SOURCE_DIR}/cmake/FindRust.cmake") +endif() + +if(RUST_FOUND) + message(STATUS "Rust found: " ${Rust_VERSION} ) + add_subdirectory("${corrosion_SOURCE_DIR}" "${PROJECT_BINARY_DIR}/_deps/corrosion" EXCLUDE_FROM_ALL) + # Important: we want to build in release mode! + corrosion_import_crate(MANIFEST_PATH "competitors/serde-benchmark/Cargo.toml" NO_LINKER_OVERRIDE PROFILE release) +else() + message(STATUS "Rust/Cargo is unavailable." ) + message(STATUS "We will not benchmark serde-benchmark." ) + if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + message(STATUS "Under macOS, you may be able to install rust with") + message(STATUS "curl https://sh.rustup.rs -sSf | sh") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + message(STATUS "Under Linux, you may be able to install rust with a command such as") + message(STATUS "apt-get install cargo" ) + message(STATUS "or" ) + message(STATUS "curl https://sh.rustup.rs -sSf | sh") + endif() +endif() + + add_subdirectory(simpleparser) add_subdirectory(src) \ No newline at end of file diff --git a/benchmarks/competitors/serde-benchmark/Cargo.lock b/benchmarks/competitors/serde-benchmark/Cargo.lock new file mode 100644 index 0000000..30d45e5 --- /dev/null +++ b/benchmarks/competitors/serde-benchmark/Cargo.lock @@ -0,0 +1,103 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "libc" +version = "0.2.158" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "serde" +version = "1.0.209" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-benchmark" +version = "0.1.0" +dependencies = [ + "libc", + "serde", + "serde_json", +] + +[[package]] +name = "serde_derive" +version = "1.0.209" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.127" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" diff --git a/benchmarks/competitors/serde-benchmark/Cargo.toml b/benchmarks/competitors/serde-benchmark/Cargo.toml new file mode 100644 index 0000000..eb7a731 --- /dev/null +++ b/benchmarks/competitors/serde-benchmark/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "serde-benchmark" +version = "0.1.0" + +[lib] +path = "lib.rs" +crate-type = ["cdylib"] + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +libc = "0.2" +serde_json = "1.0" + +[profile.release] +opt-level = 3 +debug = false +lto = true diff --git a/benchmarks/competitors/serde-benchmark/README.md b/benchmarks/competitors/serde-benchmark/README.md new file mode 100644 index 0000000..21c5057 --- /dev/null +++ b/benchmarks/competitors/serde-benchmark/README.md @@ -0,0 +1,18 @@ +## Rust Serde FFI + +This folder includes FFI bindings for rust/serde. + +### Links + +- https://github.com/eqrion/cbindgen/blob/master/docs.md +- https://gist.github.com/zbraniecki/b251714d77ffebbc73c03447f2b2c69f +- https://michael-f-bryan.github.io/rust-ffi-guide/setting_up.html + +### Building + +- Generating cbindgen output + - Install dependencies with `brew install cbindgen` or `apt-get install cbindgen` or `cargo install cbindgen` or the equivalent: we used `cargo install --version 0.23.0 cbindgen`. 
+ - Go to the directory where this README.md file is located + - Generate with `cbindgen --config cbindgen.toml --crate serde-benchmark --output serde_benchmark.h` +- Building + - Run with `cargo build --release` diff --git a/benchmarks/competitors/serde-benchmark/cbindgen.toml b/benchmarks/competitors/serde-benchmark/cbindgen.toml new file mode 100644 index 0000000..9b333e3 --- /dev/null +++ b/benchmarks/competitors/serde-benchmark/cbindgen.toml @@ -0,0 +1,12 @@ +autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */" +include_version = true +braces = "SameLine" +line_length = 100 +tab_width = 2 +language = "C++" +namespaces = ["serde_benchmark"] +include_guard = "serde_benchmark_ffi_h" + +[parse] +parse_deps = true +include = ["serde_json", "serde"] diff --git a/benchmarks/competitors/serde-benchmark/lib.rs b/benchmarks/competitors/serde-benchmark/lib.rs new file mode 100644 index 0000000..5e6062b --- /dev/null +++ b/benchmarks/competitors/serde-benchmark/lib.rs @@ -0,0 +1,170 @@ +extern crate serde; +extern crate serde_json; +extern crate libc; +use std::slice; + +use libc::{c_char, size_t}; +use serde::{Serialize, Deserialize}; + +/******************************************************/ +/******************************************************/ +/** + * Warning: the C++ code may not generate the same JSON. 
+ */ + /******************************************************/ + /******************************************************/ + +// This has no equivalent in C++: +#[derive(Serialize, Deserialize)] +pub struct Metadata { + result_type: String, + iso_language_code: String, +} + +#[derive(Serialize, Deserialize)] +pub struct User { + id: i64, + id_str: String, + name: String, + screen_name: String, + location: String, + description: String, + // C++ does not have those: + // url: Option, + //protected: bool, + //listed_count: i64, + //created_at: String, + //favourites_count: i64, + //utc_offset: Option, + //time_zone: Option, + //geo_enabled: bool, + verified: bool, + followers_count: i64, + friends_count: i64, + statuses_count: i64, + // C++ does not have those: + //lang: String, + //profile_background_color: String, + //profile_background_image_url: String, + //profile_background_image_url_https: String, + //profile_background_tile: bool, + //profile_image_url: String, + //profile_image_url_https: String, + //profile_banner_url: Option, + //profile_link_color: String, + //profile_sidebar_border_color: String, + //profile_sidebar_fill_color: String, + //profile_text_color: String, + //profile_use_background_image: bool, + //default_profile: bool, + //default_profile_image: bool, + //following: bool, + //follow_request_sent: bool, + //notifications: bool, +} + +#[derive(Serialize, Deserialize)] +pub struct Hashtag { + text: String, + + // C++ has those but D. Lemire does not know what they are, they don't appear in the JSON: + // int64_t indices_start; + // int64_t indices_end; +} + +#[derive(Serialize, Deserialize)] +pub struct Url { + url: String, + expanded_url: String, + display_url: String, + // C++ has those but D. 
Lemire does not know what they are, they don't appear in the JSON: + // int64_t indices_start; + // int64_t indices_end; +} + +#[derive(Serialize, Deserialize)] +pub struct UserMention { + id: i64, + name: String, + screen_name: String, + // Not in the C++ equivalent: + //id_str: String, + //indices: Vec, + // C++ has those but D. Lemire does not know what they are, they don't appear in the JSON: + // int64_t indices_start; + // int64_t indices_end; +} + +#[derive(Serialize, Deserialize)] +pub struct Entities { + hashtags: Vec, + urls: Vec, + user_mentions: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct Status { + created_at: String, + id: i64, + text: String, + user: User, + entities: Entities, + retweet_count: i64, + favorite_count: i64, + favorited: bool, + retweeted: bool, + // None of these are in the C++ equivalent: + /* + metadata: Metadata, + id_str: String, + source: String, + truncated: bool, + in_reply_to_status_id: Option, + in_reply_to_status_id_str: Option, + in_reply_to_user_id: Option, + in_reply_to_user_id_str: Option, + in_reply_to_screen_name: Option, + geo: Option, + coordinates: Option, + place: Option, + contributors: Option, + lang: String, + */ +} + +#[derive(Serialize, Deserialize)] +pub struct TwitterData { + statuses: Vec, +} + +#[no_mangle] +pub unsafe extern "C" fn twitter_from_str(raw_input: *const c_char, raw_input_length: size_t) -> *mut TwitterData { + let input = std::str::from_utf8_unchecked(slice::from_raw_parts(raw_input as *const u8, raw_input_length)); + match serde_json::from_str(&input) { + Ok(result) => Box::into_raw(Box::new(result)), + Err(_) => std::ptr::null_mut(), + } +} + +#[no_mangle] +pub unsafe extern "C" fn str_from_twitter(raw: *mut TwitterData) -> *const c_char { + let twitter_thing = { &*raw }; + let serialized = serde_json::to_string(&twitter_thing).unwrap(); + return std::ffi::CString::new(serialized.as_str()).unwrap().into_raw() +} + + +#[no_mangle] +pub unsafe extern "C" fn free_twitter(raw: *mut 
TwitterData) { + if raw.is_null() { + return; + } + + drop(Box::from_raw(raw)) +} + +/// Releases a string obtained from `str_from_twitter`; a null pointer is ignored, as in `free_twitter`. +#[no_mangle] +pub unsafe extern "C" fn free_string(ptr: *const c_char) { + if !ptr.is_null() { drop(std::ffi::CString::from_raw(ptr as *mut c_char)); } +} diff --git a/benchmarks/competitors/serde-benchmark/serde_benchmark.h b/benchmarks/competitors/serde-benchmark/serde_benchmark.h new file mode 100644 index 0000000..070cae5 --- /dev/null +++ b/benchmarks/competitors/serde-benchmark/serde_benchmark.h @@ -0,0 +1,32 @@ +#ifndef serde_benchmark_ffi_h +#define serde_benchmark_ffi_h + +/* Generated with cbindgen:0.24.3 */ + +/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */ + +#include +#include +#include +#include +#include + +namespace serde_benchmark { + +struct TwitterData; + +extern "C" { + +TwitterData *twitter_from_str(const char *raw_input, size_t raw_input_length); + +const char *str_from_twitter(TwitterData *raw); + +void free_twitter(TwitterData *raw); + +void free_string(const char *ptr); + +} // extern "C" + +} // namespace serde_benchmark + +#endif // serde_benchmark_ffi_h diff --git a/benchmarks/src/CMakeLists.txt b/benchmarks/src/CMakeLists.txt index b6853df..d4adb29 100644 --- a/benchmarks/src/CMakeLists.txt +++ b/benchmarks/src/CMakeLists.txt @@ -14,7 +14,11 @@ endif() add_executable(SerializationBenchmark benchmark_serialization.cpp) add_executable(SerializationTwitterBenchmark benchmark_serialization_twitter.cpp) - + if(TARGET serde-benchmark) + message(STATUS "serde-benchmark target was created. 
Linking benchmarks and serde-benchmark.") + target_link_libraries(SerializationTwitterBenchmark PRIVATE serde-benchmark) + target_compile_definitions(SerializationTwitterBenchmark PRIVATE SIMDJSON_RUST_VERSION="${Rust_VERSION}") +endif() target_link_libraries(SerializationTwitterBenchmark PRIVATE simpleparser simdjson::serialization nlohmann_json::nlohmann_json simdjson::simdjson) target_link_libraries(SerializationBenchmark PRIVATE simpleparser simdjson::serialization nlohmann_json::nlohmann_json simdjson::simdjson) diff --git a/benchmarks/src/benchmark_serialization_twitter.cpp b/benchmarks/src/benchmark_serialization_twitter.cpp index 12c15d5..862b602 100644 --- a/benchmarks/src/benchmark_serialization_twitter.cpp +++ b/benchmarks/src/benchmark_serialization_twitter.cpp @@ -18,6 +18,23 @@ #include "benchmark_reflect_serialization_twitter.hpp" #endif +#ifdef SIMDJSON_RUST_VERSION +#include "../competitors/serde-benchmark/serde_benchmark.h" + +// Prints the size of the serde_json output once, then passes a str_from_twitter/free_string round trip on data to bench(). +void bench_rust(serde_benchmark::TwitterData *data) { + const char * output = serde_benchmark::str_from_twitter(data); + size_t output_volume = strlen(output); + printf("# output volume: %zu bytes\n", output_volume); + serde_benchmark::free_string(output); // the buffer is owned by Rust and was only needed for its length: hand it back so it does not leak + pretty_print(1, output_volume, "bench_rust", + bench([data]() { + const char * serialized = serde_benchmark::str_from_twitter(data); + serde_benchmark::free_string(serialized); + })); +} +#endif + template void bench_fast_simpler(T &data) { simdjson::json_builder::string_builder b; simdjson::json_builder::fast_to_json_string(b, data); @@ -203,15 +220,24 @@ int main() { test_correctness(json_str); - // Benchmarking the serialization + + // Loading up the data into a structure. 
simpleparser::json_parser::JsonParser parser(json_str); auto json_value = parser.parse(); TwitterData my_struct; - bench_nlohmann(my_struct); simpleparser::json_builder::from_json(json_value, my_struct); + + // Benchmarking the serialization + bench_nlohmann(my_struct); bench_fast_simpler(my_struct); bench_fast_with_alloc(my_struct); bench_fast_with_assign(my_struct); +#ifdef SIMDJSON_RUST_VERSION + printf("# WARNING: The Rust benchmark may not be directly comparable since it does not use an equivalent data structure."); + serde_benchmark::TwitterData * td = serde_benchmark::twitter_from_str(json_str.c_str(), json_str.size()); + bench_rust(td); + serde_benchmark::free_twitter(td); +#endif #if SIMDJSON_BENCH_CPP_REFLECT bench_reflect_cpp(my_struct); #endif diff --git a/benchmarks/src/nlohmann_twitter_data.hpp b/benchmarks/src/nlohmann_twitter_data.hpp index 7b5ff07..21c8894 100644 --- a/benchmarks/src/nlohmann_twitter_data.hpp +++ b/benchmarks/src/nlohmann_twitter_data.hpp @@ -56,13 +56,62 @@ void to_json(nlohmann::json &j, const Status &s) { {"retweeted", s.retweeted}}; } + +std::string nlohmann_serialize(const std::vector& v) { + nlohmann::json a = nlohmann::json::array(); + for(const Hashtag & h : v) { + a.push_back(nlohmann::json{{"text", h.text}, + {"indices_start", h.indices_start}, + {"indices_end", h.indices_end}}); + } + return a.dump(); +} +std::string nlohmann_serialize(const std::vector& v) { + nlohmann::json a = nlohmann::json::array(); + for(const Url & u : v) { + a.push_back(nlohmann::json{{"url", u.url}, + {"expanded_url", u.expanded_url}, + {"display_url", u.display_url}, + {"indices_start", u.indices_start}, + {"indices_end", u.indices_end}}); + } + return a.dump(); +} +std::string nlohmann_serialize(const std::vector& v) { + nlohmann::json a = nlohmann::json::array(); + for(const UserMention & um : v) { + a.push_back(nlohmann::json{{"id", um.id}, + {"name", um.name}, + {"screen_name", um.screen_name}, + {"indices_start", um.indices_start}, + 
{"indices_end", um.indices_end}}); + } + return a.dump(); +} + +std::string nlohmann_serialize(const std::vector& v) { + nlohmann::json a = nlohmann::json::array(); + for(const Status & s : v) { + a.push_back(nlohmann::json{{"created_at", s.created_at}, + {"id", s.id}, + {"text", s.text}, + {"user", s.user}, + {"entities", s.entities}, + {"retweet_count", s.retweet_count}, + {"favorite_count", s.favorite_count}, + {"favorited", s.favorited}, + {"retweeted", s.retweeted}}); + } + return a.dump(); +} + + void to_json(nlohmann::json &j, const TwitterData &t) { j = nlohmann::json{{"statuses", t.statuses}}; } std::string nlohmann_serialize(const TwitterData &data) { - nlohmann::json j = data; - return j.dump(); + return nlohmann_serialize(data.statuses); } #endif // NLOHMANN_TWITTER_DATA_HPP diff --git a/benchmarks/src/twitter_data.hpp b/benchmarks/src/twitter_data.hpp index 6add518..70637c7 100644 --- a/benchmarks/src/twitter_data.hpp +++ b/benchmarks/src/twitter_data.hpp @@ -6,6 +6,7 @@ struct User { int64_t id; + std::string id_str; std::string name; std::string screen_name; std::string location;