diff --git a/include/ada.h b/include/ada.h
index bb64c3683..bdd9a6390 100644
--- a/include/ada.h
+++ b/include/ada.h
@@ -17,6 +17,7 @@
 #include "ada/state.h"
 #include "ada/unicode.h"
 #include "ada/url-inl.h"
+#include "ada/url_components.h"
 
 // Public API
 #include "ada/ada_version.h"
diff --git a/include/ada/helpers.h b/include/ada/helpers.h
index ac28766f6..046b35a49 100644
--- a/include/ada/helpers.h
+++ b/include/ada/helpers.h
@@ -18,6 +18,12 @@
  */
 namespace ada::helpers {
 
+  /**
+   * @private
+   */
+  template <typename out_iter>
+  void encode_json(std::string_view view, out_iter out);
+
   /**
    * This function is used to prune a fragment from a url, and returning the removed string if input has fragment.
    *
diff --git a/include/ada/url-inl.h b/include/ada/url-inl.h
index 6fe4170bf..63a81c875 100644
--- a/include/ada/url-inl.h
+++ b/include/ada/url-inl.h
@@ -7,6 +7,12 @@
 
 #include "ada/checkers.h"
 #include "ada/url.h"
+#include "ada/url_components.h"
+#include <algorithm>
+#include <cstdint>
+#if ADA_REGULAR_VISUAL_STUDIO
+#include <intrin.h>
+#endif // ADA_REGULAR_VISUAL_STUDIO
 
 namespace ada {
   [[nodiscard]] ada_really_inline bool url::includes_credentials() const noexcept {
@@ -73,6 +79,107 @@ namespace ada {
   inline std::ostream& operator<<(std::ostream& out, const ada::url& u) {
     return out << u.to_string();
   }
+
+  // Number of leading zero bits in input_num; returns 32 when input_num is 0.
+  inline int leading_zeroes(uint32_t input_num) {
+#if ADA_REGULAR_VISUAL_STUDIO
+    unsigned long leading_zero(0);
+    unsigned long in(input_num);
+    return _BitScanReverse(&leading_zero, in) ? int(31 - leading_zero) : 32;
+#else
+    return input_num == 0 ? 32 : __builtin_clz(input_num); // __builtin_clz(0) is undefined
+#endif // ADA_REGULAR_VISUAL_STUDIO
+  }
+
+  // integer logarithm of x: floor(log2(x)), with int_log2(0) == 0
+  inline int int_log2(uint32_t x) {
+    return 31 - leading_zeroes(x | 1);
+  }
+
+  // faster than std::to_string(x).size().
+  inline int fast_digit_count(uint32_t x) {
+    // Compiles to very few instructions. Note that the
+    // table is static and thus effectively a constant.
+    // We leave it inside the function because it is meaningless
+    // outside of it (this comes at no performance cost).
+    const static uint64_t table[] = {
+      4294967296,  8589934582,  8589934582,  8589934582,  12884901788,
+      12884901788, 12884901788, 17179868184, 17179868184, 17179868184,
+      21474826480, 21474826480, 21474826480, 21474826480, 25769703776,
+      25769703776, 25769703776, 30063771072, 30063771072, 30063771072,
+      34349738368, 34349738368, 34349738368, 34349738368, 38554705664,
+      38554705664, 38554705664, 41949672960, 41949672960, 41949672960,
+      42949672960, 42949672960};
+    return int((x + table[int_log2(x)]) >> 32);
+  }
+
+  [[nodiscard]] ada_really_inline ada::url_components url::get_components() noexcept {
+    url_components out{};
+
+    // protocol_end is the index of the ':' that terminates the scheme, e.g. 5 for "https:".
+    out.protocol_end = uint32_t(get_scheme().size());
+
+    // running_index always designates the first character of the next component.
+    size_t running_index = out.protocol_end + 1;
+
+    if (host.has_value()) {
+      // Skip the ':' and the "//": the authority starts 3 characters after protocol_end.
+      out.host_start = out.protocol_end + 3;
+
+      if (includes_credentials()) {
+        out.username_end = uint32_t(out.host_start + username.size() - 1);
+
+        out.host_start += uint32_t(username.size() + 1);
+
+        if (!password.empty()) {
+          out.host_start += uint32_t(password.size() + 1);
+        }
+      } else {
+        out.username_end = out.host_start;
+      }
+
+      out.host_end = uint32_t(out.host_start + host.value().size()) - 1;
+      running_index = out.host_end + 1;
+    } else {
+      // There is no host: set host_start and host_end to the same index.
+      out.host_start = out.protocol_end + 1;
+      out.host_end = out.protocol_end + 1;
+
+      // A path that starts with "//" necessarily contains two '/' characters, so
+      // checking its first two characters is enough to detect the "/." prefix case.
+      if (!has_opaque_path && path.length() >= 2 && path[0] == '/' && path[1] == '/') {
+        // If url’s host is null, url does not have an opaque path, url’s path’s size is greater than 1,
+        // and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
+        running_index = out.protocol_end + 3;
+      } else {
+        running_index = out.protocol_end + 1;
+      }
+    }
+
+    if (port.has_value()) {
+      out.port = *port;
+      running_index += fast_digit_count(*port) + 1; // +1 for the ':' that precedes the port
+    }
+
+    out.pathname_start = uint32_t(running_index);
+
+    if (!path.empty()) {
+      running_index += path.size();
+    }
+
+    if (query.has_value()) {
+      out.search_start = uint32_t(running_index);
+      const size_t search_size = get_search().size();
+      running_index += (search_size == 0) ? 1 : search_size; // an empty search still advances by one (presumably the lone '?')
+    }
+
+    if (fragment.has_value()) {
+      out.hash_start = uint32_t(running_index);
+    }
+
+    return out;
+  }
+
 } // namespace ada
 
 #endif // ADA_URL_H
diff --git a/include/ada/url.h b/include/ada/url.h
index e434599ce..7fb80f096 100644
--- a/include/ada/url.h
+++ b/include/ada/url.h
@@ -11,6 +11,7 @@
 #include "ada/serializers.h"
 #include "ada/unicode.h"
 #include "ada/log.h"
+#include "ada/url_components.h"
 
 #include
 #include
@@ -349,6 +350,25 @@ namespace ada {
      */
     std::string to_string() const;
 
+    /**
+     * Useful for implementing efficient serialization for the URL.
+     *
+     * https://user:pass@example.com:1234/foo/bar?baz#quux
+     *      |     |      |         | ^^^^|       |   |
+     *      |     |      |         | |   |       |   `----- hash_start
+     *      |     |      |         | |   |       `--------- search_start
+     *      |     |      |         | |   `----------------- pathname_start
+     *      |     |      |         | `--------------------- port
+     *      |     |      |         `----------------------- host_end
+     *      |     |      `--------------------------------- host_start
+     *      |     `---------------------------------------- username_end
+     *      `---------------------------------------------- protocol_end
+     *
+     * Inspired by servo/rust-url
+     * @see https://github.com/servo/rust-url/blob/b65a45515c10713f6d212e6726719a020203cc98/url/src/quirks.rs#L31
+     */
+    [[nodiscard]] ada_really_inline ada::url_components get_components() noexcept;
+
   private:
 
     /**
diff --git a/include/ada/url_components.h b/include/ada/url_components.h
new file mode 100644
index 000000000..38616678b
--- /dev/null
+++ b/include/ada/url_components.h
@@ -0,0 +1,64 @@
+/**
+ * @file url_components.h
+ * @brief Declaration for the URL Components
+ */
+#ifndef ADA_URL_COMPONENTS_H
+#define ADA_URL_COMPONENTS_H
+
+#include "ada/common_defs.h"
+
+#include <cstdint>
+#include <string>
+
+namespace ada {
+
+  /**
+   * We design the url_components struct so that it is as small
+   * and simple as possible. This version uses 32 bytes.
+   *
+   * This struct is used to extract components from a single 'href'.
+   */
+  struct url_components {
+    constexpr static uint32_t omitted = uint32_t(-1);
+
+    url_components() = default;
+    url_components(const url_components &u) = default;
+    url_components(url_components &&u) noexcept = default;
+    url_components &operator=(url_components &&u) noexcept = default;
+    url_components &operator=(const url_components &u) = default;
+    ~url_components() = default;
+
+    /*
+     * By using 32-bit integers, we implicitly assume that the URL string
+     * cannot exceed 4 GB.
+     *
+     * https://user:pass@example.com:1234/foo/bar?baz#quux
+     *      |     |      |         | ^^^^|       |   |
+     *      |     |      |         | |   |       |   `----- hash_start
+     *      |     |      |         | |   |       `--------- search_start
+     *      |     |      |         | |   `----------------- pathname_start
+     *      |     |      |         | `--------------------- port
+     *      |     |      |         `----------------------- host_end
+     *      |     |      `--------------------------------- host_start
+     *      |     `---------------------------------------- username_end
+     *      `---------------------------------------------- protocol_end
+     */
+    uint32_t protocol_end{0};
+    uint32_t username_end{0};
+    uint32_t host_start{0};
+    uint32_t host_end{0};
+    uint32_t port{omitted};
+    uint32_t pathname_start{0};
+    uint32_t search_start{omitted};
+    uint32_t hash_start{omitted};
+
+    /**
+     * @private
+     * Converts a url_components to JSON stringified version.
+     */
+    std::string to_string() const;
+
+  }; // struct url_components
+
+} // namespace ada
+#endif
diff --git a/src/ada.cpp b/src/ada.cpp
index 41812bb8c..968e8893d 100644
--- a/src/ada.cpp
+++ b/src/ada.cpp
@@ -8,3 +8,4 @@
 #include "url-getters.cpp"
 #include "url-setters.cpp"
 #include "parser.cpp"
+#include "url_components.cpp"
diff --git a/src/parser.cpp b/src/parser.cpp
index efe2462be..160a83703 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -6,7 +6,7 @@
 #include "ada/log.h"
 
 #include
-
+#include <limits>
 #include
 #include
 
@@ -22,6 +22,10 @@ namespace ada::parser {
     ada::state state = ada::state::SCHEME_START;
     ada::url url = ada::url();
 
+    // We refuse to parse URL strings that exceed 4GB. Such strings are almost
+    // surely the result of a bug or are otherwise a security concern.
+    if(user_input.size() >= std::string_view::size_type(std::numeric_limits<uint32_t>::max())) { url.is_valid = false; }
+
     // If we are provided with an invalid base, or the optional_url was invalid,
     // we must return.
if(base_url != nullptr) { url.is_valid &= base_url->is_valid; } diff --git a/src/url.cpp b/src/url.cpp index fed5b366f..e4c5860a8 100644 --- a/src/url.cpp +++ b/src/url.cpp @@ -522,4 +522,5 @@ namespace ada { if(!host.has_value()) { return false; } return checkers::verify_dns_length(host.value()); } + } // namespace ada diff --git a/src/url_components.cpp b/src/url_components.cpp new file mode 100644 index 000000000..9ead09857 --- /dev/null +++ b/src/url_components.cpp @@ -0,0 +1,51 @@ +#include "ada.h" +#include "ada/helpers.h" + +#include +#include +#include + +namespace ada { + + std::string url_components::to_string() const { + std::string answer; + auto back = std::back_insert_iterator(answer); + answer.append("{\n"); + + answer.append("\t\"protocol_end\":\""); + helpers::encode_json(std::to_string(protocol_end), back); + answer.append("\",\n"); + + answer.append("\t\"username_end\":\""); + helpers::encode_json(std::to_string(username_end), back); + answer.append("\",\n"); + + answer.append("\t\"host_start\":\""); + helpers::encode_json(std::to_string(host_start), back); + answer.append("\",\n"); + + answer.append("\t\"host_end\":\""); + helpers::encode_json(std::to_string(host_end), back); + answer.append("\",\n"); + + answer.append("\t\"port\":\""); + helpers::encode_json(std::to_string(port), back); + answer.append("\",\n"); + + answer.append("\t\"pathname_start\":\""); + helpers::encode_json(std::to_string(pathname_start), back); + answer.append("\",\n"); + + answer.append("\t\"search_start\":\""); + helpers::encode_json(std::to_string(search_start), back); + answer.append("\",\n"); + + answer.append("\t\"hash_start\":\""); + helpers::encode_json(std::to_string(hash_start), back); + answer.append("\",\n"); + + answer.append("\n}"); + return answer; + } + +} // namespace ada diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1ae55c764..94dd44ad4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,15 +9,16 @@ 
include(${PROJECT_SOURCE_DIR}/cmake/add-cpp-test.cmake) link_libraries(ada) add_cpp_test(wpt_tests) +add_cpp_test(url_components) target_link_libraries(wpt_tests PRIVATE simdjson) +target_link_libraries(url_components PRIVATE simdjson) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") -if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9) -target_link_libraries(wpt_tests PUBLIC stdc++fs) + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9) + target_link_libraries(wpt_tests PUBLIC stdc++fs) + target_link_libraries(url_components PUBLIC stdc++fs) + endif() endif() -endif() - add_cpp_test(basic_fuzzer) add_cpp_test(from_file_tests) add_cpp_test(basic_tests) - diff --git a/tests/url_components.cpp b/tests/url_components.cpp new file mode 100644 index 000000000..bb75db398 --- /dev/null +++ b/tests/url_components.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ada.h" +#include "ada/character_sets-inl.h" +#include "ada/parser.h" +#include "ada/url_components.h" + +// We think that these examples have bad domains. +std::set bad_domains = {"http://./", "http://../", "http://foo.09.."}; + +// This function copies your input onto a memory buffer that +// has just the necessary size. This will entice tools to detect +// an out-of-bound access. +ada::result ada_parse(std::string_view view,const ada::url* base = nullptr) { + std::cout << "about to parse '" << view << "' [" << view.size() << " bytes]" << std::endl; + std::unique_ptr buffer(new char[view.size()]); + memcpy(buffer.get(), view.data(), view.size()); + return ada::parse(std::string_view(buffer.get(), view.size()), base); +} + +#include "simdjson.h" + +using namespace simdjson; + +#ifndef WPT_DATA_DIR +#define WPT_DATA_DIR "wpt/" +#endif +const char *URLTESTDATA_JSON = WPT_DATA_DIR "urltestdata.json"; + +#define TEST_START() \ + do { \ + std::cout << "> Running " << __func__ << " ..." 
<< std::endl; \ + } while (0); +#define RUN_TEST(ACTUAL) \ + do { \ + if (!(ACTUAL)) { \ + return false; \ + } \ + } while (0); +#define TEST_FAIL(MESSAGE) \ + do { \ + std::cerr << "FAIL: " << (MESSAGE) << std::endl; \ + return false; \ + } while (0); +#define TEST_SUCCEED() \ + do { \ + return true; \ + } while (0); +#define TEST_ASSERT(LHS, RHS, MESSAGE) \ + do { \ + if (LHS != RHS) { \ + std::cerr << "Mismatch: '" << LHS << "' - '" << RHS << "'" << std::endl; \ + TEST_FAIL(MESSAGE); \ + } \ + } while (0); \ + +bool file_exists(const char *filename) { + namespace fs = std::filesystem; + std::filesystem::path f{filename}; + if (std::filesystem::exists(filename)) { + std::cout << " file found: " << filename << std::endl; + return true; + } else { + std::cout << " file missing: " << filename << std::endl; + return false; + } +} + +bool urltestdata_encoding(const char* source) { + + TEST_START() + ondemand::parser parser; + size_t counter{}; + + RUN_TEST(file_exists(source)); + padded_string json = padded_string::load(source); + std::cout << " loaded " << source << " (" << json.size() << " kB)" + << std::endl; + ondemand::document doc = parser.iterate(json); + for (auto element : doc.get_array()) { + if (element.type() == ondemand::json_type::string) { + std::string_view comment = element.get_string().value(); + std::cout << comment << std::endl; + } else if (element.type() == ondemand::json_type::object) { + ondemand::object object = element.get_object(); + std::string element_string = std::string(std::string_view(object.raw_json())); + object.reset(); + + auto input_element = object["input"]; + std::string_view input{}; + bool allow_replacement_characters = true; + if (input_element.get_string(allow_replacement_characters).get(input)) { + std::cout << "I could not parse " << element_string << std::endl; + return false; + } + std::cout << "input='" << input << "' [" << input.size() << " bytes]" << std::endl; + std::string_view base; + ada::result base_url; + if 
(!object["base"].get(base)) { + std::cout << "base=" << base << std::endl; + base_url = ada_parse(base); + if(!base_url) { + bool failure = false; + if (!object["failure"].get(failure) && failure == true) { + // We are good. Failure was expected. + continue; // We can't proceed any further. + } else { + TEST_ASSERT(base_url.has_value(), true, "Based should not have failred " + element_string); + } + } + } + bool failure = false; + ada::result input_url = (!object["base"].get(base)) ? ada_parse(input, &*base_url) : ada_parse(input); + + if (object["failure"].get(failure)) { + auto url = input_url.value(); + auto out = url.get_components(); + auto href = url.get_href(); + + TEST_ASSERT(out.protocol_end, url.get_protocol().size() - 1, "protocol_end mismatch " + out.to_string()); + + if (!url.username.empty()) { + size_t username_start = href.find(url.username); + size_t username_end = username_start + url.username.size() - 1; + TEST_ASSERT(href.substr(username_start, url.username.size()), url.get_username(), "username mismatch " + out.to_string()); + TEST_ASSERT(out.username_end, username_end, "username_end mismatch " + out.to_string()); + } + + if (!url.password.empty()) { + size_t password_start = out.username_end + 2; + TEST_ASSERT(href.substr(password_start, url.password.size()), url.get_password(), "password mismatch " + out.to_string()); + } + + TEST_ASSERT(href.substr(out.host_start, url.get_hostname().size()), url.get_hostname(), "hostname mismatch " + out.to_string()); + + if (url.port.has_value()) { + TEST_ASSERT(out.port, url.port.value(), "port mismatch " + out.to_string()); + } else { + TEST_ASSERT(out.port, ada::url_components::omitted, "port should have been omitted " + out.to_string()); + } + + if (url.get_pathname().length() > 0) { + TEST_ASSERT(href.substr(out.pathname_start, url.get_pathname().size()), url.get_pathname(), "pathname mismatch " + out.to_string()); + } + + if (url.get_search().length() > 0) { + TEST_ASSERT(href.substr(out.search_start, 
url.get_search().size()), url.get_search(), "search mismatch " + out.to_string()); + } + + if (url.fragment.has_value()) { + TEST_ASSERT(href.substr(out.hash_start, url.get_hash().size()), url.get_hash(), "hash mismatch " + out.to_string());//} + } + } + } + } + std::cout << "Tests executed = "<< counter << std::endl; + TEST_SUCCEED() +} + +int main(int argc, char** argv) { + bool all_tests{true}; + std::string filter; + if(argc > 1) { + all_tests = false; + filter = argv[1]; + std::cout << "Only running tests containing the substring '"<< filter <<"'\n" << std::endl; + } else { + std::cout << "You may pass a parameter to the wpt_tests executable to filter the tests, by substring matching." << std::endl; + } + std::cout << "Running WPT tests.\n" << std::endl; + + std::map results; + std::string name; + +#if ADA_HAS_ICU + name = "urltestdata_encoding("+std::string(URLTESTDATA_JSON)+")"; + if(all_tests || name.find(filter) != std::string::npos) { + results[name] = urltestdata_encoding(URLTESTDATA_JSON); + } +#endif + (void) all_tests; + std::cout << std::endl; + std::cout << "==============="<< std::endl; + std::cout << "Final report: "<< std::endl; + std::cout << "==============="<< std::endl; +#if ADA_HAS_ICU + std::cout << "We are using ICU."<< std::endl; +#else + std::cout << "We are not using ICU."<< std::endl; +#endif +#if ADA_IS_BIG_ENDIAN + std::cout << "You have big-endian system."<< std::endl; +#else + std::cout << "You have litte-endian system."<< std::endl; +#endif + bool one_failed = false; + for(auto [s,b] : results) { + std::cout << std::left << std::setw(60) << std::setfill('.') << s << ": " << (b?"SUCCEEDED":"FAILED") << std::endl; + if(!b) { one_failed = true; } + } + if(!one_failed) { + std::cout << "WPT tests are ok." << std::endl; + return EXIT_SUCCESS; + } else { + return EXIT_FAILURE; + } +}