feat: add url_components for start/end

anonrig · anonrig · commit 037f54b42dee · 2023-03-01T09:31:43.000-05:00
diff --git a/include/ada.h b/include/ada.h
@@ -17,6 +17,7 @@
 #include "ada/state.h"
 #include "ada/unicode.h"
 #include "ada/url-inl.h"
+#include "ada/url_components.h"
 
 // Public API
 #include "ada/ada_version.h"
diff --git a/include/ada/url-inl.h b/include/ada/url-inl.h
@@ -7,6 +7,9 @@
 
 #include "ada/checkers.h"
 #include "ada/url.h"
+#include "ada/url_components.h"
+#include <optional>
+#include <string>
 
 namespace ada {
   [[nodiscard]] ada_really_inline bool url::includes_credentials() const noexcept {
@@ -73,6 +76,52 @@ namespace ada {
   inline std::ostream& operator<<(std::ostream& out, const ada::url& u) {
     return out << u.to_string();
   }
+
+  // Locates each component inside the serialized URL: every *_start is the index
+  // of a component's first character and every *_end the index of its last one.
+  [[nodiscard]] ada_really_inline ada::url_components url::get_components() noexcept {
+    url_components out{};
+
+    // Index of the ':' that terminates the scheme, e.g. 5 for "https:".
+    out.protocol_end = get_scheme().size();
+
+    if (host.has_value()) {
+      // Step over the ':' and the "//" that precede the authority.
+      out.host_start = out.protocol_end + 3;
+      if (includes_credentials()) {
+        // The username begins at protocol_end + 3, so this is its last character.
+        out.username_end = out.protocol_end + 2 + username.size();
+        out.host_start += username.size();
+        if (!password.empty()) {
+          // A password is serialized as ':' followed by its characters.
+          out.host_start += 1 + password.size();
+        }
+        // The '@' that closes the credentials.
+        out.host_start += 1;
+      }
+      out.host_end = out.host_start + host.value().size() - 1;
+    }
+
+    out.port = port;
+    // The path follows the host when there is one, the scheme's ':' otherwise.
+    out.pathname_start = (host.has_value() ? out.host_end : out.protocol_end) + 1;
+    if (port.has_value()) {
+      // ':' plus the decimal digits of the port.
+      out.pathname_start += 1 + std::to_string(port.value()).size();
+    }
+
+    if (query.has_value()) {
+      out.search_start = out.pathname_start + get_pathname().size();
+    }
+    if (fragment.has_value()) {
+      // NOTE(review): assumes get_search() counts the leading '?' -- confirm.
+      out.hash_start = out.search_start.has_value()
+                           ? out.search_start.value() + get_search().size()
+                           : out.pathname_start + get_pathname().size();
+    }
+    return out;
+  }
+
 } // namespace ada
 
 #endif // ADA_URL_H
diff --git a/include/ada/url.h b/include/ada/url.h
@@ -11,6 +11,7 @@
 #include "ada/serializers.h"
 #include "ada/unicode.h"
 #include "ada/log.h"
+#include "ada/url_components.h"
 
 #include <algorithm>
 #include <charconv>
@@ -349,6 +350,25 @@ namespace ada {
      */
     std::string to_string() const;
 
+    /**
+     * Useful for implementing efficient serialization for the URL.
+     *
+     * https://user:pass@example.com:1234/foo/bar?baz#quux
+     *      |      |    |          | ^^^^|       |   |
+     *      |      |    |          | |   |       |   `----- hash_start
+     *      |      |    |          | |   |       `--------- search_start
+     *      |      |    |          | |   `----------------- pathname_start
+     *      |      |    |          | `--------------------- port
+     *      |      |    |          `----------------------- host_end
+     *      |      |    `---------------------------------- host_start
+     *      |      `--------------------------------------- username_end
+     *      `---------------------------------------------- protocol_end
+     * NOTE(review): diagram adapted from rust-url; confirm the exact offsets against the implementation.
+     * Inspired by servo/rust-url.
+     * @see https://github.com/servo/rust-url/blob/b65a45515c10713f6d212e6726719a020203cc98/url/src/quirks.rs#L31
+     */
+    [[nodiscard]] ada_really_inline ada::url_components get_components() noexcept;
+
   private:
 
     /**
diff --git a/include/ada/url_components.h b/include/ada/url_components.h
@@ -0,0 +1,36 @@
+/**
+ * @file url_components.h
+ * @brief Declaration for the URL Components
+ */
+#ifndef ADA_URL_COMPONENTS_H
+#define ADA_URL_COMPONENTS_H
+
+#include "ada/common_defs.h"
+
+#include <optional>
+#include <string_view>
+
+namespace ada {
+
+  // Positions of the parts of a URL, as filled in by url::get_components().
+  struct url_components {
+    url_components() = default;
+    url_components(const url_components &) = default;
+    url_components &operator=(const url_components &) = default;
+    url_components(url_components &&) noexcept = default;
+    url_components &operator=(url_components &&) noexcept = default;
+    ADA_ATTRIBUTE_NOINLINE ~url_components() = default;
+
+    // Every position defaults to zero and every optional member to "absent".
+    size_t protocol_end = 0;
+    size_t username_end = 0;
+    size_t host_start = 0;
+    size_t host_end = 0;
+    std::optional<uint32_t> port = std::nullopt;
+    size_t pathname_start = 0;
+    std::optional<size_t> search_start = std::nullopt;
+    std::optional<size_t> hash_start = std::nullopt;
+  }; // struct url_components
+
+} // namespace ada
+#endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -9,15 +9,16 @@ include(${PROJECT_SOURCE_DIR}/cmake/add-cpp-test.cmake)
 link_libraries(ada)
 
 add_cpp_test(wpt_tests)
+add_cpp_test(url_components)
 target_link_libraries(wpt_tests PRIVATE simdjson)
+target_link_libraries(url_components PRIVATE simdjson)
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)
-target_link_libraries(wpt_tests PUBLIC stdc++fs)
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)  # presumably because older GCC keeps std::filesystem in libstdc++fs
+    target_link_libraries(wpt_tests PUBLIC stdc++fs)
+    target_link_libraries(url_components PUBLIC stdc++fs)
+  endif()
 endif()
-endif()
-
 
 add_cpp_test(basic_fuzzer)
 add_cpp_test(from_file_tests)
 add_cpp_test(basic_tests)
-
diff --git a/tests/url_components.cpp b/tests/url_components.cpp
@@ -0,0 +1,188 @@
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <iostream>
+#include <string>
+#include <memory>
+#include <map>
+#include <set>
+
+#include "ada.h"
+#include "ada/character_sets-inl.h"
+#include "ada/parser.h"
+#include "ada/url_components.h"
+
+// Inputs we believe have bad domains. NOTE(review): nothing in this file reads this set.
+std::set<std::string> bad_domains = {"http://./", "http://../", "http://foo.09.."};
+
+// Parses `view` from a heap copy that is exactly view.size() bytes long, so
+// that memory checkers can catch any access beyond the end of the input.
+ada::result ada_parse(std::string_view view,const ada::url* base = nullptr) {
+  std::cout << "about to parse '" << view << "' [" << view.size() << " bytes]" << std::endl;
+  const size_t length = view.size();
+  std::unique_ptr<char[]> tight_copy(new char[length]);
+  memcpy(tight_copy.get(), view.data(), length);
+  return ada::parse(std::string_view(tight_copy.get(), length), base);
+}
+
+#include "simdjson.h"
+
+using namespace simdjson;
+// Location of the WPT fixtures; defaults to "wpt/" unless WPT_DATA_DIR is predefined.
+#ifndef WPT_DATA_DIR
+#define WPT_DATA_DIR "wpt/"
+#endif
+const char *URLTESTDATA_JSON = WPT_DATA_DIR "urltestdata.json";
+
+// Test helpers for functions returning bool: RUN_TEST, TEST_FAIL and TEST_ASSERT
+// bail out with `return false`, TEST_SUCCEED with `return true`.
+#define TEST_START()                                                           \
+  do {                                                                         \
+    std::cout << "> Running " << __func__ << " ..." << std::endl;              \
+  } while (0);
+#define RUN_TEST(ACTUAL)                                                       \
+  do {                                                                         \
+    if (!(ACTUAL)) {                                                           \
+      return false;                                                            \
+    }                                                                          \
+  } while (0);
+#define TEST_FAIL(MESSAGE)                                                     \
+  do {                                                                         \
+    std::cerr << "FAIL: " << (MESSAGE) << std::endl;                           \
+    return false;                                                              \
+  } while (0);
+#define TEST_SUCCEED()                                                         \
+  do { return true; } while (0);
+#define TEST_ASSERT(LHS, RHS, MESSAGE)                                         \
+  do {                                                                         \
+    if ((LHS) != (RHS))  {                                                     \
+      std::cerr << "Mismatch: '" << (LHS) << "' - '" << (RHS) << "'" << std::endl; \
+      TEST_FAIL(MESSAGE);                                                      \
+    }                                                                          \
+  } while (0);
+
+// Checks whether `filename` refers to an existing filesystem entry and logs the
+// outcome to stdout. Returns true when the entry exists, false otherwise.
+bool file_exists(const char *filename) {
+  const bool found = std::filesystem::exists(filename);
+  if (found) {
+    std::cout << "  file found: " << filename << std::endl;
+  } else {
+    std::cout << "  file missing: " << filename << std::endl;
+  }
+  return found;
+}
+
+// Runs every case of the WPT file `source` that is expected to parse and checks
+// url::get_components() against the href. Returns false on the first failure.
+bool urltestdata_encoding(const char* source) {
+  TEST_START()
+  ondemand::parser parser;
+  size_t counter{};
+  RUN_TEST(file_exists(source));
+  padded_string json = padded_string::load(source);
+  std::cout << "  loaded " << source << " (" << json.size() << " bytes)"
+            << std::endl;
+  ondemand::document doc = parser.iterate(json);
+  for (auto element : doc.get_array()) {
+    if (element.type() == ondemand::json_type::string) {
+      std::string_view comment = element.get_string().value();
+      std::cout << comment << std::endl;
+    } else if (element.type() == ondemand::json_type::object) {
+      ondemand::object object = element.get_object();
+      std::string element_string = std::string(std::string_view(object.raw_json()));
+      object.reset();
+      auto input_element = object["input"];
+      std::string_view input{};
+      bool allow_replacement_characters = true;
+      if (input_element.get_string(allow_replacement_characters).get(input)) {
+        std::cout << "I could not parse " << element_string << std::endl;
+        return false;
+      }
+      std::cout << "input='" << input << "' [" << input.size() << " bytes]" << std::endl;
+      std::string_view base;
+      ada::result  base_url;
+      if (!object["base"].get(base)) {
+        std::cout << "base=" << base << std::endl;
+        base_url = ada_parse(base);
+        if(!base_url) {
+          bool failure = false;
+          if (!object["failure"].get(failure) && failure == true) {
+            continue; // Failure was expected; nothing more to check for this case.
+          } else {
+            TEST_ASSERT(base_url.has_value(), true, "Base should not have failed " + element_string);
+          }
+        }
+      }
+      bool failure = false;
+      ada::result input_url = (!object["base"].get(base)) ? ada_parse(input, &*base_url) : ada_parse(input);
+      // get() yields an error when the case has no "failure" key, i.e. parsing must succeed.
+      if (object["failure"].get(failure)) {
+        // Never dereference a failed parse: report it as a test failure instead.
+        TEST_ASSERT(input_url.has_value(), true, "Should not have failed " + element_string);
+        auto& url = *input_url;
+        auto out = url.get_components();
+        auto href = url.get_href();
+        counter++;
+        TEST_ASSERT(out.protocol_end, url.get_protocol().size() - 1, "protocol_end mismatch");
+        if (!url.username.empty()) {
+          // Search past the scheme so a short username cannot match inside it.
+          size_t username_start = href.find(url.username, out.protocol_end);
+          size_t username_end = username_start + url.username.size() - 1;
+          TEST_ASSERT(href.substr(username_start, url.username.size()), url.get_username(), "username mismatch");
+          TEST_ASSERT(out.username_end, username_end, "username_end mismatch");
+        }
+      }
+    }
+  }
+  std::cout << "Tests executed = "<< counter << std::endl;
+  TEST_SUCCEED()
+}
+
+// Runs the url_components checks, optionally restricted to the tests whose name
+// contains argv[1], and returns EXIT_FAILURE if any of them failed.
+int main(int argc, char** argv) {
+  bool all_tests{true};
+  std::string filter;
+  if(argc > 1) {
+    all_tests = false;
+    filter = argv[1];
+    std::cout << "Only running tests containing the substring '"<< filter <<"'\n" << std::endl;
+  } else {
+    std::cout << "You may pass a parameter to the url_components executable to filter the tests, by substring matching." << std::endl;
+  }
+  std::cout << "Running WPT tests.\n" << std::endl;
+  std::map<std::string, bool> results;
+  std::string name;
+  // NOTE(review): without ICU nothing is registered below, so the run reports success.
+#if ADA_HAS_ICU
+  name = "urltestdata_encoding("+std::string(URLTESTDATA_JSON)+")";
+  if(all_tests || name.find(filter) != std::string::npos) {
+    results[name] = urltestdata_encoding(URLTESTDATA_JSON);
+  }
+#endif
+  std::cout << std::endl;
+  std::cout << "==============="<< std::endl;
+  std::cout << "Final report: "<< std::endl;
+  std::cout << "==============="<< std::endl;
+#if ADA_HAS_ICU
+  std::cout << "We are using ICU."<< std::endl;
+#else
+  std::cout << "We are not using ICU."<< std::endl;
+#endif
+#if ADA_IS_BIG_ENDIAN
+  std::cout << "You have big-endian system."<< std::endl;
+#else
+  std::cout << "You have little-endian system."<< std::endl;
+#endif
+  bool one_failed = false;
+  for(const auto& [s,b] : results) {
+    std::cout << std::left << std::setw(60) << std::setfill('.') << s << ": " << (b?"SUCCEEDED":"FAILED") << std::endl;
+    if(!b) { one_failed = true; }
+  }
+  if(!one_failed) {
+    std::cout << "WPT tests are ok." << std::endl;
+    return EXIT_SUCCESS;
+  }
+  return EXIT_FAILURE;
+}