Skip to content

Commit 037f54b

Browse files
committed
feat: add url_components for start/end
1 parent d439ac7 commit 037f54b

File tree

6 files changed

+300
-5
lines changed

6 files changed

+300
-5
lines changed

include/ada.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "ada/state.h"
1818
#include "ada/unicode.h"
1919
#include "ada/url-inl.h"
20+
#include "ada/url_components.h"
2021

2122
// Public API
2223
#include "ada/ada_version.h"

include/ada/url-inl.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77

88
#include "ada/checkers.h"
99
#include "ada/url.h"
10+
#include "ada/url_components.h"
11+
#include <optional>
12+
#include <string>
1013

1114
namespace ada {
1215
[[nodiscard]] ada_really_inline bool url::includes_credentials() const noexcept {
@@ -73,6 +76,52 @@ namespace ada {
7376
/**
 * Stream insertion for ada::url: writes the result of u.to_string() to `out`
 * and hands the stream back so insertions can be chained.
 */
inline std::ostream& operator<<(std::ostream& out, const ada::url& u) {
  out << u.to_string();
  return out;
}
79+
80+
/**
 * Computes the zero-based offsets of this URL's components inside its
 * serialized form.
 *
 * The arithmetic assumes the serialization is laid out as
 *   scheme ":" [ "//" [ username [ ":" password ] "@" ] host [ ":" port ] ]
 *   pathname [ search ] [ hash ]
 * NOTE(review): confirm against get_href(); the serializer is not visible here.
 *
 * Reported offsets:
 *  - protocol_end:   offset of the ':' terminating the scheme.
 *  - username_end:   offset of the last username character; only written when
 *                    the URL includes credentials.
 *  - host_start:     offset of the first host character (0 when there is no
 *                    host).
 *  - host_end:       offset of the last host character (0 when there is no
 *                    host).
 *  - port:           copy of the URL's port.
 *  - pathname_start: offset of the first pathname character.
 *  - search_start:   offset where the search begins; empty without a query.
 *  - hash_start:     offset where the hash begins; empty without a fragment.
 *
 * @return the populated ada::url_components.
 */
[[nodiscard]] ada_really_inline ada::url_components url::get_components() noexcept {
  url_components out{};

  // protocol ends with ':'. for example: "https:"
  out.protocol_end = get_scheme().size();

  // Offset of the first character that has not been assigned to a component.
  size_t next_offset = out.protocol_end + 1;

  if (host.has_value()) {
    // The authority starts after the ':' and the two slashes of "://".
    const size_t authority_start = out.protocol_end + 3;
    out.host_start = authority_start;

    if (includes_credentials()) {
      // Same value as before: protocol_end + 2 + username.size().
      out.username_end = authority_start + username.size() - 1;

      // Skip the username and the '@' that closes the credentials.
      out.host_start += username.size() + 1;

      if (!password.empty()) {
        // Skip the ':' separator and the password itself.
        out.host_start += password.size() + 1;
      }
    }

    // For an empty host this is host_start - 1, i.e. an empty range.
    out.host_end = out.host_start + host.value().size() - 1;
    next_offset = out.host_end + 1;
  }

  out.port = port;

  if (port.has_value()) {
    // Count the decimal digits without building a string: this function is
    // noexcept and must not allocate.
    size_t port_digits = 0;
    auto remaining = port.value();
    do {
      ++port_digits;
      remaining /= 10;
    } while (remaining != 0);
    // One extra character for the ':' that precedes the port.
    next_offset += 1 + port_digits;
  }

  out.pathname_start = next_offset;

  if (query.has_value() || fragment.has_value()) {
    const size_t pathname_end = out.pathname_start + get_pathname().size();

    if (query.has_value()) {
      out.search_start = pathname_end;
    }

    if (fragment.has_value()) {
      // NOTE(review): if get_search() returns "" for a present-but-empty query
      // while the serialization still emits '?', hash_start is one short in
      // that case -- confirm against get_search()/get_href().
      out.hash_start = query.has_value() ? pathname_end + get_search().size()
                                         : pathname_end;
    }
  }

  return out;
}
124+
76125
} // namespace ada
77126

78127
#endif // ADA_URL_H

include/ada/url.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "ada/serializers.h"
1212
#include "ada/unicode.h"
1313
#include "ada/log.h"
14+
#include "ada/url_components.h"
1415

1516
#include <algorithm>
1617
#include <charconv>
@@ -349,6 +350,25 @@ namespace ada {
349350
*/
350351
std::string to_string() const;
351352

353+
/**
 * Useful for implementing efficient serialization for the URL.
 *
 * https://user:pass@example.com:1234/foo/bar?baz#quux
 *      |      |    |          | ^^^^|       |   |
 *      |      |    |          | |   |       |   `----- hash_start
 *      |      |    |          | |   |       `--------- search_start
 *      |      |    |          | |   `----------------- pathname_start
 *      |      |    |          | `--------------------- port
 *      |      |    |          `----------------------- host_end
 *      |      |    `---------------------------------- host_start
 *      |      `--------------------------------------- username_end
 *      `---------------------------------------------- protocol_end
 *
 * Inspired by servo/rust-url
 * @see https://github.com/servo/rust-url/blob/b65a45515c10713f6d212e6726719a020203cc98/url/src/quirks.rs#L31
 */
370+
[[nodiscard]] ada_really_inline ada::url_components get_components() noexcept;
371+
352372
private:
353373

354374
/**

include/ada/url_components.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/**
2+
* @file url_components.h
3+
* @brief Declaration for the URL Components
4+
*/
5+
#ifndef ADA_URL_COMPONENTS_H
6+
#define ADA_URL_COMPONENTS_H
7+
8+
#include "ada/common_defs.h"
9+
10+
#include <optional>
11+
#include <string_view>
12+
13+
namespace ada {
14+
15+
/**
 * Offsets of the individual components inside a serialized URL, as produced
 * by url::get_components().
 *
 * Plain value type: it owns no resources, so it follows the Rule of Zero and
 * relies on the implicitly generated copy/move operations and destructor.
 *
 * NOTE(review): size_t and uint32_t currently reach this header through other
 * includes; consider including <cstddef> and <cstdint> directly.
 */
struct url_components {

  /** Length of the scheme, i.e. the offset of the ':' that ends the protocol. */
  size_t protocol_end{0};
  /** Offset where the username ends; stays 0 when there are no credentials. */
  size_t username_end{0};
  /** Offset where the host begins; stays 0 when the URL has no host. */
  size_t host_start{0};
  /** Offset where the host ends; stays 0 when the URL has no host. */
  size_t host_end{0};
  /** Port of the URL, empty when the URL has no port. */
  std::optional<uint32_t> port{};
  /** Offset where the pathname begins. */
  size_t pathname_start{0};
  /** Offset where the search begins, empty when the URL has no query. */
  std::optional<size_t> search_start{};
  /** Offset where the hash begins, empty when the URL has no fragment. */
  std::optional<size_t> hash_start{};

}; // struct url_components
34+
35+
} // namespace ada
36+
#endif

tests/CMakeLists.txt

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,16 @@ include(${PROJECT_SOURCE_DIR}/cmake/add-cpp-test.cmake)
99
link_libraries(ada)
1010

1111
add_cpp_test(wpt_tests)
12+
add_cpp_test(url_components)
1213
target_link_libraries(wpt_tests PRIVATE simdjson)
14+
target_link_libraries(url_components PRIVATE simdjson)
1315
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
14-
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)
15-
target_link_libraries(wpt_tests PUBLIC stdc++fs)
16+
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)
17+
target_link_libraries(wpt_tests PUBLIC stdc++fs)
18+
target_link_libraries(url_components PUBLIC stdc++fs)
19+
endif()
1620
endif()
17-
endif()
18-
1921

2022
add_cpp_test(basic_fuzzer)
2123
add_cpp_test(from_file_tests)
2224
add_cpp_test(basic_tests)
23-

tests/url_components.cpp

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <string_view>
#include <system_error>
9+
10+
#include "ada.h"
11+
#include "ada/character_sets-inl.h"
12+
#include "ada/parser.h"
13+
#include "ada/url_components.h"
14+
15+
// We think that these examples have bad domains.
16+
std::set<std::string> bad_domains = {"http://./", "http://../", "http://foo.09.."};
17+
18+
// This function copies your input onto a memory buffer that
19+
// has just the necessary size. This will entice tools to detect
20+
// an out-of-bound access.
21+
ada::result ada_parse(std::string_view view,const ada::url* base = nullptr) {
22+
std::cout << "about to parse '" << view << "' [" << view.size() << " bytes]" << std::endl;
23+
std::unique_ptr<char[]> buffer(new char[view.size()]);
24+
memcpy(buffer.get(), view.data(), view.size());
25+
return ada::parse(std::string_view(buffer.get(), view.size()), base);
26+
}
27+
28+
#include "simdjson.h"
29+
30+
using namespace simdjson;
31+
32+
#ifndef WPT_DATA_DIR
33+
#define WPT_DATA_DIR "wpt/"
34+
#endif
35+
const char *URLTESTDATA_JSON = WPT_DATA_DIR "urltestdata.json";
36+
37+
// Minimal test helpers. They are meant to be used inside functions that
// return bool: a failing check makes the enclosing function return false.
//
// Every macro deliberately ends with ';' after `while (0)`: existing call
// sites such as `TEST_START()` and `TEST_SUCCEED()` are written without a
// trailing semicolon and rely on it.

// Announces the enclosing test function on stdout.
#define TEST_START()                                              \
  do {                                                            \
    std::cout << "> Running " << __func__ << " ..." << std::endl; \
  } while (0);

// Returns false from the enclosing function when ACTUAL is false.
#define RUN_TEST(ACTUAL) \
  do {                   \
    if (!(ACTUAL)) {     \
      return false;      \
    }                    \
  } while (0);

// Prints MESSAGE on stderr and returns false from the enclosing function.
#define TEST_FAIL(MESSAGE)                           \
  do {                                               \
    std::cerr << "FAIL: " << (MESSAGE) << std::endl; \
    return false;                                    \
  } while (0);

// Returns true from the enclosing function.
#define TEST_SUCCEED() \
  do {                 \
    return true;       \
  } while (0);

// Fails the enclosing test with MESSAGE when LHS != RHS.
// Each operand is evaluated exactly once and as a whole expression, so
// arguments with side effects or low-precedence operators behave as written.
#define TEST_ASSERT(LHS, RHS, MESSAGE)                          \
  do {                                                          \
    const auto& test_assert_lhs_ = (LHS);                       \
    const auto& test_assert_rhs_ = (RHS);                       \
    if (test_assert_lhs_ != test_assert_rhs_) {                 \
      std::cerr << "Mismatch: '" << test_assert_lhs_ << "' - '" \
                << test_assert_rhs_ << "'" << std::endl;        \
      TEST_FAIL(MESSAGE);                                       \
    }                                                           \
  } while (0);
63+
64+
// Reports on stdout whether `filename` exists and returns the answer.
// A null pointer, or a path whose status cannot be queried, counts as missing
// instead of raising an exception.
bool file_exists(const char *filename) {
  if (filename == nullptr) {
    std::cout << " file missing: (null)" << std::endl;
    return false;
  }
  // Non-throwing overload: a failed status query leaves `found` false.
  std::error_code status_error;
  const bool found =
      std::filesystem::exists(std::filesystem::path{filename}, status_error);
  if (found) {
    std::cout << " file found: " << filename << std::endl;
    return true;
  }
  std::cout << " file missing: " << filename << std::endl;
  return false;
}
75+
76+
bool urltestdata_encoding(const char* source) {
77+
78+
TEST_START()
79+
ondemand::parser parser;
80+
size_t counter{};
81+
82+
RUN_TEST(file_exists(source));
83+
padded_string json = padded_string::load(source);
84+
std::cout << " loaded " << source << " (" << json.size() << " kB)"
85+
<< std::endl;
86+
ondemand::document doc = parser.iterate(json);
87+
for (auto element : doc.get_array()) {
88+
if (element.type() == ondemand::json_type::string) {
89+
std::string_view comment = element.get_string().value();
90+
std::cout << comment << std::endl;
91+
} else if (element.type() == ondemand::json_type::object) {
92+
ondemand::object object = element.get_object();
93+
std::string element_string = std::string(std::string_view(object.raw_json()));
94+
object.reset();
95+
96+
auto input_element = object["input"];
97+
std::string_view input{};
98+
bool allow_replacement_characters = true;
99+
if (input_element.get_string(allow_replacement_characters).get(input)) {
100+
std::cout << "I could not parse " << element_string << std::endl;
101+
return false;
102+
}
103+
std::cout << "input='" << input << "' [" << input.size() << " bytes]" << std::endl;
104+
std::string_view base;
105+
ada::result base_url;
106+
if (!object["base"].get(base)) {
107+
std::cout << "base=" << base << std::endl;
108+
base_url = ada_parse(base);
109+
if(!base_url) {
110+
bool failure = false;
111+
if (!object["failure"].get(failure) && failure == true) {
112+
// We are good. Failure was expected.
113+
continue; // We can't proceed any further.
114+
} else {
115+
TEST_ASSERT(base_url.has_value(), true, "Based should not have failred " + element_string);
116+
}
117+
}
118+
}
119+
bool failure = false;
120+
ada::result input_url = (!object["base"].get(base)) ? ada_parse(input, &*base_url) : ada_parse(input);
121+
122+
if (object["failure"].get(failure)) {
123+
auto url = input_url.value();
124+
auto out = input_url->get_components();
125+
auto href = input_url->get_href();
126+
127+
TEST_ASSERT(out.protocol_end, url.get_protocol().size() - 1, "protocol_end mismatch");
128+
129+
if (!url.username.empty()) {
130+
size_t username_start = href.find(url.username);
131+
size_t username_end = username_start + url.username.size() - 1;
132+
TEST_ASSERT(href.substr(username_start, url.username.size()), url.get_username(), "username mismatch");
133+
TEST_ASSERT(out.username_end, username_end, "username_end mismatch");
134+
}
135+
}
136+
}
137+
}
138+
std::cout << "Tests executed = "<< counter << std::endl;
139+
TEST_SUCCEED()
140+
}
141+
142+
// Entry point of the url_components test executable.
// An optional first argument restricts the run to tests whose name contains
// that substring. The process exits with EXIT_FAILURE when any executed test
// failed and with EXIT_SUCCESS otherwise (including when no test was run,
// e.g. when ADA_HAS_ICU is not set).
int main(int argc, char** argv) {
  bool all_tests{true};
  std::string filter;
  if(argc > 1) {
    all_tests = false;
    filter = argv[1];
    std::cout << "Only running tests containing the substring '"<< filter <<"'\n" << std::endl;
  } else {
    std::cout << "You may pass a parameter to the url_components executable to filter the tests, by substring matching." << std::endl;
  }
  std::cout << "Running WPT tests.\n" << std::endl;

  // Test name -> outcome, reported in name order at the end.
  std::map<std::string, bool> results;
  std::string name;

#if ADA_HAS_ICU
  name = "urltestdata_encoding("+std::string(URLTESTDATA_JSON)+")";
  if(all_tests || name.find(filter) != std::string::npos) {
    results[name] = urltestdata_encoding(URLTESTDATA_JSON);
  }
#endif
  std::cout << std::endl;
  std::cout << "==============="<< std::endl;
  std::cout << "Final report: "<< std::endl;
  std::cout << "==============="<< std::endl;
#if ADA_HAS_ICU
  std::cout << "We are using ICU."<< std::endl;
#else
  std::cout << "We are not using ICU."<< std::endl;
#endif
#if ADA_IS_BIG_ENDIAN
  std::cout << "You have big-endian system."<< std::endl;
#else
  std::cout << "You have little-endian system."<< std::endl;
#endif
  bool one_failed = false;
  // Bind by reference: copying every (name, outcome) pair is unnecessary.
  for(const auto& [s,b] : results) {
    std::cout << std::left << std::setw(60) << std::setfill('.') << s << ": " << (b?"SUCCEEDED":"FAILED") << std::endl;
    if(!b) { one_failed = true; }
  }
  if(!one_failed) {
    std::cout << "WPT tests are ok." << std::endl;
    return EXIT_SUCCESS;
  } else {
    return EXIT_FAILURE;
  }
}

0 commit comments

Comments
 (0)