Skip to content

Commit 0be1843

Browse files
committed
Rewrite gix-url parsing
This change is intentionally kept very simple and does not try to immediately fix all now failing tests. When comparing the test output of this implementation with the previous there are a few things worth mentioning: - all previously passing baseline tests still do so - the new implementation correctly classifies paths starting with a `/` or `./` but containing a `:` as file paths - the new implementation correctly classifies paths containing just letters before the first `:` as scp paths - the new implementation fails all tests that check the returned errors for invalid paths - the new implementation seems to fail for some windows paths The next step is to evaluate the now failing tests, and either remove/rewrite them if they are based on flawed assumptions (looking at you 'ssh_alias_needs_username_to_not_be_considered_a_filepath') or fix the new implementation to make them pass again. We can then work on making the baseline tests pass. The change also keeps the previous parsing logic around and just removes it from the module graph, to allow for local comparisons of the two implementations during development.
1 parent d6ac5fa commit 0be1843

File tree

2 files changed

+125
-0
lines changed

2 files changed

+125
-0
lines changed

gix-url/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
use bstr::{BStr, BString};
1212

1313
///
14+
#[path = "parse/mod.rs"]
15+
// #[path = "parse.rs"]
1416
pub mod parse;
1517
#[doc(inline)]
1618
pub use parse::parse;

gix-url/src/parse/mod.rs

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
use std::convert::Infallible;
2+
3+
use bstr::{BStr, BString, ByteSlice};
4+
5+
#[derive(Debug, thiserror::Error)]
6+
#[allow(missing_docs)]
7+
pub enum Error {
8+
#[error("Could not decode URL as UTF8")]
9+
Utf8(#[from] std::str::Utf8Error),
10+
#[error(transparent)]
11+
Url(#[from] url::ParseError),
12+
#[error("URLs need to specify the path to the repository")]
13+
MissingResourceLocation,
14+
#[error("file URLs require an absolute or relative path to the repository")]
15+
MissingRepositoryPath,
16+
#[error("\"{url}\" is not a valid local path")]
17+
NotALocalFile { url: BString },
18+
#[error("Relative URLs are not permitted: {url:?}")]
19+
RelativeUrl { url: String },
20+
}
21+
22+
impl From<Infallible> for Error {
23+
fn from(_: Infallible) -> Self {
24+
unreachable!("Cannot actually happen, but it seems there can't be a blanket impl for this")
25+
}
26+
}
27+
28+
enum InputScheme {
29+
Uri { protocol_end: usize },
30+
Scp { colon: usize },
31+
Local,
32+
}
33+
34+
fn find_scheme(input: &BStr) -> InputScheme {
35+
// TODO: uri's may only contain `:/`, we should additionally check if the characters used for
36+
// protocol are all valid
37+
if let Some(protocol_end) = input.find("://") {
38+
return InputScheme::Uri { protocol_end };
39+
}
40+
41+
if let Some(colon) = input.find_byte(b':') {
42+
// allow user to select files containing a `:` by passing them as absolute or relative path
43+
// this is behavior explicitly mentioned by the scp and git manuals
44+
if !&input[..colon].contains(&b'/') {
45+
// TODO: implement windows specific checks
46+
return InputScheme::Scp { colon };
47+
}
48+
}
49+
50+
InputScheme::Local
51+
}
52+
53+
/// Parse the given `bytes` as a [git url](crate::Url).
54+
///
55+
/// # Note
56+
///
57+
/// We cannot and should never have to deal with UTF-16 encoded windows strings, so bytes input is acceptable.
58+
/// For file-paths, we don't expect UTF8 encoding either.
59+
pub fn parse(input: &BStr) -> Result<crate::Url, Error> {
60+
match find_scheme(input) {
61+
InputScheme::Uri { .. } => parse_url(input),
62+
InputScheme::Scp { colon } => parse_scp(input, colon),
63+
InputScheme::Local => parse_local(input),
64+
}
65+
}
66+
67+
fn parse_url(input: &BStr) -> Result<crate::Url, Error> {
68+
let url = url::Url::parse(std::str::from_utf8(input)?)?;
69+
70+
Ok(crate::Url {
71+
serialize_alternative_form: false,
72+
scheme: url.scheme().into(),
73+
password: url.password().map(Into::into),
74+
user: if url.username().is_empty() && url.password().is_none() {
75+
None
76+
} else {
77+
Some(url.username().into())
78+
},
79+
host: url.host_str().map(Into::into),
80+
port: url.port(),
81+
path: url.path().into(),
82+
})
83+
}
84+
85+
fn parse_scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
86+
let input = std::str::from_utf8(input)?;
87+
88+
// TODO: this incorrectly splits at IPv6 addresses, check for `[]` before splitting
89+
let (host, path) = input.split_at(colon);
90+
let path = &path[1..]; // remove leading `:`
91+
92+
// The path returned by the parsed url often has the wrong number of leading `/` characters but
93+
// should never differ in any other way (ssh URIs should not contain a query or fragment part).
94+
// To avoid the various off-by-one errors caused by the `/` characters, we keep using the path
95+
// determined above and can therefore skip parsing it here as well.
96+
let url = url::Url::parse(&format!("ssh://{host}"))?;
97+
98+
Ok(crate::Url {
99+
serialize_alternative_form: true,
100+
scheme: url.scheme().into(),
101+
password: url.password().map(Into::into),
102+
user: if url.username().is_empty() && url.password().is_none() {
103+
None
104+
} else {
105+
Some(url.username().into())
106+
},
107+
host: url.host_str().map(Into::into),
108+
port: url.port(),
109+
path: path.into(),
110+
})
111+
}
112+
113+
fn parse_local(input: &BStr) -> Result<crate::Url, Error> {
114+
Ok(crate::Url {
115+
serialize_alternative_form: true,
116+
scheme: crate::scheme::Scheme::File,
117+
password: None,
118+
user: None,
119+
host: None,
120+
port: None,
121+
path: input.into(),
122+
})
123+
}

0 commit comments

Comments
 (0)