From ce1efd3911f2c57d2d0a5394bcbb2372188c99f1 Mon Sep 17 00:00:00 2001 From: Pyfisch Date: Fri, 20 Nov 2015 07:15:09 +0100 Subject: [PATCH] Change host to use ip address types provided by std. Removes the custom Ipv6Address type and replaces it with the std one. Parses IPv4Addrs to the std type using the parser described in the url.spec.whatwg.org handling all edge cases. Add tests. fixes #116 This is a breaking change. Version bumped to v0.5.0. --- Cargo.toml | 2 +- src/host.rs | 375 +++++++++++++++++++++----------------------- src/lib.rs | 3 +- src/parser.rs | 1 + src/tests.rs | 19 +++ src/urltestdata.txt | 2 +- 6 files changed, 198 insertions(+), 204 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d3820e1a5..a95afc870 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "url" -version = "0.4.0" +version = "0.5.0" authors = [ "Simon Sapin " ] description = "URL library for Rust, based on the WHATWG URL Standard" diff --git a/src/host.rs b/src/host.rs index c65aa6d33..41702e2d7 100644 --- a/src/host.rs +++ b/src/host.rs @@ -9,6 +9,7 @@ use std::ascii::AsciiExt; use std::cmp; use std::fmt::{self, Formatter}; +use std::net::{Ipv4Addr, Ipv6Addr}; use parser::{ParseResult, ParseError}; use percent_encoding::{from_hex, percent_decode}; @@ -17,26 +18,15 @@ use percent_encoding::{from_hex, percent_decode}; #[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] pub enum Host { - /// A (DNS) domain name or an IPv4 address. - /// - /// FIXME: IPv4 probably should be a separate variant. - /// See https://www.w3.org/Bugs/Public/show_bug.cgi?id=26431 + /// A (DNS) domain name. Domain(String), - + /// An IPv4 address, represented by four sequences of up to three ASCII digits. + Ipv4(Ipv4Addr), /// An IPv6 address, represented inside `[...]` square brackets /// so that `:` colon characters in the address are not ambiguous /// with the port number delimiter. 
- Ipv6(Ipv6Address), -} - - -/// A 128 bit IPv6 address -#[derive(Clone, Eq, PartialEq, Copy, Debug, Hash, PartialOrd, Ord)] -pub struct Ipv6Address { - pub pieces: [u16; 8] + Ipv6(Ipv6Addr), } -#[cfg(feature="heap_size")] -known_heap_size!(0, Ipv6Address); impl Host { @@ -48,26 +38,28 @@ impl Host { /// FIXME: Add IDNA support for non-ASCII domains. pub fn parse(input: &str) -> ParseResult<Host> { if input.len() == 0 { - Err(ParseError::EmptyHost) - } else if input.starts_with("[") { - if input.ends_with("]") { - Ipv6Address::parse(&input[1..input.len() - 1]).map(Host::Ipv6) - } else { - Err(ParseError::InvalidIpv6Address) - } - } else { - let decoded = percent_decode(input.as_bytes()); - let domain = String::from_utf8_lossy(&decoded); - // TODO: Remove this check and use IDNA "domain to ASCII" - if !domain.is_ascii() { - Err(ParseError::NonAsciiDomainsNotSupportedYet) - } else if domain.find(&[ - '\0', '\t', '\n', '\r', ' ', '#', '%', '/', ':', '?', '@', '[', '\\', ']' - ][..]).is_some() { - Err(ParseError::InvalidDomainCharacter) - } else { - Ok(Host::Domain(domain.to_ascii_lowercase())) + return Err(ParseError::EmptyHost) + } + if input.starts_with("[") { + if !input.ends_with("]") { + return Err(ParseError::InvalidIpv6Address) } + return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6) + } + let decoded = percent_decode(input.as_bytes()); + let domain = String::from_utf8_lossy(&decoded); + // TODO: Remove this check and use IDNA "domain to ASCII" + if !domain.is_ascii() { + return Err(ParseError::NonAsciiDomainsNotSupportedYet) + } else if domain.find(&[ + '\0', '\t', '\n', '\r', ' ', '#', '%', '/', ':', '?', '@', '[', '\\', ']' + ][..]).is_some() { + return Err(ParseError::InvalidDomainCharacter) + } + match parse_ipv4addr(&domain[..]) { + Ok(Some(ipv4addr)) => Ok(Host::Ipv4(ipv4addr)), + Ok(None) => Ok(Host::Domain(domain.to_ascii_lowercase())), + Err(e) => Err(e), } } @@ -81,203 +73,186 @@ impl Host { impl fmt::Display for Host { - fn fmt(&self, 
formatter: &mut Formatter) -> fmt::Result { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { match *self { - Host::Domain(ref domain) => domain.fmt(formatter), - Host::Ipv6(ref address) => { - try!(formatter.write_str("[")); - try!(address.fmt(formatter)); - formatter.write_str("]") - } + Host::Domain(ref domain) => domain.fmt(f), + Host::Ipv4(ref addr) => addr.fmt(f), + Host::Ipv6(ref addr) => write!(f, "[{}]", addr), + } + } +} + +fn parse_ipv4number(mut input: &str) -> ParseResult<u32> { + let mut r = 10; + if input.starts_with("0x") || input.starts_with("0X") { + input = &input[2..]; + r = 16; + } else if input.len() >= 2 && input.starts_with("0") { + input = &input[1..]; + r = 8; + } + if input.is_empty() { + return Ok(0); + } + match u32::from_str_radix(&input, r) { + Ok(number) => return Ok(number), + Err(_) => Err(ParseError::InvalidIpv4Address), + } +} + +fn parse_ipv4addr(input: &str) -> ParseResult<Option<Ipv4Addr>> { + let mut parts: Vec<&str> = input.split('.').collect(); + if parts.last() == Some(&"") { + parts.pop(); + } + if parts.len() > 4 { + return Ok(None); + } + let mut numbers: Vec<u32> = Vec::new(); + for part in parts { + if part == "" { + return Ok(None); } + if let Ok(n) = parse_ipv4number(part) { + numbers.push(n); + } else { + return Ok(None); + } + } + let mut ipv4 = numbers.pop().expect("a non-empty list of numbers"); + if ipv4 > u32::max_value() >> (8 * numbers.len() as u32) { + return Err(ParseError::InvalidIpv4Address); } + if numbers.iter().any(|x| *x > 255) { + return Err(ParseError::InvalidIpv4Address); + } + for (counter, n) in numbers.iter().enumerate() { + ipv4 += n << (8 * (3 - counter as u32)) + } + Ok(Some(Ipv4Addr::from(ipv4))) } -impl Ipv6Address { - /// Parse an IPv6 address, without the [] square brackets. 
- pub fn parse(input: &str) -> ParseResult<Ipv6Address> { - let input = input.as_bytes(); - let len = input.len(); - let mut is_ip_v4 = false; - let mut pieces = [0, 0, 0, 0, 0, 0, 0, 0]; - let mut piece_pointer = 0; - let mut compress_pointer = None; - let mut i = 0; +fn parse_ipv6addr(input: &str) -> ParseResult<Ipv6Addr> { + let input = input.as_bytes(); + let len = input.len(); + let mut is_ip_v4 = false; + let mut pieces = [0, 0, 0, 0, 0, 0, 0, 0]; + let mut piece_pointer = 0; + let mut compress_pointer = None; + let mut i = 0; + + if len < 2 { + return Err(ParseError::InvalidIpv6Address) + } - if len < 2 { + if input[0] == b':' { + if input[1] != b':' { return Err(ParseError::InvalidIpv6Address) } + i = 2; + piece_pointer = 1; + compress_pointer = Some(1); + } - if input[0] == b':' { - if input[1] != b':' { - return Err(ParseError::InvalidIpv6Address) - } - i = 2; - piece_pointer = 1; - compress_pointer = Some(1); + while i < len { + if piece_pointer == 8 { + return Err(ParseError::InvalidIpv6Address) } - - while i < len { - if piece_pointer == 8 { + if input[i] == b':' { + if compress_pointer.is_some() { return Err(ParseError::InvalidIpv6Address) } - if input[i] == b':' { - if compress_pointer.is_some() { - return Err(ParseError::InvalidIpv6Address) - } - i += 1; - piece_pointer += 1; - compress_pointer = Some(piece_pointer); - continue - } - let start = i; - let end = cmp::min(len, start + 4); - let mut value = 0u16; - while i < end { - match from_hex(input[i]) { - Some(digit) => { - value = value * 0x10 + digit as u16; - i += 1; - }, - None => break - } - } - if i < len { - match input[i] { - b'.' 
=> { - if i == start { - return Err(ParseError::InvalidIpv6Address) - } - i = start; - is_ip_v4 = true; - }, - b':' => { - i += 1; - if i == len { - return Err(ParseError::InvalidIpv6Address) - } - }, - _ => return Err(ParseError::InvalidIpv6Address) - } - } - if is_ip_v4 { - break - } - pieces[piece_pointer] = value; + i += 1; piece_pointer += 1; + compress_pointer = Some(piece_pointer); + continue } - - if is_ip_v4 { - if piece_pointer > 6 { - return Err(ParseError::InvalidIpv6Address) + let start = i; + let end = cmp::min(len, start + 4); + let mut value = 0u16; + while i < end { + match from_hex(input[i]) { + Some(digit) => { + value = value * 0x10 + digit as u16; + i += 1; + }, + None => break } - let mut dots_seen = 0; - while i < len { - // FIXME: https://github.com/whatwg/url/commit/1c22aa119c354e0020117e02571cec53f7c01064 - let mut value = 0u16; - while i < len { - let digit = match input[i] { - c @ b'0' ... b'9' => c - b'0', - _ => break - }; - value = value * 10 + digit as u16; - if value == 0 || value > 255 { + } + if i < len { + match input[i] { + b'.' 
=> { + if i == start { return Err(ParseError::InvalidIpv6Address) } - } - if dots_seen < 3 && !(i < len && input[i] == b'.') { - return Err(ParseError::InvalidIpv6Address) - } - pieces[piece_pointer] = pieces[piece_pointer] * 0x100 + value; - if dots_seen == 0 || dots_seen == 2 { - piece_pointer += 1; - } - i += 1; - if dots_seen == 3 && i < len { - return Err(ParseError::InvalidIpv6Address) - } - dots_seen += 1; + i = start; + is_ip_v4 = true; + }, + b':' => { + i += 1; + if i == len { + return Err(ParseError::InvalidIpv6Address) + } + }, + _ => return Err(ParseError::InvalidIpv6Address) } } - - match compress_pointer { - Some(compress_pointer) => { - let mut swaps = piece_pointer - compress_pointer; - piece_pointer = 7; - while swaps > 0 { - pieces[piece_pointer] = pieces[compress_pointer + swaps - 1]; - pieces[compress_pointer + swaps - 1] = 0; - swaps -= 1; - piece_pointer -= 1; - } - } - _ => if piece_pointer != 8 { - return Err(ParseError::InvalidIpv6Address) - } + if is_ip_v4 { + break } - Ok(Ipv6Address { pieces: pieces }) + pieces[piece_pointer] = value; + piece_pointer += 1; } - /// Serialize the IPv6 address to a string. - pub fn serialize(&self) -> String { - self.to_string() - } -} - - -impl fmt::Display for Ipv6Address { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - let (compress_start, compress_end) = longest_zero_sequence(&self.pieces); - let mut i = 0; - while i < 8 { - if i == compress_start { - try!(formatter.write_str(":")); - if i == 0 { - try!(formatter.write_str(":")); - } - if compress_end < 8 { - i = compress_end; - } else { - break; + if is_ip_v4 { + if piece_pointer > 6 { + return Err(ParseError::InvalidIpv6Address) + } + let mut dots_seen = 0; + while i < len { + // FIXME: https://github.com/whatwg/url/commit/1c22aa119c354e0020117e02571cec53f7c01064 + let mut value = 0u16; + while i < len { + let digit = match input[i] { + c @ b'0' ... 
b'9' => c - b'0', + _ => break + }; + value = value * 10 + digit as u16; + if value == 0 || value > 255 { + return Err(ParseError::InvalidIpv6Address) } } - try!(write!(formatter, "{:x}", self.pieces[i as usize])); - if i < 7 { - try!(formatter.write_str(":")); + if dots_seen < 3 && !(i < len && input[i] == b'.') { + return Err(ParseError::InvalidIpv6Address) + } + pieces[piece_pointer] = pieces[piece_pointer] * 0x100 + value; + if dots_seen == 0 || dots_seen == 2 { + piece_pointer += 1; } i += 1; + if dots_seen == 3 && i < len { + return Err(ParseError::InvalidIpv6Address) + } + dots_seen += 1; } - Ok(()) } -} - -fn longest_zero_sequence(pieces: &[u16; 8]) -> (isize, isize) { - let mut longest = -1; - let mut longest_length = -1; - let mut start = -1; - macro_rules! finish_sequence( - ($end: expr) => { - if start >= 0 { - let length = $end - start; - if length > longest_length { - longest = start; - longest_length = length; - } - } - }; - ); - for i in 0..8 { - if pieces[i as usize] == 0 { - if start < 0 { - start = i; + match compress_pointer { + Some(compress_pointer) => { + let mut swaps = piece_pointer - compress_pointer; + piece_pointer = 7; + while swaps > 0 { + pieces[piece_pointer] = pieces[compress_pointer + swaps - 1]; + pieces[compress_pointer + swaps - 1] = 0; + swaps -= 1; + piece_pointer -= 1; } - } else { - finish_sequence!(i); - start = -1; + } + _ => if piece_pointer != 8 { + return Err(ParseError::InvalidIpv6Address) } } - finish_sequence!(8); - (longest, longest + longest_length) + Ok(Ipv6Addr::new(pieces[0], pieces[1], pieces[2], pieces[3], + pieces[4], pieces[5], pieces[6], pieces[7])) } diff --git a/src/lib.rs b/src/lib.rs index 08d32ef55..d46810857 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -143,7 +143,7 @@ use std::cmp::Ordering; #[cfg(feature="serde_serialization")] use std::str::FromStr; -pub use host::{Host, Ipv6Address}; +pub use host::Host; pub use parser::{ErrorHandler, ParseResult, ParseError}; use percent_encoding::{percent_encode, 
lossy_utf8_percent_decode, DEFAULT_ENCODE_SET}; @@ -1140,4 +1140,3 @@ fn file_url_path_to_pathbuf_windows(path: &[String]) -> Result { "to_file_path() failed to produce an absolute Path"); Ok(path) } - diff --git a/src/parser.rs b/src/parser.rs index b03023511..68b28d78b 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -47,6 +47,7 @@ simple_enum_error! { EmptyHost => "empty host", InvalidScheme => "invalid scheme", InvalidPort => "invalid port number", + InvalidIpv4Address => "invalid IPv4 address", InvalidIpv6Address => "invalid IPv6 address", InvalidDomainCharacter => "invalid domain character", InvalidCharacter => "invalid character", diff --git a/src/tests.rs b/src/tests.rs index e25500f5c..c5d12c2c4 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -8,6 +8,7 @@ use std::char; +use std::net::{Ipv4Addr, Ipv6Addr}; use super::{UrlParser, Url, SchemeData, RelativeSchemeData, Host}; @@ -347,3 +348,21 @@ fn relative_scheme_data_equality() { let b: Url = url("http://foo.com/"); check_eq(&a, &b); } + +#[test] +fn host() { + let a = Host::parse("www.mozilla.org").unwrap(); + let b = Host::parse("1.35.33.49").unwrap(); + let c = Host::parse("[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]").unwrap(); + assert_eq!(a, Host::Domain("www.mozilla.org".to_owned())); + assert_eq!(b, Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_eq!(c, Host::Ipv6(Ipv6Addr::new(0x2001, 0x0db8, 0x85a3, 0x08d3, + 0x1319, 0x8a2e, 0x0370, 0x7344))); + assert_eq!(Host::parse("[::]").unwrap(), Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 0))); + assert_eq!(Host::parse("[::1]").unwrap(), Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1))); + assert_eq!(Host::parse("0x1.0X23.0x21.061").unwrap(), Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_eq!(Host::parse("0x1232131").unwrap(), Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert!(Host::parse("42.0x1232131").is_err()); + assert_eq!(Host::parse("111").unwrap(), Host::Ipv4(Ipv4Addr::new(0, 0, 0, 111))); + assert_eq!(Host::parse("2..2.3").unwrap(), 
Host::Domain("2..2.3".to_owned())); +} diff --git a/src/urltestdata.txt b/src/urltestdata.txt index ece4e7131..04ea893f1 100644 --- a/src/urltestdata.txt +++ b/src/urltestdata.txt @@ -162,7 +162,7 @@ http://www.google.com/foo?bar=baz# about:blank s:http h:www.google.com p:/foo q: http://www.google.com/foo?bar=baz#\s\u00BB s:http h:www.google.com p:/foo q:?bar=baz f:#\s%C2%BB http://[www.google.com]/ http://www.google.com s:http h:www.google.com p:/ -http://192.0x00A80001 s:http h:192.0x00a80001 p:/ +http://192.0x00A80001 s:http h:192.168.0.1 p:/ http://www/foo%2Ehtml s:http h:www p:/foo%2Ehtml http://www/foo/%2E/html s:http h:www p:/foo/html http://user:pass@/