From 069718be5280b3fbcc602941240f086fbe8210c0 Mon Sep 17 00:00:00 2001 From: Sean Kim <40130483+theskim@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:17:36 -0500 Subject: [PATCH 1/3] Normalize URL paths: convert /.//p, /..//p, and //p to p --- url/src/lib.rs | 54 ++++++++++++++++++++++++++++++++- url/tests/expected_failures.txt | 4 --- url/tests/unit.rs | 40 ++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 5 deletions(-) diff --git a/url/src/lib.rs b/url/src/lib.rs index b8bc668a..8432e48a 100644 --- a/url/src/lib.rs +++ b/url/src/lib.rs @@ -1756,6 +1756,39 @@ impl Url { let old_after_path_pos = to_u32(self.serialization.len()).unwrap(); let cannot_be_a_base = self.cannot_be_a_base(); let scheme_type = SchemeType::from(self.scheme()); + let mut path_empty = false; + + // Check ':' and then see if the next character is '/' + let mut has_host = if let Some(index) = self.serialization.find(":") { + if self.serialization.len() > index + 1 + && self.serialization.as_bytes().get(index + 1) == Some(&b'/') + { + let rest = &self.serialization[(index + ":/".len())..]; + let host_part = rest.split('/').next().unwrap_or(""); + path_empty = rest.is_empty(); + !host_part.is_empty() && !host_part.contains('@') + } else { + false + } + } else { + false + }; + + // Ensure the path length is greater than 1 to account + // for cases where "/." is already appended from serialization + // If we set path, then we already checked the other two conditions: + // https://url.spec.whatwg.org/#url-serializing + // 1. The host is null + // 2. the first segment of the URL's path is an empty string + if self.path().len() + path.len() > 1 { + if let Some(index) = self.serialization.find(":") { + let removal_start = index + ":".len(); + if self.serialization[removal_start..].starts_with("/.") { + self.path_start = removal_start as u32; + } + } + } + self.serialization.truncate(self.path_start as usize); self.mutate(|parser| { if cannot_be_a_base { @@ -1765,7 +1798,6 @@ impl Url { } parser.parse_cannot_be_a_base_path(parser::Input::new_no_trim(path)); } else { - let mut has_host = true; // FIXME parser.parse_path_start( scheme_type, &mut has_host, @@ -1773,6 +1805,26 @@ impl Url { ); } }); + + // For cases where normalization is applied across both the serialization and the path. + // Append "/." immediately after the scheme (up to ":") + // This is done if three conditions are met. + // https://url.spec.whatwg.org/#url-serializing + // 1. The host is null + // 2. The url's path length is greater than 1 + // 3. the first segment of the URL's path is an empty string + if !has_host && path.len() > 1 && path_empty { + if let Some(index) = self.serialization.find(":") { + if self.serialization.len() > index + 2 + && self.serialization.as_bytes().get(index + 1) == Some(&b'/') + && self.serialization.as_bytes().get(index + 2) == Some(&b'/') + { + self.serialization.insert_str(index + ":".len(), "/."); + self.path_start += "/.".len() as u32; + } + } + } + self.restore_after_path(old_after_path_pos, &after_path); } diff --git a/url/tests/expected_failures.txt b/url/tests/expected_failures.txt index 899e7f70..d1ed726c 100644 --- a/url/tests/expected_failures.txt +++ b/url/tests/expected_failures.txt @@ -43,7 +43,3 @@ set pathname to <\\\\> set pathname to set pathname to - set pathname to - set pathname to - set pathname to - set pathname to

diff --git a/url/tests/unit.rs b/url/tests/unit.rs index 6a9430bd..57465395 100644 --- a/url/tests/unit.rs +++ b/url/tests/unit.rs @@ -1387,3 +1387,43 @@ fn serde_error_message() { r#"relative URL without a base: "§invalid#+#*Ä" at line 1 column 25"# ); } + +#[test] +fn test_fuzzing_uri_failures() { + use url::quirks; + let mut url = Url::parse("data:/.dummy.path").unwrap(); + assert!(!url.cannot_be_a_base()); + + url.set_path(".dummy.path"); + assert_eq!(url.as_str(), "data:/.dummy.path"); + assert_eq!(url.path(), "/.dummy.path"); + url.check_invariants().unwrap(); + + url.path_segments_mut() + .expect("should have path segments") + .push(".another.dummy.path"); + assert_eq!(url.as_str(), "data:/.dummy.path/.another.dummy.path"); + assert_eq!(url.path(), "/.dummy.path/.another.dummy.path"); + url.check_invariants().unwrap(); + + url = Url::parse("web+demo:/").unwrap(); + assert!(!url.cannot_be_a_base()); + + url.set_path("//.dummy.path"); + assert_eq!(url.path(), "//.dummy.path"); + + let segments: Vec<_> = url + .path_segments() + .expect("should have path segments") + .collect(); + assert_eq!(segments, vec!["", ".dummy.path"]); + assert_eq!(url.as_str(), "web+demo:/.//.dummy.path"); + + quirks::set_hostname(&mut url, ".dummy.host").unwrap(); + assert_eq!(url.as_str(), "web+demo://.dummy.host//.dummy.path"); + url.check_invariants().unwrap(); + + quirks::set_hostname(&mut url, "").unwrap(); + assert_eq!(url.as_str(), "web+demo:////.dummy.path"); + url.check_invariants().unwrap(); +} From 86bd58fe2b6968488a856c99cea924b345791046 Mon Sep 17 00:00:00 2001 From: Sean Kim <40130483+theskim@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:18:54 -0500 Subject: [PATCH 2/3] Fix mutating path of URL without authority (idempotency, empty path segments) --- url/src/lib.rs | 13 +++++++-- url/src/path_segments.rs | 42 ++++++++++++++++++++++++-- url/tests/expected_failures.txt | 2 -- url/tests/unit.rs | 52 +++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 6 deletions(-) diff --git a/url/src/lib.rs b/url/src/lib.rs index 8432e48a..35006aeb 100644 --- a/url/src/lib.rs +++ b/url/src/lib.rs @@ -2128,7 +2128,7 @@ impl Url { } else { self.host_end }; - let suffix = self.slice(old_suffix_pos..).to_owned(); + let mut suffix = self.slice(old_suffix_pos..).to_owned(); self.serialization.truncate(self.host_start as usize); if !self.has_authority() { debug_assert!(self.slice(self.scheme_end..self.host_start) == ":"); @@ -2148,7 +2148,16 @@ impl Url { write!(&mut self.serialization, ":{}", port).unwrap(); } } - let new_suffix_pos = to_u32(self.serialization.len()).unwrap(); + let mut new_suffix_pos = to_u32(self.serialization.len()).unwrap(); + + // Remove starting "/." for empty path segment followed by the host + if suffix.starts_with("/.//") { + let adjustment: usize = "/.".len(); + suffix.drain(..adjustment); + // pathname should be "//p" not "p" given that the first segment was empty + new_suffix_pos -= adjustment as u32; + } + self.serialization.push_str(&suffix); let adjust = |index: &mut u32| { diff --git a/url/src/path_segments.rs b/url/src/path_segments.rs index e6363c5c..5da8af0d 100644 --- a/url/src/path_segments.rs +++ b/url/src/path_segments.rs @@ -239,7 +239,7 @@ impl PathSegmentsMut<'_> { I::Item: AsRef, { let scheme_type = SchemeType::from(self.url.scheme()); - let path_start = self.url.path_start as usize; + let mut path_start = self.url.path_start as usize; self.url.mutate(|parser| { parser.context = parser::Context::PathSegmentSetter; for segment in segments { @@ -253,7 +253,44 @@ impl PathSegmentsMut<'_> { { parser.serialization.push('/'); } - let mut has_host = true; // FIXME account for this? + + let mut path_empty = false; + + // Check ':' and then see if the next character is '/' + let mut has_host = if let Some(index) = parser.serialization.find(":") { + if parser.serialization.len() > index + 1 + && parser.serialization.as_bytes().get(index + 1) == Some(&b'/') + { + let rest = &parser.serialization[(index + ":/".len())..]; + let host_part = rest.split('/').next().unwrap_or(""); + path_empty = rest.is_empty(); + !host_part.is_empty() && !host_part.contains('@') + } else { + false + } + } else { + false + }; + + // For cases where normalization is applied across both the serialization and the path. + // Append "/." immediately after the scheme (up to ":") + // This is done if three conditions are met. + // https://url.spec.whatwg.org/#url-serializing + // 1. The host is null + // 2. The url's path length is greater than 1 + // 3. the first segment of the URL's path is an empty string + if !has_host && segment.len() > 1 && path_empty { + if let Some(index) = parser.serialization.find(":") { + if parser.serialization.len() == index + 2 + && parser.serialization.as_bytes().get(index + 1) == Some(&b'/') + { + // Append an extra '/' to ensure that "/./path" becomes "/.//path" + parser.serialization.insert_str(index + ":".len(), "/./"); + path_start += "/.".len(); + } + } + } + parser.parse_path( scheme_type, &mut has_host, @@ -262,6 +299,7 @@ impl PathSegmentsMut<'_> { ); } }); + self.url.path_start = path_start as u32; self } } diff --git a/url/tests/expected_failures.txt b/url/tests/expected_failures.txt index d1ed726c..9bf60b34 100644 --- a/url/tests/expected_failures.txt +++ b/url/tests/expected_failures.txt @@ -36,8 +36,6 @@ set hostname to set hostname to - set hostname to - set hostname to <> set pathname to <> set href to set pathname to <\\\\> diff --git a/url/tests/unit.rs b/url/tests/unit.rs index 57465395..8312efe6 100644 --- a/url/tests/unit.rs +++ b/url/tests/unit.rs @@ -1427,3 +1427,55 @@ fn test_fuzzing_uri_failures() { assert_eq!(url.as_str(), "web+demo:////.dummy.path"); url.check_invariants().unwrap(); } + +#[test] +fn test_can_be_a_base_with_set_path() { + use url::quirks; + let mut url = Url::parse("web+demo:/").unwrap(); + assert!(!url.cannot_be_a_base()); + + url.set_path("//not-a-host"); + assert_eq!(url.path(), "//not-a-host"); + + let segments: Vec<_> = url + .path_segments() + .expect("should have path segments") + .collect(); + + assert_eq!(segments, vec!["", "not-a-host"]); + + url.set_query(Some("query")); + url.set_fragment(Some("frag")); + + assert_eq!(url.as_str(), "web+demo:/.//not-a-host?query#frag"); + quirks::set_hostname(&mut url, "test").unwrap(); + assert_eq!(url.as_str(), "web+demo://test//not-a-host?query#frag"); + url.check_invariants().unwrap(); + quirks::set_hostname(&mut url, "").unwrap(); + assert_eq!(url.as_str(), "web+demo:////not-a-host?query#frag"); + url.check_invariants().unwrap(); +} + +#[test] +fn test_can_be_a_base_with_path_segments_mut() { + let mut url = Url::parse("web+demo:/").unwrap(); + assert!(!url.cannot_be_a_base()); + + url.path_segments_mut() + .expect("should have path segments") + .push("") + .push("not-a-host"); + + url.set_query(Some("query")); + url.set_fragment(Some("frag")); + + assert_eq!(url.as_str(), "web+demo:/.//not-a-host?query#frag"); + assert_eq!(url.path(), "//not-a-host"); + url.check_invariants().unwrap(); + + let segments: Vec<_> = url + .path_segments() + .expect("should have path segments") + .collect(); + assert_eq!(segments, vec!["", "not-a-host"]); +} From 2c307f16b78eb5c02356945ca7093e95822ef9a2 Mon Sep 17 00:00:00 2001 From: Sean Kim <40130483+theskim@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:30:33 -0500 Subject: [PATCH 3/3] Improve testing for host, path, query, hash when mutating path --- url/tests/unit.rs | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/url/tests/unit.rs b/url/tests/unit.rs index 8312efe6..85b7f1f0 100644 --- a/url/tests/unit.rs +++ b/url/tests/unit.rs @@ -1479,3 +1479,35 @@ fn test_can_be_a_base_with_path_segments_mut() { .collect(); assert_eq!(segments, vec!["", "not-a-host"]); } + +#[test] +fn test_valid_indices_after_set_path() { + // Testing everything + let mut url = Url::parse("moz:/").unwrap(); + assert!(!url.cannot_be_a_base()); + + url.set_path("/.//p"); + url.set_host(Some("host")).unwrap(); + url.set_query(Some("query")); + url.set_fragment(Some("frag")); + assert_eq!(url.as_str(), "moz://host//p?query#frag"); + assert_eq!(url.host(), Some(Host::Domain("host"))); + assert_eq!(url.path(), "//p"); + assert_eq!(url.query(), Some("query")); + assert_eq!(url.fragment(), Some("frag")); + url.check_invariants().unwrap(); + + url = Url::parse("moz:/.//").unwrap(); + assert!(!url.cannot_be_a_base()); + + url.set_path("p"); + url.set_host(Some("host")).unwrap(); + url.set_query(Some("query")); + url.set_fragment(Some("frag")); + assert_eq!(url.as_str(), "moz://host/p?query#frag"); + assert_eq!(url.host(), Some(Host::Domain("host"))); + assert_eq!(url.path(), "/p"); + assert_eq!(url.query(), Some("query")); + assert_eq!(url.fragment(), Some("frag")); + url.check_invariants().unwrap(); +}