Skip to content

Rewrite libcore's UTF-8 validation for performance #107760

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 268 additions & 1 deletion library/alloc/tests/str.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::assert_matches::assert_matches;
use std::borrow::Cow;
use std::cmp::Ordering::{Equal, Greater, Less};
use std::str::{from_utf8, from_utf8_unchecked};
use std::str::{from_utf8, from_utf8_unchecked, Utf8Error};

#[test]
fn test_le() {
Expand Down Expand Up @@ -983,6 +983,250 @@ fn from_utf8_error() {
test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3));
}

/// Returns `true` when `e` reports exactly `valid` valid leading bytes and an
/// error length equal to `err_len`.
///
/// This is a `const fn` so it can be used in constant evaluation; the two
/// `Option`s are therefore compared with a `match` rather than `==`.
const fn utf8_error_eq(e: &Utf8Error, valid: usize, err_len: Option<usize>) -> bool {
    if e.valid_up_to() != valid {
        return false;
    }
    match (e.error_len(), err_len) {
        (None, None) => true,
        (Some(actual), Some(expected)) => actual == expected,
        (None, Some(_)) | (Some(_), None) => false,
    }
}

/// Checks that the error reported by `from_utf8` for a malformed input does
/// not depend on how much valid data surrounds it.
///
/// Every case is validated behind `i` ASCII bytes plus `j` three-byte
/// characters (for all `i`, `j` in `0..N`), and then again with up to `N - 1`
/// valid two-byte characters appended after it.
#[test]
fn from_utf8_error_offset() {
    // Fewer prefix/suffix combinations are tried under Miri.
    const N: usize = if cfg!(miri) { 8 } else { 64 };

    /// Asserts that `input`, placed behind each valid prefix, fails with
    /// `valid` valid bytes (relative to the start of `input`) and `err_len`.
    #[track_caller]
    fn check(input: &[u8], valid: usize, err_len: Option<usize>) {
        // Once more data follows `input`, a sequence that was merely truncated
        // (`None`) is terminated by a non-continuation byte, so the whole
        // truncated tail `input[valid..]` is reported as the invalid sequence.
        // Errors that already had a length are unaffected by trailing data.
        let err_len_with_suffix = Some(err_len.unwrap_or(input.len() - valid));

        // Upper bound on the buffer size: `i` ASCII bytes, `3 * j` prefix
        // bytes, the input itself, and `2 * (N - 1)` suffix bytes.
        let mut buf = Vec::with_capacity(input.len() + N * 6);
        for i in 0..N {
            for j in 0..N {
                buf.clear();
                buf.resize(i, b'a');
                for _ in 0..j {
                    buf.extend_from_slice(b"\xE6\x88\x91");
                }
                let bump = buf.len();
                buf.extend_from_slice(input);

                assert!(
                    utf8_error_eq(&from_utf8(&buf).unwrap_err(), bump + valid, err_len),
                    "offset ({i}, {j}, _): on {input:?} ({buf:?})"
                );
                for k in 1..N {
                    buf.extend_from_slice(b"\xD0\xB6");
                    assert!(
                        utf8_error_eq(
                            &from_utf8(&buf).unwrap_err(),
                            bump + valid,
                            err_len_with_suffix,
                        ),
                        "(with suffix) offset ({i}, {j}, {k}): on {input:?} ({buf:?})"
                    );
                }
            }
        }
    }
    check(b"A\xC3\xA9 \xFF ", 4, Some(1));
    check(b"A\xC3\xA9 \x80 ", 4, Some(1));
    check(b"A\xC3\xA9 \xC1 ", 4, Some(1));
    check(b"A\xC3\xA9 \xC1", 4, Some(1));
    check(b"A\xC3\xA9 \xC2", 4, None);
    check(b"A\xC3\xA9 \xC2 ", 4, Some(1));
    check(b"A\xC3\xA9 \xC2\xC0", 4, Some(1));
    check(b"A\xC3\xA9 \xE0", 4, None);
    check(b"A\xC3\xA9 \xE0\x9F", 4, Some(1));
    check(b"A\xC3\xA9 \xE0\xA0", 4, None);
    check(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2));
    check(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2));
    check(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1));
    check(b"A\xC3\xA9 \xF1", 4, None);
    check(b"A\xC3\xA9 \xF1\x80", 4, None);
    check(b"A\xC3\xA9 \xF1\x80\x80", 4, None);
    check(b"A\xC3\xA9 \xF1 ", 4, Some(1));
    check(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2));
    check(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3));
    check(b"\xc3\x28", 0, Some(1));
    check(b"\xa0\xa1", 0, Some(1));
    check(b"\xe2\x28\xa1", 0, Some(1));
    check(b"\xe2\x82\x28", 0, Some(2));
    check(b"\xf0\x28\x8c\xbc", 0, Some(1));
    check(b"\xf0\x90\x28\xbc", 0, Some(2));
    check(b"\xf0\x28\x8c\x28", 0, Some(1));
    check(b"\xc0\x9f", 0, Some(1));
    check(b"\xf5\xff\xff\xff", 0, Some(1));
    check(b"\xed\xa0\x81", 0, Some(1));
    check(b"\xf8\x90\x80\x80\x80", 0, Some(1));
    check(b"123456789012345\xed", 15, None);
    check(b"123456789012345\xf1", 15, None);
    check(b"123456789012345\xc2", 15, None);
    check(b"\xC2\x7F", 0, Some(1));
    check(b"\xce", 0, None);
    check(b"\xce\xba\xe1", 2, None);
    check(b"\xce\xba\xe1\xbd", 2, None);
    check(b"\xce\xba\xe1\xbd\xb9\xcf", 5, None);
    check(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce", 7, None);
    check(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", 9, None);
    check(b"\xdf", 0, None);
    check(b"\xef\xbf", 0, None);
    check(b"\x80", 0, Some(1));
    check(b"\x91\x85\x95\x9e", 0, Some(1));
    check(b"\x6c\x02\x8e\x18", 2, Some(1));
    check(b"\xFF", 0, Some(1));
    check(b"a\xFF", 1, Some(1));
    check(b"\xCE\xB2\xFF", 2, Some(1));
    check(b"\xE2\x98\x83\xFF", 3, Some(1));
    check(b"\xF0\x9D\x9D\xB1\xFF", 4, Some(1));
    check(b"\xCE\xF0", 0, Some(1));
    check(b"\xE2\x98\xF0", 0, Some(2));
    check(b"\xF0\x9D\x9D\xF0", 0, Some(3));
    check(b"\xF0\x82\x82\xAC", 0, Some(1));
    check(b"a\xF0\x82\x82\xAC", 1, Some(1));
    check(b"\xE2\x98\x83\xF0\x82\x82\xAC", 3, Some(1));
    check(b"\xED\xA0\x80", 0, Some(1));
    check(b"\xE2\x98\x83\xED\xA0\x80", 3, Some(1));
    check(b"\xE2\x98\x83\xCE\xE2\x98\x83", 3, Some(1));
    check(b"\xCEa", 0, Some(1));
    check(b"a\xCEa", 1, Some(1));
    check(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83", 3, Some(2));
    check(b"\xE2\x98a", 0, Some(2));
    check(b"a\xE2\x98a", 1, Some(2));
    check(b"\xF0\x9D\x9Ca", 0, Some(3));
    check(b"a\xF0\x9D\x9Ca", 1, Some(3));
    check(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83", 4, Some(3));
    check(b"foobar\xF1\x80\x80quux", 6, Some(3));
    check(b"\xCE", 0, None);
    check(b"a\xCE", 1, None);
    check(b"\xE2\x98\x83\xCE", 3, None);
    check(b"\xE2\x98", 0, None);
    check(b"a\xE2\x98", 1, None);
    check(b"\xE2\x98\x83\xE2\x98", 3, None);
    check(b"\xF0\x9D\x9C", 0, None);
    check(b"a\xF0\x9D\x9C", 1, None);
    check(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C", 4, None);
    check(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF", 8, Some(1));
}

/// Validates malformed inputs with `from_utf8` both at run time and during
/// constant evaluation, expecting the same `valid_up_to` / `error_len` pair
/// from each.
#[test]
fn utf8_error_cases_const() {
    // Each invocation expands to a `const` item (evaluated by the compiler, so
    // a mismatch there fails the build) followed by the equivalent run-time
    // assertion.
    macro_rules! expect_utf8_error {
        ($input:expr, $valid_up_to:expr, $error_len:expr $(,)?) => {{
            const _: () = match from_utf8($input) {
                Err(err) => assert!(utf8_error_eq(&err, $valid_up_to, $error_len)),
                Ok(_) => panic!(concat!("shouldn't pass: ", stringify!($input))),
            };
            let runtime_err = from_utf8($input).unwrap_err();
            assert!(utf8_error_eq(&runtime_err, $valid_up_to, $error_len));
        }};
    }
    expect_utf8_error!(b"\xc3\x28", 0, Some(1));
    expect_utf8_error!(b"\xa0\xa1", 0, Some(1));
    expect_utf8_error!(b"\xe2\x28\xa1", 0, Some(1));
    expect_utf8_error!(b"\xe2\x82\x28", 0, Some(2));
    expect_utf8_error!(b"\xf0\x28\x8c\xbc", 0, Some(1));
    expect_utf8_error!(b"\xf0\x90\x28\xbc", 0, Some(2));
    expect_utf8_error!(b"\xf0\x28\x8c\x28", 0, Some(1));
    expect_utf8_error!(b"\xc0\x9f", 0, Some(1));
    expect_utf8_error!(b"\xf5\xff\xff\xff", 0, Some(1));
    expect_utf8_error!(b"\xed\xa0\x81", 0, Some(1));
    expect_utf8_error!(b"\xf8\x90\x80\x80\x80", 0, Some(1));
    expect_utf8_error!(b"123456789012345\xed", 15, None);
    expect_utf8_error!(b"123456789012345\xf1", 15, None);
    expect_utf8_error!(b"123456789012345\xc2", 15, None);
    expect_utf8_error!(b"\xC2\x7F", 0, Some(1));
    expect_utf8_error!(b"\xce", 0, None);
    expect_utf8_error!(b"\xce\xba\xe1", 2, None);
    expect_utf8_error!(b"\xce\xba\xe1\xbd", 2, None);
    expect_utf8_error!(b"\xce\xba\xe1\xbd\xb9\xcf", 5, None);
    expect_utf8_error!(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce", 7, None);
    expect_utf8_error!(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", 9, None);
    expect_utf8_error!(b"\xdf", 0, None);
    expect_utf8_error!(b"\xef\xbf", 0, None);
    expect_utf8_error!(b"\x80", 0, Some(1));
    expect_utf8_error!(b"\x91\x85\x95\x9e", 0, Some(1));
    expect_utf8_error!(b"\x6c\x02\x8e\x18", 2, Some(1));
    // Longer inputs: the stray 0x80 sits at byte offset 335.
    expect_utf8_error!(
        &[
            0x25, 0x5b, 0x6e, 0x2c, 0x32, 0x2c, 0x5b, 0x5b, 0x33, 0x2c, 0x34, 0x2c, 0x05, 0x29,
            0x2c, 0x33, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x5b, 0x5b,
            0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b,
            0x5b, 0x5d, 0x2c, 0x35, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x33, 0x2c, 0x37, 0x2e, 0x33,
            0x2c, 0x39, 0x2e, 0x34, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x33, 0x2c, 0x37,
            0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x34, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d,
            0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x20, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x23, 0x0a, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x7e, 0x7e, 0x0a, 0x0a, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b,
            0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5d, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x33,
            0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x34, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39,
            0x2e, 0x33, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x34, 0x5d, 0x5d, 0x5d, 0x5d,
            0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x01, 0x01, 0x80,
            0x01, 0x01, 0x01, 0x79, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01,
        ],
        335,
        Some(1),
    );
    // The stray 0x80 sits at byte offset 15.
    expect_utf8_error!(
        &[
            0x5bu8, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b,
            0x5b, 0x80, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, b'0', 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01,
        ],
        15,
        Some(1),
    );
    // The stray 0x80 sits at byte offset 172.
    expect_utf8_error!(
        &[
            0x20, 0x0b, 0x01, 0x01, 0x01, 0x64, 0x3a, 0x64, 0x3a, 0x64, 0x3a, 0x5b, 0x5b, 0x5b,
            0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b,
            0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b,
            0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x30, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x80, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x01u8,
        ],
        172,
        Some(1),
    );
}

#[test]
fn test_as_bytes() {
// no null
Expand All @@ -996,6 +1240,29 @@ fn test_as_bytes() {
assert_eq!("ศไทย中华Việt Nam".as_bytes(), v);
}

/// Encodes every `char` as UTF-8 and checks that `from_utf8` accepts the
/// encoding and yields the same string back. Not compiled under Miri.
#[test]
#[cfg(not(miri))]
fn from_utf8_all_chars() {
    // Code points for which `char::from_u32` returns `None` are skipped.
    let all_chars = (0..=u32::from(char::MAX)).filter_map(char::from_u32);
    // `encode_utf8` returns only the bytes it wrote, so one scratch buffer
    // can be shared across iterations.
    let mut scratch = [0u8; 4];
    for ch in all_chars {
        let encoded: &str = ch.encode_utf8(&mut scratch);
        assert_eq!(Ok(encoded), from_utf8(encoded.as_bytes()));
    }
}

/// Checks that `from_utf8` accepts inputs mixing ASCII with three- and
/// four-byte sequences.
#[test]
fn test_multi() {
    let well_formed: [&[u8]; 6] = [
        b"abc",
        b"a\xE2\x98\x83a",
        b"a\xF0\x9D\x9C\xB7a",
        b"\xE2\x98\x83\xF0\x9D\x9C\xB7",
        b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",
        b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",
    ];
    for bytes in well_formed {
        assert!(from_utf8(bytes).is_ok(), "rejected valid UTF-8: {bytes:?}");
    }
}

#[test]
#[should_panic]
fn test_as_bytes_fail() {
Expand Down
Loading