rust-lang · bors · Feb 6, 2022 · Oct 30, 2021 · Oct 30, 2021 · Oct 30, 2021
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs
@@ -2230,3 +2230,45 @@ fn utf8_chars() {
     assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok()));
     assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok()));
 }
+
+#[test]
+fn utf8_char_counts() { // `chars().count()` over many lengths and head/tail alignments
+    let strs = [("e", 1), ("é", 1), ("€", 1), ("\u{10000}", 1), ("eé€\u{10000}", 4)];
+    let mut reps =
+        [8, 64, 256, 512, 1024].iter().copied().flat_map(|n| n - 8..=n + 8).collect::<Vec<usize>>();
+    if cfg!(not(miri)) { // the repeat counts around 1 << 16 are skipped under miri
+        let big = 1 << 16;
+        reps.extend(big - 8..=big + 8);
+    }
+    let counts = if cfg!(miri) { 0..1 } else { 0..8 }; // miri: empty padding only
+    let padding = counts.map(|len| " ".repeat(len)).collect::<Vec<String>>();
+
+    for repeat in reps {
+        for (tmpl_str, tmpl_char_count) in strs {
+            for pad_start in &padding {
+                for pad_end in &padding {
+                    // Create a string with padding...
+                    let with_padding =
+                        format!("{}{}{}", pad_start, tmpl_str.repeat(repeat), pad_end);
+                    // ...and then skip past that padding. This should ensure
+                    // that we test several different alignments for both head
+                    // and tail.
+                    let si = pad_start.len();
+                    let ei = with_padding.len() - pad_end.len();
+                    let target = &with_padding[si..ei];
+
+                    assert!(!target.starts_with(" ") && !target.ends_with(" "));
+                    let expected_count = tmpl_char_count * repeat; // chars per copy * copies
+                    assert_eq!(
+                        expected_count,
+                        target.chars().count(),
+                        "wrong count for `{:?}.repeat({})` (padding: `{:?}`)",
+                        tmpl_str,
+                        repeat,
+                        (pad_start.len(), pad_end.len()),
+                    );
+                }
+            }
+        }
+    }
+}
diff --git a/library/core/benches/str.rs b/library/core/benches/str.rs
@@ -1,33 +1,10 @@
 use std::str;
 use test::{black_box, Bencher};
 
-const LOREM_SHORT: &str = "Lorem ipsum";
-
-const LOREM: &str = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
-Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.
-Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.
-Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat.
-Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis.
-At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, At accusam aliquyam diam diam dolore dolores duo eirmod eos erat, et nonumy sed tempor et et invidunt justo labore Stet clita ea et gubergren, kasd magna no rebum. sanctus sea sed takimata ut vero voluptua. est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur";
-
-const EMOJI: &str = "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘😗☺😚😙🥲😋😛😜🤪😝🤑🤗🤭🤫🤔🤐🤨😐😑😶😶‍🌫️😏😒🙄😬😮‍💨🤥😌😔😪🤤😴😷🤒🤕🤢🤮🤧🥵🥶🥴😵😵‍💫🤯🤠🥳🥸😎🤓🧐😕😟🙁☹😮😯😲😳🥺😦😧😨😰😥😢😭😱😖😣😞😓😩😫🥱😤😡😠🤬😈👿💀☠💩🤡👹👺👻👽👾🤖😺😸😹😻😼😽🙀😿😾🙈🙉🙊💋💌💘💝💖💗💓💞💕💟❣💔❤️‍🔥❤️‍🩹❤🧡💛💚💙💜🤎🖤🤍💯💢💥💫💦💨🕳💣💬👁️‍🗨️🗨🗯💭💤👋🤚🖐✋🖖👌🤌🤏✌🤞🤟🤘🤙👈👉👆🖕👇☝👍👎✊👊🤛🤜👏🙌👐🤲🤝🙏✍💅🤳💪🦾🦿🦵🦶👂🦻👃🧠🫀🫁🦷🦴👀👁👅👄👶🧒👦👧🧑👱👨🧔🧔‍♂️🧔‍♀️👨‍🦰👨‍🦱👨‍🦳👨‍🦲👩👩‍🦰🧑‍🦰👩‍🦱🧑‍🦱👩‍🦳🧑‍🦳👩‍🦲🧑‍🦲👱‍♀️👱‍♂️🧓👴👵🙍🙍‍♂️🙍‍♀️🙎🙎‍♂️🙎‍♀️🙅🙅‍♂️🙅‍♀️🙆🙆‍♂️🙆‍♀️💁💁‍♂️💁‍♀️🙋🙋‍♂️🙋‍♀️🧏🧏‍♂️🧏‍♀️🙇🙇‍♂️🙇‍♀️🤦🤦‍♂️🤦‍♀️🤷🤷‍♂️🤷‍♀️🧑‍⚕️👨‍⚕️👩‍⚕️🧑‍🎓👨‍🎓👩‍🎓🧑‍🏫👨‍🏫👩‍🏫🧑‍⚖️👨‍⚖️👩‍⚖️🧑‍🌾👨‍🌾👩‍🌾🧑‍🍳👨‍🍳👩‍🍳🧑‍🔧👨‍🔧👩‍🔧🧑‍🏭👨‍🏭👩‍🏭🧑‍💼👨‍💼👩‍💼🧑‍🔬👨‍🔬👩‍🔬🧑‍💻👨‍💻👩‍💻🧑‍🎤👨‍🎤👩‍🎤🧑‍🎨👨‍🎨👩‍🎨🧑‍✈️👨‍✈️👩‍✈️🧑‍🚀👨‍🚀👩‍🚀🧑‍🚒👨‍🚒👩‍🚒👮👮‍♂️👮‍♀️🕵🕵️‍♂️🕵️‍♀️💂💂‍♂️💂‍♀️🥷👷👷‍♂️👷‍♀️🤴👸👳👳‍♂️👳‍♀️👲🧕🤵🤵‍♂️🤵‍♀️👰👰‍♂️👰‍♀️🤰🤱👩‍🍼👨‍🍼🧑‍🍼👼🎅🤶🧑‍🎄🦸🦸‍♂️🦸‍♀️🦹🦹‍♂️🦹‍♀️🧙🧙‍♂️🧙‍♀️🧚🧚‍♂️🧚‍♀️🧛🧛‍♂️🧛‍♀️🧜🧜‍♂️🧜‍♀️🧝🧝‍♂️🧝‍♀️🧞🧞‍♂️🧞‍♀️🧟🧟‍♂️🧟‍♀️💆💆‍♂️💆‍♀️💇💇‍♂️💇‍♀️🚶🚶‍♂️🚶‍♀️🧍🧍‍♂️🧍‍♀️🧎🧎‍♂️🧎‍♀️🧑‍🦯👨‍🦯👩‍🦯🧑‍🦼👨‍🦼👩‍🦼🧑‍🦽👨‍🦽👩‍🦽🏃🏃‍♂️🏃‍♀️💃🕺🕴👯👯‍♂️👯‍♀️🧖🧖‍♂️🧖‍♀️🧗🧗‍♂️🧗‍♀️🤺🏇⛷🏂🏌🏌️‍♂️🏌️‍♀️🏄🏄‍♂️🏄‍♀️🚣🚣‍♂️🚣‍♀️🏊🏊‍♂️🏊‍♀️⛹⛹️‍♂️⛹️‍♀️🏋🏋️‍♂️🏋️‍♀️🚴🚴‍♂️🚴‍♀️🚵🚵‍♂️🚵‍♀️🤸🤸‍♂️🤸‍♀️🤼🤼‍♂️🤼‍♀️🤽🤽‍♂️🤽‍♀️🤾🤾‍♂️🤾‍♀️🤹🤹‍♂️🤹‍♀️🧘🧘‍♂️🧘‍♀️🛀🛌🧑‍🤝‍🧑👭👫👬💏👩‍❤️‍💋‍👨👨‍❤️‍💋‍👨👩‍❤️‍💋‍👩💑👩‍❤️‍👨👨‍❤️‍👨👩‍❤️‍👩👪👨‍👩‍👦👨‍👩‍👧👨‍👩‍👧‍👦👨‍👩‍👦‍👦👨‍👩‍👧‍👧👨‍👨‍👦👨‍👨‍👧👨‍👨‍👧‍👦👨‍👨‍👦‍👦👨‍👨‍👧‍👧👩‍👩‍👦👩‍👩‍👧👩‍👩‍👧‍👦👩‍👩‍👦‍👦👩‍👩‍👧‍👧👨‍👦👨‍👦‍👦👨‍👧👨‍👧‍👦👨‍👧‍👧👩‍👦👩‍👦‍👦👩‍👧👩‍👧‍👦👩‍👧‍👧🗣👤👥🫂👣🦰🦱🦳🦲🐵🐒🦍🦧🐶🐕🦮🐕‍🦺🐩🐺🦊🦝🐱🐈🐈‍⬛🦁🐯🐅🐆🐴🐎🦄🦓🦌🦬🐮🐂🐃🐄🐷🐖🐗🐽🐏🐑🐐🐪🐫🦙🦒🐘🦣🦏🦛🐭🐁🐀🐹🐰🐇🐿🦫🦔🦇🐻🐻‍❄️🐨🐼🦥🦦🦨🦘🦡🐾🦃🐔🐓🐣🐤🐥🐦🐧🕊🦅🦆🦢🦉🦤🪶🦩🦚🦜🐸🐊🐢🦎🐍🐲🐉🦕🦖🐳🐋🐬🦭🐟🐠🐡🦈🐙🐚🐌🦋🐛🐜🐝🪲🐞🦗🪳🕷🕸🦂🦟🪰🪱🦠💐🌸💮🏵🌹🥀🌺🌻🌼🌷🌱🪴🌲🌳🌴🌵🌾🌿☘🍀🍁🍂🍃🍇🍈🍉🍊🍋🍌🍍🥭🍎🍏🍐🍑🍒🍓🫐🥝🍅🫒🥥🥑🍆🥔🥕🌽🌶🫑🥒🥬🥦🧄🧅🍄🥜🌰🍞🥐🥖🫓🥨🥯🥞🧇🧀🍖🍗🥩🥓🍔🍟🍕🌭🥪🌮🌯🫔🥙🧆🥚🍳🥘🍲🫕🥣🥗🍿🧈🧂🥫🍱🍘🍙🍚🍛🍜🍝🍠🍢🍣🍤🍥🥮🍡🥟🥠🥡🦀🦞🦐🦑🦪🍦🍧🍨🍩🍪🎂🍰🧁🥧🍫🍬🍭🍮🍯🍼🥛☕🫖🍵🍶🍾🍷🍸🍹🍺🍻🥂🥃🥤🧋🧃🧉🧊🥢🍽🍴🥄🔪🏺🌍🌎🌏🌐🗺🗾🧭🏔⛰🌋🗻🏕🏖🏜🏝🏞🏟🏛🏗🧱🪨🪵🛖🏘🏚🏠🏡🏢🏣🏤🏥🏦🏨🏩🏪🏫🏬🏭🏯🏰💒🗼🗽⛪🕌🛕🕍⛩🕋⛲⛺🌁🌃🏙🌄🌅🌆🌇🌉♨🎠🎡🎢💈🎪🚂🚃🚄🚅🚆🚇🚈🚉🚊🚝🚞🚋🚌🚍🚎🚐🚑🚒🚓🚔🚕🚖🚗🚘🚙🛻🚚🚛🚜🏎🏍🛵🦽🦼🛺🚲🛴🛹🛼🚏🛣🛤🛢⛽🚨🚥🚦🛑🚧⚓⛵🛶🚤🛳⛴🛥🚢✈🛩🛫🛬🪂💺🚁🚟🚠🚡🛰🚀🛸🛎🧳⌛⏳⌚⏰⏱⏲🕰🕛🕧🕐🕜🕑🕝🕒🕞🕓🕟🕔🕠🕕🕡🕖🕢🕗🕣🕘🕤🕙🕥🕚🕦🌑🌒🌓🌔🌕🌖🌗🌘🌙🌚🌛🌜🌡☀🌝🌞🪐⭐🌟🌠🌌☁⛅⛈🌤🌥🌦🌧🌨🌩🌪🌫🌬🌀🌈🌂☂☔⛱⚡❄☃⛄☄🔥💧🌊🎃🎄🎆🎇🧨✨🎈🎉🎊🎋🎍🎎🎏🎐🎑🧧🎀🎁🎗🎟🎫🎖🏆🏅🥇🥈🥉⚽⚾🥎🏀🏐🏈🏉🎾🥏🎳🏏🏑🏒🥍🏓🏸🥊🥋🥅⛳⛸🎣🤿🎽🎿🛷🥌🎯🪀🪁🎱🔮🪄🧿🎮🕹🎰🎲🧩🧸🪅🪆♠♥♦♣♟🃏🀄🎴🎭🖼🎨🧵🪡🧶🪢👓🕶🥽🥼🦺👔👕👖🧣🧤🧥🧦👗👘🥻🩱🩲🩳👙👚👛👜👝🛍🎒🩴👞👟🥾🥿👠👡🩰👢👑👒🎩🎓🧢🪖⛑📿💄💍💎🔇🔈🔉🔊📢📣📯🔔🔕🎼🎵🎶🎙🎚🎛🎤🎧📻🎷🪗🎸🎹🎺🎻🪕🥁";
-
-#[bench]
-fn str_char_count_lorem(b: &mut Bencher) {
-    b.iter(|| black_box(LOREM).chars().count());
-}
-
-#[bench]
-fn str_char_count_lorem_short(b: &mut Bencher) {
-    b.iter(|| black_box(LOREM_SHORT).chars().count());
-}
-
-#[bench]
-fn str_char_count_emoji(b: &mut Bencher) {
-    b.iter(|| black_box(EMOJI).chars().count());
-}
+mod char_count;
+mod corpora;
 
 #[bench]
 fn str_validate_emoji(b: &mut Bencher) {
-    b.iter(|| str::from_utf8(black_box(EMOJI.as_bytes())));
+    b.iter(|| str::from_utf8(black_box(corpora::emoji::LARGE.as_bytes())));
 }
diff --git a/library/core/benches/str/char_count.rs b/library/core/benches/str/char_count.rs
@@ -0,0 +1,107 @@
+use super::corpora::*;
+use test::{black_box, Bencher};
+
+macro_rules! define_benches { // instantiates every listed bench once per corpus constant
+    ($( fn $name: ident($arg: ident: &str) $body: block )+) => { // entry arm
+        define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+);
+        define_benches!(mod en_small, en::SMALL, $($name $arg $body)+);
+        define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+);
+        define_benches!(mod en_large, en::LARGE, $($name $arg $body)+);
+        define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+);
+
+        define_benches!(mod zh_tiny, zh::TINY, $($name $arg $body)+);
+        define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+);
+        define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+);
+        define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+);
+        define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+);
+
+        define_benches!(mod ru_tiny, ru::TINY, $($name $arg $body)+);
+        define_benches!(mod ru_small, ru::SMALL, $($name $arg $body)+);
+        define_benches!(mod ru_medium, ru::MEDIUM, $($name $arg $body)+);
+        define_benches!(mod ru_large, ru::LARGE, $($name $arg $body)+);
+        define_benches!(mod ru_huge, ru::HUGE, $($name $arg $body)+);
+
+        define_benches!(mod emoji_tiny, emoji::TINY, $($name $arg $body)+);
+        define_benches!(mod emoji_small, emoji::SMALL, $($name $arg $body)+);
+        define_benches!(mod emoji_medium, emoji::MEDIUM, $($name $arg $body)+);
+        define_benches!(mod emoji_large, emoji::LARGE, $($name $arg $body)+);
+        define_benches!(mod emoji_huge, emoji::HUGE, $($name $arg $body)+);
+    };
+    (mod $mod_name: ident, $input: expr, $($name: ident $arg: ident $body: block)+) => {
+        mod $mod_name { // worker arm: one module of `#[bench]` fns bound to `$input`
+            use super::*;
+            $(
+                #[bench]
+                fn $name(bencher: &mut Bencher) {
+                    let input = $input;
+                    bencher.bytes = input.len() as u64; // size of the input in bytes
+                    let mut input_s = input.to_string(); // owned copy, black_boxed below
+                    bencher.iter(|| {
+                        let $arg: &str = &black_box(&mut input_s);
+                        black_box($body)
+                    })
+                }
+            )+
+        }
+    };
+}
+
+define_benches! { // the four counting strategies under comparison
+    fn case00_libcore(s: &str) {
+        libcore(s)
+    }
+
+    fn case01_filter_count_cont_bytes(s: &str) {
+        filter_count_cont_bytes(s)
+    }
+
+    fn case02_iter_increment(s: &str) {
+        iterator_increment(s)
+    }
+
+    fn case03_manual_char_len(s: &str) {
+        manual_char_len(s)
+    }
+}
+
+fn libcore(s: &str) -> usize { // baseline: whatever `Chars::count` does in this build
+    s.chars().count()
+}
+
+#[inline]
+fn utf8_is_cont_byte(byte: u8) -> bool {
+    (byte as i8) < -64 // true exactly for 0x80..=0xBF, i.e. `0b10xx_xxxx`
+}
+
+fn filter_count_cont_bytes(s: &str) -> usize { // counts the bytes that are not continuation bytes
+    s.as_bytes().iter().filter(|&&byte| !utf8_is_cont_byte(byte)).count()
+}
+
+fn iterator_increment(s: &str) -> usize { // walks `chars()` and bumps a counter per item
+    let mut c = 0;
+    for _ in s.chars() {
+        c += 1;
+    }
+    c
+}
+
+fn manual_char_len(s: &str) -> usize { // hops from leading byte to leading byte
+    let s = s.as_bytes();
+    let mut c = 0;
+    let mut i = 0;
+    let l = s.len();
+    while i < l {
+        let b = s[i]; // a leading byte, given that `s` came from a valid `&str`
+        if b < 0x80 { // ASCII, one byte
+            i += 1;
+        } else if b < 0xe0 { // leader of a 2-byte sequence
+            i += 2;
+        } else if b < 0xf0 { // leader of a 3-byte sequence
+            i += 3;
+        } else { // leader of a 4-byte sequence
+            i += 4;
+        }
+        c += 1;
+    }
+    c
+}
diff --git a/library/core/benches/str/corpora.rs b/library/core/benches/str/corpora.rs
@@ -0,0 +1,88 @@
+//! Exposes a number of modules with different kinds of strings.
+//!
+//! Each module contains `&str` constants named `TINY`, `SMALL`, `MEDIUM`,
+//! `LARGE`, and `HUGE`.
+//!
+//! - The `TINY` string is generally around 8 bytes.
+//! - The `SMALL` string is generally around 30-40 bytes.
+//! - The `MEDIUM` string is generally around 600-700 bytes.
+//! - The `LARGE` string is the `MEDIUM` string repeated 8x, and is around 5kb.
+//! - The `HUGE` string is the `LARGE` string repeated 64x (or the `MEDIUM`
+//!   string repeated 512x), and is around 320kb.
+//!
+//! Except for `mod emoji` (which is just a bunch of emoji), the strings were
+//! pulled from (localizations of) rust-lang.org.
+
+macro_rules! repeat8 { // expands to `$s` concatenated with itself eight times
+    ($s:expr) => {
+        concat!($s, $s, $s, $s, $s, $s, $s, $s)
+    };
+}
+
+macro_rules! define_consts { // defines `MEDIUM`, `LARGE` and `HUGE` from one literal
+    ($s:literal) => {
+        pub const MEDIUM: &str = $s;
+        pub const LARGE: &str = repeat8!($s); // 8 copies of `$s`
+        pub const HUGE: &str = repeat8!(repeat8!(repeat8!($s))); // 8 * 8 * 8 = 512 copies
+    };
+}
+
+pub mod en { // TINY/SMALL are spelled out; `define_consts!` adds MEDIUM/LARGE/HUGE
+    pub const TINY: &str = "Mary had";
+    pub const SMALL: &str = "Mary had a little lamb, Little lamb";
+    define_consts! {
+        "Rust is blazingly fast and memory-efficient: with no runtime or garbage
+         collector, it can power performance-critical services, run on embedded
+         devices, and easily integrate with other languages.  Rust’s rich type system
+         and ownership model guarantee memory-safety and thread-safety — enabling you
+         to eliminate many classes of bugs at compile-time.  Rust has great
+         documentation, a friendly compiler with useful error messages, and top-notch
+         tooling — an integrated package manager and build tool, smart multi-editor
+         support with auto-completion and type inspections, an auto-formatter, and
+         more."
+    }
+}
+
+pub mod zh { // TINY/SMALL are spelled out; `define_consts!` adds MEDIUM/LARGE/HUGE
+    pub const TINY: &str = "速度惊";
+    pub const SMALL: &str = "速度惊人且内存利用率极高";
+    define_consts! {
+        "Rust   速度惊人且内存利用率极高。由于\
+         没有运行时和垃圾回收，它能够胜任对性能要\
+         求特别高的服务，可以在嵌入式设备上运行，\
+         还能轻松和其他语言集成。Rust 丰富的类型\
+         系统和所有权模型保证了内存安全和线程安全，\
+         让您在编译期就能够消除各种各样的错误。\
+         Rust 拥有出色的文档、友好的编译器和清晰\
+         的错误提示信息， 还集成了一流的工具——\
+         包管理器和构建工具， 智能地自动补全和类\
+         型检验的多编辑器支持， 以及自动格式化代\
+         码等等。"
+    }
+}
+
+pub mod ru { // TINY/SMALL are spelled out; `define_consts!` adds MEDIUM/LARGE/HUGE
+    pub const TINY: &str = "Сотни";
+    pub const SMALL: &str = "Сотни компаний по";
+    define_consts! {
+        "Сотни компаний по всему миру используют Rust в реальных\
+         проектах для быстрых кросс-платформенных решений с\
+         ограниченными ресурсами. Такие проекты, как Firefox,\
+         Dropbox и Cloudflare, используют Rust. Rust отлично\
+         подходит как для стартапов, так и для больших компаний,\
+         как для встраиваемых устройств, так и для масштабируемых\
+         web-сервисов. Мой самый большой комплимент Rust."
+    }
+}
+
+pub mod emoji { // TINY/SMALL are spelled out; `define_consts!` adds MEDIUM/LARGE/HUGE
+    pub const TINY: &str = "😀😃";
+    pub const SMALL: &str = "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘";
+    define_consts! {
+        "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘😗☺😚😙🥲😋😛😜🤪😝🤑🤗🤭🤫🤔🤐🤨😐😑😶😶‍🌫️😏😒\
+         🙄😬😮‍💨🤥😌😔😪🤤😴😷🤒🤕🤢🤮🤧🥵🥶🥴😵😵‍💫🤯🤠🥳🥸😎🤓🧐😕😟🙁☹😮😯😲😳🥺😦😧😨\
+         😰😥😢😭😱😖😣😞😓😩😫🥱😤😡😠🤬😈👿💀☠💩🤡👹👺👻👽👾🤖😺😸😹😻😼😽🙀😿😾🙈🙉🙊\
+         💋💌💘💝💖💗💓💞💕💟❣💔❤️‍🔥❤️‍🩹❤🧡💛💚💙💜🤎🖤🤍💯💢💥💫💦💨🕳💬👁️‍🗨️🗨🗯💭💤👋\
+         🤚🖐✋🖖👌🤌🤏✌"
+    }
+}
diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs
@@ -0,0 +1,136 @@
+//! Code for efficiently counting the number of `char`s in a UTF-8 encoded
+//! string.
+//!
+//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`,
+//! followed by some number (possibly 0) of continuation bytes.
+//!
+//! The leading byte can have a number of bit-patterns (with the specific
+//! pattern indicating how many continuation bytes follow), but the continuation
+//! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any
+//! value). That is, the most significant bit is set, and the second most
+//! significant bit is unset.
+//!
+//! To count the number of characters, we can just count the number of bytes in
+//! the string which are not continuation bytes, which can be done many bytes at
+//! a time fairly easily.
+//!
+//! Note: Because the term "leading byte" can sometimes be ambiguous (for
+//! example, it could also refer to the first byte of a slice), we'll often use
+//! the term "non-continuation byte" to refer to these bytes in the code.
+use core::intrinsics::unlikely;
+
+const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+const UNROLL_INNER: usize = 4;
+
+#[inline]
+pub(super) fn count_chars(s: &str) -> usize {
+    if s.len() < USIZE_SIZE * UNROLL_INNER { // fewer bytes than `UNROLL_INNER` whole words
+        // Avoid entering the optimized implementation for strings where the
+        // difference is not likely to matter, or where it might even be slower.
+        // That said, a ton of thought was not spent on the particular threshold
+        // here, beyond "this value seems to make sense".
+        char_count_general_case(s.as_bytes())
+    } else {
+        do_count_chars(s)
+    }
+}
+
+fn do_count_chars(s: &str) -> usize { // word-at-a-time path, entered via `count_chars`
+    // For correctness, `CHUNK_SIZE` must be:
+    //
+    // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`.
+    // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the
+    //   `body.chunks(CHUNK_SIZE)` loop is incorrect.
+    //
+    // For performance, `CHUNK_SIZE` should be:
+    // - Relatively cheap to `/` against (so some simple sum of powers of two).
+    // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize`
+    //   too often.
+    const CHUNK_SIZE: usize = 192;
+
+    // Check the properties of `CHUNK_SIZE` and `UNROLL_INNER` that are required
+    // for correctness.
+    const _: () = assert!(CHUNK_SIZE < 256);
+    const _: () = assert!(CHUNK_SIZE % UNROLL_INNER == 0);
+
+    // SAFETY: every bit pattern is a valid `usize`, and `align_to` handles both
+    // alignment and the leftover bytes that do not fill a whole word.
+    let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() };
+
+    // This should be quite rare, and basically exists to handle the degenerate
+    // cases where align_to fails (as well as miri under symbolic alignment
+    // mode).
+    //
+    // The `unlikely` helps discourage LLVM from inlining the body, which is
+    // nice, as we would rather not mark the `char_count_general_case` function
+    // as cold.
+    if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) {
+        return char_count_general_case(s.as_bytes());
+    }
+
+    let mut total = char_count_general_case(head) + char_count_general_case(tail);
+    // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which
+    // we call `sum_bytes_in_usize`.
+    for chunk in body.chunks(CHUNK_SIZE) {
+        // We accumulate intermediate sums in `counts`, where each byte contains
+        // a subset of the sum of this chunk, like a `[u8; size_of::<usize>()]`.
+        let mut counts = 0;
+
+        let (unrolled_chunks, remainder) = chunk.as_chunks::<UNROLL_INNER>();
+        for unrolled in unrolled_chunks {
+            for &word in unrolled {
+                // Because `CHUNK_SIZE` is < 256, this addition can't cause the
+                // count in any of the bytes to overflow into a subsequent byte.
+                counts += contains_non_continuation_byte(word);
+            }
+        }
+
+        // Sum the values in `counts` (which, again, is conceptually a `[u8;
+        // size_of::<usize>()]`), and accumulate the result into `total`.
+        total += sum_bytes_in_usize(counts);
+
+        // If there's any data in `remainder`, then handle it. This will only
+        // happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE`
+        // is divisible by `UNROLL_INNER`), so we explicitly break at the end
+        // (which seems to help LLVM out).
+        if !remainder.is_empty() {
+            // Accumulate all the data in the remainder.
+            let mut counts = 0;
+            for &word in remainder {
+                counts += contains_non_continuation_byte(word);
+            }
+            total += sum_bytes_in_usize(counts);
+            break;
+        }
+    }
+    total
+}
+
+// Checks each byte of `w` to see if it contains the first byte in a UTF-8
+// sequence. Bytes in `w` which are continuation bytes are left as `0x00` (i.e.
+// false), and bytes which are non-continuation bytes are left as `0x01` (i.e.
+// true)
+#[inline]
+fn contains_non_continuation_byte(w: usize) -> usize {
+    const LSB: usize = 0x0101_0101_0101_0101u64 as usize; // bit 0 of every byte
+    ((!w >> 7) | (w >> 6)) & LSB // per byte: `!bit7 | bit6`, i.e. not `0b10xx_xxxx`
+}
+
+// Morally equivalent to `values.to_ne_bytes().into_iter().sum::<usize>()`, but
+// more efficient.
+#[inline]
+fn sum_bytes_in_usize(values: usize) -> usize {
+    const LSB_SHORTS: usize = 0x0001_0001_0001_0001_u64 as usize; // 1 in every 16-bit lane
+    const SKIP_BYTES: usize = 0x00ff_00ff_00ff_00ff_u64 as usize; // low byte of every lane
+
+    let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES);
+    pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8) // top lane = sum of all lanes
+}
+
+// This is the most direct implementation of the concept of "count the number of
+// bytes in the string which are not continuation bytes". It is used for short
+// strings, for the head and tail returned by `slice::align_to`, and as the
+// fallback when `align_to` yields no usable body.
+fn char_count_general_case(s: &[u8]) -> usize {
+    s.iter().filter(|&&byte| !super::validations::utf8_is_cont_byte(byte)).count()
+}
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
@@ -12,7 +12,7 @@ use crate::slice::{self, Split as SliceSplit};
 use super::from_utf8_unchecked;
 use super::pattern::Pattern;
 use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
-use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
+use super::validations::{next_code_point, next_code_point_reverse};
 use super::LinesAnyMap;
 use super::{BytesIsNotEmpty, UnsafeBytesToStr};
 use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode};
@@ -46,8 +46,7 @@ impl<'a> Iterator for Chars<'a> {
 
     #[inline]
     fn count(self) -> usize {
-        // length in `char` is equal to the number of non-continuation bytes
-        self.iter.filter(|&&byte| !utf8_is_cont_byte(byte)).count()
+        super::count::count_chars(self.as_str())
     }
 
     #[inline]

diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -7,6 +7,7 @@
 #![stable(feature = "rust1", since = "1.0.0")]
 
 mod converts;
+mod count;
 mod error;
 mod iter;
 mod traits;