From 380fa264132ad481e73cbbf0f3a0feefd99a1d78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AE=B8=E6=9D=B0=E5=8F=8B=20Jieyou=20Xu=20=28Joe=29?=
 <jieyouxu@outlook.com>
Date: Tue, 14 Feb 2023 17:31:58 +0800
Subject: [PATCH 1/3] Don't recover lifetimes/labels containing emojis as
 character literals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Note that at the time of this commit, `unic-emoji-char` seems to have
data tables only up to Unicode 5.0, but Unicode is already newer than
this.

A newer emoji such as `🥺` will not be recognized as an emoji
but older emojis such as `🐱` will.
---
 compiler/rustc_errors/src/lib.rs              |  2 +
 compiler/rustc_lexer/src/lib.rs               | 43 +++++++---
 compiler/rustc_parse/src/lexer/mod.rs         |  9 +-
 .../lexer/issue-108019-bad-emoji-recovery.rs  | 45 ++++++++++
 .../issue-108019-bad-emoji-recovery.stderr    | 86 +++++++++++++++++++
 5 files changed, 173 insertions(+), 12 deletions(-)
 create mode 100644 tests/ui/lexer/issue-108019-bad-emoji-recovery.rs
 create mode 100644 tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr

diff --git a/compiler/rustc_errors/src/lib.rs b/compiler/rustc_errors/src/lib.rs
index 83b733d4c0677..b310b191d5273 100644
--- a/compiler/rustc_errors/src/lib.rs
+++ b/compiler/rustc_errors/src/lib.rs
@@ -471,6 +471,8 @@ pub enum StashKey {
     /// When an invalid lifetime e.g. `'2` should be reinterpreted
     /// as a char literal in the parser
     LifetimeIsChar,
+    /// When an invalid lifetime e.g. `'🐱` contains emoji.
+    LifetimeContainsEmoji,
     /// Maybe there was a typo where a comma was forgotten before
     /// FRU syntax
     MaybeFruTypo,
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index 6e815863d06ff..e6f04fe0aaa63 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -95,7 +95,7 @@ pub enum TokenKind {
     Literal { kind: LiteralKind, suffix_start: u32 },
 
     /// "'a"
-    Lifetime { starts_with_number: bool },
+    Lifetime { starts_with_number: bool, contains_emoji: bool },
 
     // One-char tokens:
     /// ";"
@@ -630,7 +630,13 @@ impl Cursor<'_> {
             // If the first symbol is valid for identifier, it can be a lifetime.
             // Also check if it's a number for a better error reporting (so '0 will
             // be reported as invalid lifetime and not as unterminated char literal).
-            is_id_start(self.first()) || self.first().is_digit(10)
+            // We also have to account for potential `'🐱` emojis to avoid reporting
+            // it as an unterminated char literal.
+            is_id_start(self.first())
+                || self.first().is_digit(10)
+                // FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Unicode
+                // 5.0, but Unicode is already newer than this.
+                || unic_emoji_char::is_emoji(self.first())
         };
 
         if !can_be_a_lifetime {
@@ -643,16 +649,33 @@ impl Cursor<'_> {
             return Literal { kind, suffix_start };
         }
 
-        // Either a lifetime or a character literal with
-        // length greater than 1.
+        // Either a lifetime or a character literal.
 
         let starts_with_number = self.first().is_digit(10);
+        let mut contains_emoji = false;
 
-        // Skip the literal contents.
-        // First symbol can be a number (which isn't a valid identifier start),
-        // so skip it without any checks.
-        self.bump();
-        self.eat_while(is_id_continue);
+        // FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Unicode
+        // 5.0, but Unicode is already newer than this.
+        if unic_emoji_char::is_emoji(self.first()) {
+            contains_emoji = true;
+        } else {
+            // Skip the literal contents.
+            // First symbol can be a number (which isn't a valid identifier start),
+            // so skip it without any checks.
+            self.bump();
+        }
+        self.eat_while(|c| {
+            if is_id_continue(c) {
+                true
+            // FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Unicode
+            // 5.0, but Unicode is already newer than this.
+            } else if unic_emoji_char::is_emoji(c) {
+                contains_emoji = true;
+                true
+            } else {
+                false
+            }
+        });
 
         // Check if after skipping literal contents we've met a closing
         // single quote (which means that user attempted to create a
@@ -662,7 +685,7 @@ impl Cursor<'_> {
             let kind = Char { terminated: true };
             Literal { kind, suffix_start: self.pos_within_token() }
         } else {
-            Lifetime { starts_with_number }
+            Lifetime { starts_with_number, contains_emoji }
         }
     }
 
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index bd998ed91d977..37449aaabed8b 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -200,16 +200,21 @@ impl<'a> StringReader<'a> {
                     };
                     token::Literal(token::Lit { kind, symbol, suffix })
                 }
-                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
+                rustc_lexer::TokenKind::Lifetime { starts_with_number, contains_emoji } => {
                     // Include the leading `'` in the real identifier, for macro
                     // expansion purposes. See #12512 for the gory details of why
                     // this is necessary.
                     let lifetime_name = self.str_from(start);
                     if starts_with_number {
                         let span = self.mk_sp(start, self.pos);
-                        let mut diag = self.sess.struct_err("lifetimes cannot start with a number");
+                        let mut diag = self.sess.struct_err("lifetimes or labels cannot start with a number");
                         diag.set_span(span);
                         diag.stash(span, StashKey::LifetimeIsChar);
+                    } else if contains_emoji {
+                        let span = self.mk_sp(start, self.pos);
+                        let mut diag = self.sess.struct_err("lifetimes or labels cannot contain emojis");
+                        diag.set_span(span);
+                        diag.stash(span, StashKey::LifetimeContainsEmoji);
                     }
                     let ident = Symbol::intern(lifetime_name);
                     token::Lifetime(ident)
diff --git a/tests/ui/lexer/issue-108019-bad-emoji-recovery.rs b/tests/ui/lexer/issue-108019-bad-emoji-recovery.rs
new file mode 100644
index 0000000000000..f0f8622456010
--- /dev/null
+++ b/tests/ui/lexer/issue-108019-bad-emoji-recovery.rs
@@ -0,0 +1,45 @@
+#![allow(unused_labels)]
+
+// FIXME(#108019): outdated Unicode table
+// fn foo() {
+//     '🥺 loop {
+//         break
+//     }
+// }
+
+fn bar() {
+    '🐱 loop {
+    //~^ ERROR labeled expression must be followed by `:`
+    //~| ERROR lifetimes or labels cannot contain emojis
+        break
+    }
+}
+
+fn qux() {
+    'a🐱 loop {
+    //~^ ERROR labeled expression must be followed by `:`
+    //~| ERROR lifetimes or labels cannot contain emojis
+        break
+    }
+}
+
+fn quux() {
+    '1🐱 loop {
+    //~^ ERROR labeled expression must be followed by `:`
+    //~| ERROR lifetimes or labels cannot start with a number
+        break
+    }
+}
+
+fn x<'🐱>() -> &'🐱 () {
+    //~^ ERROR lifetimes or labels cannot contain emojis
+    //~| ERROR lifetimes or labels cannot contain emojis
+    &()
+}
+
+fn y() {
+    'a🐱: loop {}
+    //~^ ERROR lifetimes or labels cannot contain emojis
+}
+
+fn main() {}
diff --git a/tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr b/tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr
new file mode 100644
index 0000000000000..be77ffdea349f
--- /dev/null
+++ b/tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr
@@ -0,0 +1,86 @@
+error: labeled expression must be followed by `:`
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:11:5
+   |
+LL |       '🐱 loop {
+   |       ^--- help: add `:` after the label
+   |       |
+   |  _____the label
+   | |
+LL | |
+LL | |
+LL | |         break
+LL | |     }
+   | |_____^
+   |
+   = note: labels are used before loops and blocks, allowing e.g., `break 'label` to them
+
+error: labeled expression must be followed by `:`
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:19:5
+   |
+LL |       'a🐱 loop {
+   |       ^---- help: add `:` after the label
+   |       |
+   |  _____the label
+   | |
+LL | |
+LL | |
+LL | |         break
+LL | |     }
+   | |_____^
+   |
+   = note: labels are used before loops and blocks, allowing e.g., `break 'label` to them
+
+error: labeled expression must be followed by `:`
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:27:5
+   |
+LL |       '1🐱 loop {
+   |       ^---- help: add `:` after the label
+   |       |
+   |  _____the label
+   | |
+LL | |
+LL | |
+LL | |         break
+LL | |     }
+   | |_____^
+   |
+   = note: labels are used before loops and blocks, allowing e.g., `break 'label` to them
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:11:5
+   |
+LL |     '🐱 loop {
+   |     ^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:19:5
+   |
+LL |     'a🐱 loop {
+   |     ^^^^
+
+error: lifetimes or labels cannot start with a number
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:27:5
+   |
+LL |     '1🐱 loop {
+   |     ^^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:34:6
+   |
+LL | fn x<'🐱>() -> &'🐱 () {
+   |      ^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:34:16
+   |
+LL | fn x<'🐱>() -> &'🐱 () {
+   |                 ^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:41:5
+   |
+LL |     'a🐱: loop {}
+   |     ^^^^
+
+error: aborting due to 9 previous errors
+

From 98b82aedba3f3f581e89df54352914b27f42c6f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AE=B8=E6=9D=B0=E5=8F=8B=20Jieyou=20Xu=20=28Joe=29?=
 <jieyouxu@outlook.com>
Date: Tue, 14 Feb 2023 18:17:59 +0800
Subject: [PATCH 2/3] Update numeric lifetime test

---
 tests/ui/parser/numeric-lifetime.rs     | 4 ++--
 tests/ui/parser/numeric-lifetime.stderr | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/ui/parser/numeric-lifetime.rs b/tests/ui/parser/numeric-lifetime.rs
index 2d82354c62cca..a082a8a44df2a 100644
--- a/tests/ui/parser/numeric-lifetime.rs
+++ b/tests/ui/parser/numeric-lifetime.rs
@@ -1,6 +1,6 @@
 struct S<'1> { s: &'1 usize }
-//~^ ERROR lifetimes cannot start with a number
-//~| ERROR lifetimes cannot start with a number
+//~^ ERROR lifetimes or labels cannot start with a number
+//~| ERROR lifetimes or labels cannot start with a number
 fn main() {
     // verify that the parse error doesn't stop type checking
     let x: usize = "";
diff --git a/tests/ui/parser/numeric-lifetime.stderr b/tests/ui/parser/numeric-lifetime.stderr
index 7c1bcb7263171..66e35dca92319 100644
--- a/tests/ui/parser/numeric-lifetime.stderr
+++ b/tests/ui/parser/numeric-lifetime.stderr
@@ -6,13 +6,13 @@ LL |     let x: usize = "";
    |            |
    |            expected due to this
 
-error: lifetimes cannot start with a number
+error: lifetimes or labels cannot start with a number
   --> $DIR/numeric-lifetime.rs:1:10
    |
 LL | struct S<'1> { s: &'1 usize }
    |          ^^
 
-error: lifetimes cannot start with a number
+error: lifetimes or labels cannot start with a number
   --> $DIR/numeric-lifetime.rs:1:20
    |
 LL | struct S<'1> { s: &'1 usize }

From e3f9db5fc319c6d8eee5d47d216ea6a426070c41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=AE=B8=E6=9D=B0=E5=8F=8B=20Jieyou=20Xu=20=28Joe=29?=
 <jieyouxu@outlook.com>
Date: Tue, 14 Feb 2023 23:24:55 +0800
Subject: [PATCH 3/3] Update lexer lifetime test

---
 compiler/rustc_lexer/src/tests.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs
index e4c1787f2ccef..670d64fb983f5 100644
--- a/compiler/rustc_lexer/src/tests.rs
+++ b/compiler/rustc_lexer/src/tests.rs
@@ -235,7 +235,7 @@ fn lifetime() {
     check_lexing(
         "'abc",
         expect![[r#"
-            Token { kind: Lifetime { starts_with_number: false }, len: 4 }
+            Token { kind: Lifetime { starts_with_number: false, contains_emoji: false }, len: 4 }
         "#]],
     );
 }