Skip to content

Improve error messages for raw strings (#60762) #70522

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 1, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 124 additions & 22 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
@@ -17,9 +17,13 @@
mod cursor;
pub mod unescape;

#[cfg(test)]
mod tests;

use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::{Cursor, EOF_CHAR};
use std::convert::TryInto;

/// Parsed token.
/// It doesn't contain information about data that has been parsed,
@@ -132,9 +136,80 @@ pub enum LiteralKind {
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
RawStr { n_hashes: usize, started: bool, terminated: bool },
RawStr(UnvalidatedRawStr),
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr { n_hashes: usize, started: bool, terminated: bool },
RawByteStr(UnvalidatedRawStr),
}

/// Represents something that looks like a raw string, but may have some
/// problems. Use `.validate()` to convert it into something
/// usable.
///
/// This is the raw result of `Cursor::raw_double_quoted_string`; no error has
/// been reported yet when a value of this type is created.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
/// The prefix (`r###"`) is valid, i.e. the run of leading `#` is directly
/// followed by a `"`.
valid_start: bool,
/// The number of leading `#`
n_start_hashes: usize,
/// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`, because the
/// lexer stops eating `#` once it has seen as many as opened the string.
/// For an unterminated string this is the largest count seen after any `"`;
/// it is `0` when the start is invalid.
n_end_hashes: usize,
/// The offset starting at `r` or `br` where the user may have intended to end the string.
/// Currently, it is the longest sequence of pattern `"#+"`: the offset points at the
/// first `#` following the `"`, and the earliest candidate wins on ties.
/// `None` if the string is properly terminated or no `"` was followed by a `#`.
possible_terminator_offset: Option<usize>,
}

/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (>65535): `LexRawStrError::TooManyDelimiters`
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
/// Non `#` characters exist between `r` and `"` eg. `r#~"..`
InvalidStarter,
/// The string was never terminated. `expected` is the number of `#` that opened the
/// string and `found` is the largest number of `#` seen after a `"`.
/// `possible_terminator_offset` is the number of characters after `r` or `br` where they
/// may have intended to terminate it.
NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
/// More than 65535 `#`s exist, i.e. the count does not fit in a `u16`.
TooManyDelimiters,
}

/// Raw String that contains a valid prefix (`#*"`) and postfix (`"#*`) where
/// there are a matching number of `#` characters in both (possibly zero). Note
/// that this will not consume extra trailing `#` characters: `r###"abcde"####`
/// is lexed as a `ValidatedRawStr { n_hashes: 3 }` followed by a `#` token.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct ValidatedRawStr {
// Number of `#` on each side of the string; bounded by `u16::MAX`.
n_hashes: u16,
}

impl ValidatedRawStr {
/// Returns the number of `#` characters delimiting the raw string on each side.
pub fn num_hashes(&self) -> u16 {
self.n_hashes
}
}

impl UnvalidatedRawStr {
pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
if !self.valid_start {
return Err(LexRawStrError::InvalidStarter);
}

// Only up to 65535 `#`s are allowed in raw strings
let n_start_safe: u16 =
self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;

if self.n_start_hashes > self.n_end_hashes {
Err(LexRawStrError::NoTerminator {
expected: self.n_start_hashes,
found: self.n_end_hashes,
possible_terminator_offset: self.possible_terminator_offset,
})
} else {
// Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
// they must be equal.
debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
Ok(ValidatedRawStr { n_hashes: n_start_safe })
}
}
}

/// Base of numeric literal encoding according to its prefix.
@@ -209,7 +284,7 @@ pub fn is_whitespace(c: char) -> bool {
// Dedicated whitespace characters from Unicode
| '\u{2028}' // LINE SEPARATOR
| '\u{2029}' // PARAGRAPH SEPARATOR
=> true,
=> true,
_ => false,
}
}
@@ -258,12 +333,12 @@ impl Cursor<'_> {
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
if terminated {
if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
self.eat_literal_suffix();
}
let kind = RawStr { n_hashes, started, terminated };
let kind = RawStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
@@ -293,12 +368,14 @@ impl Cursor<'_> {
}
('r', '"') | ('r', '#') => {
self.bump();
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
if terminated {
self.eat_literal_suffix();
}
let kind = RawByteStr { n_hashes, started, terminated };

let kind = RawByteStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
@@ -594,37 +671,49 @@ impl Cursor<'_> {
false
}

/// Eats the double-quoted string and returns a tuple of
/// (amount of the '#' symbols, raw string started, raw string terminated)
fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
/// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
debug_assert!(self.prev() == 'r');
let mut started: bool = false;
let mut finished: bool = false;
let mut valid_start: bool = false;
let start_pos = self.len_consumed();
let (mut possible_terminator_offset, mut max_hashes) = (None, 0);

// Count opening '#' symbols.
let n_hashes = self.eat_while(|c| c == '#');
let n_start_hashes = self.eat_while(|c| c == '#');

// Check that string is started.
match self.bump() {
Some('"') => started = true,
_ => return (n_hashes, started, finished),
Some('"') => valid_start = true,
_ => {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: 0,
possible_terminator_offset,
};
}
}

// Skip the string contents and on each '"' character met, check if this is
// a raw string termination.
while !finished {
loop {
self.eat_while(|c| c != '"');

if self.is_eof() {
return (n_hashes, started, finished);
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: max_hashes,
possible_terminator_offset,
};
}

// Eat closing double quote.
self.bump();

// Check that amount of closing '#' symbols
// is equal to the amount of opening ones.
let mut hashes_left = n_hashes;
let mut hashes_left = n_start_hashes;
let is_closing_hash = |c| {
if c == '#' && hashes_left != 0 {
hashes_left -= 1;
@@ -633,10 +722,23 @@ impl Cursor<'_> {
false
}
};
finished = self.eat_while(is_closing_hash) == n_hashes;
let n_end_hashes = self.eat_while(is_closing_hash);

if n_end_hashes == n_start_hashes {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes,
possible_terminator_offset: None,
};
} else if n_end_hashes > max_hashes {
// Keep track of possible terminators to give a hint about where there might be
// a missing terminator
possible_terminator_offset =
Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
max_hashes = n_end_hashes;
}
}

(n_hashes, started, finished)
}

fn eat_decimal_digits(&mut self) -> bool {
121 changes: 121 additions & 0 deletions src/librustc_lexer/src/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#[cfg(test)]
mod tests {
    use crate::*;

    /// Lexes `r` followed by `s` as a raw string, then checks both the raw
    /// lexer output and the outcome of validating it.
    fn check_raw_str(
        s: &str,
        expected: UnvalidatedRawStr,
        validated: Result<ValidatedRawStr, LexRawStrError>,
    ) {
        let source = format!("r{}", s);
        let mut cursor = Cursor::new(&source);
        // Step over the leading `r` so the cursor sits where the lexer would
        // be when it hands off to the raw string routine.
        cursor.bump();
        let lexed = cursor.raw_double_quoted_string(0);
        assert_eq!(lexed, expected);
        assert_eq!(lexed.validate(), validated);
    }

    #[test]
    fn test_naked_raw_str() {
        let lexed = UnvalidatedRawStr {
            valid_start: true,
            n_start_hashes: 0,
            n_end_hashes: 0,
            possible_terminator_offset: None,
        };
        check_raw_str(r#""abc""#, lexed, Ok(ValidatedRawStr { n_hashes: 0 }));
    }

    #[test]
    fn test_raw_no_start() {
        // The trailing `#` is not part of the literal: it is left for the
        // next token.
        let lexed = UnvalidatedRawStr {
            valid_start: true,
            n_start_hashes: 0,
            n_end_hashes: 0,
            possible_terminator_offset: None,
        };
        check_raw_str(r##""abc"#"##, lexed, Ok(ValidatedRawStr { n_hashes: 0 }));
    }

    #[test]
    fn test_too_many_terminators() {
        // The surplus `#` is diagnosed later, by the parser.
        let lexed = UnvalidatedRawStr {
            valid_start: true,
            n_start_hashes: 1,
            n_end_hashes: 1,
            possible_terminator_offset: None,
        };
        check_raw_str(r###"#"abc"##"###, lexed, Ok(ValidatedRawStr { n_hashes: 1 }));
    }

    #[test]
    fn test_unterminated() {
        // No closing quote at all.
        let no_quote = UnvalidatedRawStr {
            valid_start: true,
            n_start_hashes: 1,
            n_end_hashes: 0,
            possible_terminator_offset: None,
        };
        let no_quote_err = LexRawStrError::NoTerminator {
            expected: 1,
            found: 0,
            possible_terminator_offset: None,
        };
        check_raw_str(r#"#"abc"#, no_quote, Err(no_quote_err));

        // A closing quote followed by too few `#`.
        let short_close = UnvalidatedRawStr {
            valid_start: true,
            n_start_hashes: 2,
            n_end_hashes: 1,
            possible_terminator_offset: Some(7),
        };
        let short_close_err = LexRawStrError::NoTerminator {
            expected: 2,
            found: 1,
            possible_terminator_offset: Some(7),
        };
        check_raw_str(r###"##"abc"#"###, short_close, Err(short_close_err));

        // Only `"#` counts as a candidate terminator, not a lone `#`.
        let lone_hash = UnvalidatedRawStr {
            valid_start: true,
            n_start_hashes: 2,
            n_end_hashes: 0,
            possible_terminator_offset: None,
        };
        let lone_hash_err = LexRawStrError::NoTerminator {
            expected: 2,
            found: 0,
            possible_terminator_offset: None,
        };
        check_raw_str(r###"##"abc#"###, lone_hash, Err(lone_hash_err))
    }

    #[test]
    fn test_invalid_start() {
        let lexed = UnvalidatedRawStr {
            valid_start: false,
            n_start_hashes: 1,
            n_end_hashes: 0,
            possible_terminator_offset: None,
        };
        check_raw_str(r##"#~"abc"#"##, lexed, Err(LexRawStrError::InvalidStarter));
    }
}
93 changes: 59 additions & 34 deletions src/librustc_parse/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
use rustc_ast::token::{self, Token, TokenKind};
use rustc_ast::util::comments;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{error_code, DiagnosticBuilder, FatalError};
use rustc_lexer::unescape;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
use rustc_lexer::Base;
use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr};
use rustc_session::parse::ParseSess;
use rustc_span::symbol::{sym, Symbol};
use rustc_span::{BytePos, Pos, Span};

use log::debug;
use std::char;
use std::convert::TryInto;

mod tokentrees;
mod unescape_error_reporting;
mod unicode_chars;

use unescape_error_reporting::{emit_unescape_error, push_escaped_char};

#[derive(Clone, Debug)]
@@ -376,30 +376,22 @@ impl<'a> StringReader<'a> {
let id = self.symbol_from_to(content_start, content_end);
(token::ByteStr, id)
}
rustc_lexer::LiteralKind::RawStr { n_hashes, started, terminated } => {
if !started {
self.report_non_started_raw_string(start);
}
if !terminated {
self.report_unterminated_raw_string(start, n_hashes)
}
let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => {
let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
let n_hashes = valid_raw_str.num_hashes();
let n = u32::from(n_hashes);

let content_start = start + BytePos(2 + n);
let content_end = suffix_start - BytePos(1 + n);
self.validate_raw_str_escape(content_start, content_end);
let id = self.symbol_from_to(content_start, content_end);
(token::StrRaw(n_hashes), id)
}
rustc_lexer::LiteralKind::RawByteStr { n_hashes, started, terminated } => {
if !started {
self.report_non_started_raw_string(start);
}
if !terminated {
self.report_unterminated_raw_string(start, n_hashes)
}
let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => {
let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
let n_hashes = validated_raw_str.num_hashes();
let n = u32::from(n_hashes);

let content_start = start + BytePos(3 + n);
let content_end = suffix_start - BytePos(1 + n);
self.validate_raw_byte_str_escape(content_start, content_end);
@@ -485,6 +477,26 @@ impl<'a> StringReader<'a> {
}
}

/// Validates a lexed raw string and returns the validated form.
///
/// Any validation error is handed to the matching `report_*` method; each of
/// those diverges, so this function only returns for a well-formed raw string.
/// `start` is the position of the beginning of the literal and is used to
/// locate the diagnostic.
fn validate_and_report_errors(
    &self,
    start: BytePos,
    unvalidated_raw_str: UnvalidatedRawStr,
) -> ValidatedRawStr {
    let error = match unvalidated_raw_str.validate() {
        Ok(valid) => return valid,
        Err(error) => error,
    };

    // Every arm below diverges, so this `match` never produces a value.
    match error {
        LexRawStrError::InvalidStarter => self.report_non_started_raw_string(start),
        LexRawStrError::TooManyDelimiters => self.report_too_many_hashes(start),
        LexRawStrError::NoTerminator { expected, found, possible_terminator_offset } => self
            .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
    }
}

fn report_non_started_raw_string(&self, start: BytePos) -> ! {
let bad_char = self.str_from(start).chars().last().unwrap();
self.struct_fatal_span_char(
@@ -498,38 +510,51 @@ impl<'a> StringReader<'a> {
FatalError.raise()
}

fn report_unterminated_raw_string(&self, start: BytePos, n_hashes: usize) -> ! {
fn report_unterminated_raw_string(
&self,
start: BytePos,
n_hashes: usize,
possible_offset: Option<usize>,
found_terminators: usize,
) -> ! {
let mut err = self.sess.span_diagnostic.struct_span_fatal_with_code(
self.mk_sp(start, start),
"unterminated raw string",
error_code!(E0748),
);

err.span_label(self.mk_sp(start, start), "unterminated raw string");

if n_hashes > 0 {
err.note(&format!(
"this raw string should be terminated with `\"{}`",
"#".repeat(n_hashes as usize)
"#".repeat(n_hashes)
));
}

if let Some(possible_offset) = possible_offset {
let lo = start + BytePos(possible_offset as u32);
let hi = lo + BytePos(found_terminators as u32);
let span = self.mk_sp(lo, hi);
err.span_suggestion(
span,
"consider terminating the string here",
"#".repeat(n_hashes),
Applicability::MaybeIncorrect,
);
}

err.emit();
FatalError.raise()
}

fn restrict_n_hashes(&self, start: BytePos, n_hashes: usize) -> u16 {
match n_hashes.try_into() {
Ok(n_hashes) => n_hashes,
Err(_) => {
self.fatal_span_(
start,
self.pos,
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols",
)
.raise();
}
}
fn report_too_many_hashes(&self, start: BytePos) -> ! {
self.fatal_span_(
start,
self.pos,
"too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
)
.raise();
}

fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) {
1 change: 1 addition & 0 deletions src/librustc_parse/lib.rs
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
#![feature(crate_visibility_modifier)]
#![feature(bindings_after_at)]
#![feature(try_blocks)]
#![feature(or_patterns)]

use rustc_ast::ast;
use rustc_ast::token::{self, Nonterminal};
33 changes: 30 additions & 3 deletions src/librustc_parse/parser/diagnostics.rs
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ use rustc_ast::ast::{
};
use rustc_ast::ast::{AttrVec, ItemKind, Mutability, Pat, PatKind, PathSegment, QSelf, Ty, TyKind};
use rustc_ast::ptr::P;
use rustc_ast::token::{self, TokenKind};
use rustc_ast::token::{self, Lit, LitKind, TokenKind};
use rustc_ast::util::parser::AssocOp;
use rustc_ast_pretty::pprust;
use rustc_data_structures::fx::FxHashSet;
@@ -255,6 +255,10 @@ impl<'a> Parser<'a> {
}
}

if self.check_too_many_raw_str_terminators(&mut err) {
return Err(err);
}

let sm = self.sess.source_map();
if self.prev_token.span == DUMMY_SP {
// Account for macro context where the previous span might not be
@@ -282,6 +286,29 @@ impl<'a> Parser<'a> {
Err(err)
}

/// Detects a raw (byte) string literal immediately followed by a `#` token,
/// which indicates the string was closed with too many `#`.
///
/// When that pattern is found, `err` is rewritten into a dedicated
/// diagnostic with a suggestion to remove the extra `#`, and `true` is
/// returned. Otherwise `err` is left untouched and `false` is returned.
fn check_too_many_raw_str_terminators(&mut self, err: &mut DiagnosticBuilder<'_>) -> bool {
    let opening_hashes = match (&self.prev_token.kind, &self.token.kind) {
        (
            TokenKind::Literal(Lit {
                kind: LitKind::StrRaw(count) | LitKind::ByteStrRaw(count),
                ..
            }),
            TokenKind::Pound,
        ) => *count,
        _ => return false,
    };

    err.set_primary_message("too many `#` when terminating raw string");
    err.span_suggestion(
        self.token.span,
        "remove the extra `#`",
        String::new(),
        Applicability::MachineApplicable,
    );
    err.note(&format!("the raw string started with {} `#`s", opening_hashes));
    true
}

pub fn maybe_annotate_with_ascription(
&mut self,
err: &mut DiagnosticBuilder<'_>,
@@ -491,7 +518,7 @@ impl<'a> Parser<'a> {
.unwrap_or_else(|_| pprust::expr_to_string(&e))
};
err.span_suggestion_verbose(
inner_op.span.shrink_to_hi(),
inner_op.span.shrink_to_hi(),
"split the comparison into two",
format!(" && {}", expr_to_str(&r1)),
Applicability::MaybeIncorrect,
@@ -1086,7 +1113,7 @@ impl<'a> Parser<'a> {
self.look_ahead(2, |t| t.is_ident())
|| self.look_ahead(1, |t| t == &token::ModSep)
&& (self.look_ahead(2, |t| t.is_ident()) || // `foo:bar::baz`
self.look_ahead(2, |t| t == &token::Lt)) // `foo:bar::<baz>`
self.look_ahead(2, |t| t == &token::Lt)) // `foo:bar::<baz>`
}

pub(super) fn recover_seq_parse_error(
4 changes: 0 additions & 4 deletions src/test/ui/parser/raw-str-unbalanced.rs

This file was deleted.

8 changes: 0 additions & 8 deletions src/test/ui/parser/raw-str-unbalanced.stderr

This file was deleted.

File renamed without changes.
Original file line number Diff line number Diff line change
@@ -2,7 +2,9 @@ error[E0748]: unterminated raw string
--> $DIR/raw-byte-string-eof.rs:2:5
|
LL | br##"a"#;
| ^ unterminated raw string
| ^ - help: consider terminating the string here: `##`
| |
| unterminated raw string
|
= note: this raw string should be terminated with `"##`

File renamed without changes.
File renamed without changes.
14 changes: 14 additions & 0 deletions src/test/ui/parser/raw/raw-str-in-macro-call.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// check-pass

macro_rules! m1 {
($tt:tt #) => ()
}

macro_rules! m2 {
($tt:tt) => ()
}

fn main() {
m1!(r#"abc"##);
m2!(r#"abc"#);
}
4 changes: 4 additions & 0 deletions src/test/ui/parser/raw/raw-str-unbalanced.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
static s: &'static str =
r#"
"## //~ too many `#` when terminating raw string
;
10 changes: 10 additions & 0 deletions src/test/ui/parser/raw/raw-str-unbalanced.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
error: too many `#` when terminating raw string
--> $DIR/raw-str-unbalanced.rs:3:9
|
LL | "##
| ^ help: remove the extra `#`
|
= note: the raw string started with 1 `#`s

error: aborting due to previous error

File renamed without changes.
4 changes: 4 additions & 0 deletions src/test/ui/parser/raw/raw-string-2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
fn main() {
let x = r###"here's a long string"# "# "##;
//~^ ERROR unterminated raw string
}
11 changes: 11 additions & 0 deletions src/test/ui/parser/raw/raw-string-2.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
error[E0748]: unterminated raw string
--> $DIR/raw-string-2.rs:2:13
|
LL | let x = r###"here's a long string"# "# "##;
| ^ unterminated raw string -- help: consider terminating the string here: `###`
|
= note: this raw string should be terminated with `"###`

error: aborting due to previous error

For more information about this error, try `rustc --explain E0748`.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
error[E0748]: unterminated raw string
--> $DIR/raw_string.rs:2:13
--> $DIR/raw-string.rs:2:13
|
LL | let x = r##"lol"#;
| ^ unterminated raw string
| ^ - help: consider terminating the string here: `##`
| |
| unterminated raw string
|
= note: this raw string should be terminated with `"##`