Skip to content

Basic validation for character literals #184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 4, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions crates/ra_syntax/src/ast/generated.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,40 @@ impl<'a> AstNode<'a> for CastExpr<'a> {

// Inherent impl for `CastExpr`; it declares no methods.
impl<'a> CastExpr<'a> {}

// Char

/// Owning wrapper around a `SyntaxNode`.
///
/// Use `ast()` to obtain the borrowed `Char` view of the wrapped node.
#[derive(Debug, Clone)]
pub struct CharNode(SyntaxNode);

impl CharNode {
    /// Borrows the wrapped node and views it as a `Char`.
    ///
    /// # Panics
    ///
    /// Panics if `Char::cast` rejects the wrapped node.
    pub fn ast(&self) -> Char {
        let borrowed = self.0.borrowed();
        Char::cast(borrowed).unwrap()
    }
}

impl<'a> From<Char<'a>> for CharNode {
    /// Builds an owning `CharNode` from the owned form of the syntax node behind `ast`.
    fn from(ast: Char<'a>) -> CharNode {
        CharNode(ast.syntax().owned())
    }
}
/// Borrowed, `Copy` AST view over a syntax node that lives for `'a`.
///
/// Values are produced by `AstNode::cast`, which only accepts nodes of kind `CHAR`.
#[derive(Debug, Clone, Copy)]
pub struct Char<'a> {
// The underlying borrowed syntax node.
syntax: SyntaxNodeRef<'a>,
}

impl<'a> AstNode<'a> for Char<'a> {
    /// Wraps `syntax` as a `Char` when its kind is `CHAR`; returns `None` for any other kind.
    fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> {
        if let CHAR = syntax.kind() {
            Some(Char { syntax })
        } else {
            None
        }
    }

    /// Returns the borrowed syntax node this view wraps.
    fn syntax(self) -> SyntaxNodeRef<'a> {
        self.syntax
    }
}

// Inherent impl for `Char`; it declares no methods.
impl<'a> Char<'a> {}

// Comment

#[derive(Debug, Clone)]
Expand Down
6 changes: 6 additions & 0 deletions crates/ra_syntax/src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ impl<'a> Lifetime<'a> {
}
}

impl<'a> Char<'a> {
    /// Returns the source text of this char literal, including its quotes.
    ///
    /// # Panics
    ///
    /// Panics if the underlying syntax node has no leaf text.
    pub fn text(&self) -> &SmolStr {
        // `leaf_text()` already yields a reference once unwrapped, so no extra
        // borrow is needed (this mirrors `Comment::text`).
        self.syntax().leaf_text().unwrap()
    }
}

impl<'a> Comment<'a> {
pub fn text(&self) -> &SmolStr {
self.syntax().leaf_text().unwrap()
Expand Down
1 change: 1 addition & 0 deletions crates/ra_syntax/src/grammar.ron
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ Grammar(
"PrefixExpr": (),
"RangeExpr": (),
"BinExpr": (),
"Char": (),
"Literal": (),

"Expr": (
Expand Down
7 changes: 5 additions & 2 deletions crates/ra_syntax/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ mod grammar;
mod parser_api;
mod parser_impl;
mod reparsing;

mod string_lexing;
mod syntax_kinds;
pub mod text_utils;
/// Utilities for simple uses of the parser.
pub mod utils;
mod validation;
mod yellow;

pub use crate::{
Expand Down Expand Up @@ -98,6 +99,8 @@ impl File {
self.root.borrowed()
}
/// Returns the errors stored in the tree's root data, followed by the errors
/// reported by `validation::validate` for this file.
pub fn errors(&self) -> Vec<SyntaxError> {
    // Clone the stored errors so the validation results can be appended
    // without touching the tree's own data.
    let mut errors = self.root.root_data().clone();
    errors.extend(validation::validate(self));
    errors
}
}
311 changes: 311 additions & 0 deletions crates/ra_syntax/src/string_lexing/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,311 @@
use self::CharComponentKind::*;
use rowan::{TextRange, TextUnit};

/// Creates an iterator over the components of the char literal `src`.
///
/// `src` is expected to start with the opening quote; the iterator asserts this
/// on its first call to `next`. `has_closing_quote` starts out `false`.
pub fn parse_char_literal(src: &str) -> CharComponentIterator {
    let parser = Parser::new(src);
    CharComponentIterator {
        parser,
        has_closing_quote: false,
    }
}

/// One lexical piece of a char literal: a plain character or an escape sequence.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct CharComponent {
// Byte range of the component, measured from the start of the literal text
// (offset 0 is the opening quote).
pub range: TextRange,
// Which kind of component the range holds.
pub kind: CharComponentKind,
}

impl CharComponent {
fn new(range: TextRange, kind: CharComponentKind) -> CharComponent {
CharComponent { range, kind }
}
}

/// Classification of a `CharComponent`.
///
/// The parser only classifies; it does not check that an escape is well-formed.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum CharComponentKind {
// A single character that does not start with a backslash.
CodePoint,
// A backslash followed by at most one character other than `x` or `u`.
AsciiEscape,
// `\x` followed by the characters taken as its code.
AsciiCodeEscape,
// `\u`, optionally followed by a `{...}` group (which may be unterminated).
UnicodeEscape,
}

/// Iterator that yields the `CharComponent`s of a char literal one at a time.
pub struct CharComponentIterator<'a> {
// Cursor over the literal's source text.
parser: Parser<'a>,
// Set to `true` when the iterator, after yielding every component, consumes a
// closing `'`. Only meaningful once `next` has returned `None`.
pub has_closing_quote: bool,
}

impl<'a> Iterator for CharComponentIterator<'a> {
    type Item = CharComponent;

    /// Yields the next component of the literal, or `None` once only the
    /// (optional) closing quote is left.
    ///
    /// When the components are exhausted and a closing `'` follows, it is
    /// consumed and `has_closing_quote` is set.
    ///
    /// # Panics
    ///
    /// Panics if the literal is empty or does not start with `'`, or if input
    /// remains after the closing quote.
    fn next(&mut self) -> Option<CharComponent> {
        // Position 0 means nothing has been consumed yet: skip the opening quote.
        if self.parser.pos == 0 {
            let opening = self.parser.advance();
            assert!(
                opening == '\'',
                "char literal should start with a quote"
            );
        }

        let component = self.parser.parse_char_component();
        if component.is_some() {
            return component;
        }

        // We get here when there are no char components left to parse
        if let Some('\'') = self.parser.peek() {
            self.parser.advance();
            self.has_closing_quote = true;
        }

        assert!(
            self.parser.peek().is_none(),
            "char literal should leave no unparsed input: src = {}, pos = {}, length = {}",
            self.parser.src,
            self.parser.pos,
            self.parser.src.len()
        );

        None
    }
}

/// Cursor over the source text of a literal.
pub struct Parser<'a> {
// The text being parsed.
src: &'a str,
// Byte offset of the next unconsumed character in `src`; it advances by
// whole characters (`len_utf8`).
pos: usize,
}

impl<'a> Parser<'a> {
    /// Creates a parser positioned at the start of `src`.
    pub fn new(src: &'a str) -> Parser<'a> {
        Parser { src, pos: 0 }
    }

    // Utility methods

    /// Returns the next unconsumed character without consuming it, or `None`
    /// at end of input.
    pub fn peek(&self) -> Option<char> {
        // Slicing at `src.len()` gives an empty string, so end of input needs
        // no separate check.
        self.src[self.pos..].chars().next()
    }

    /// Consumes and returns the next character.
    ///
    /// # Panics
    ///
    /// Panics if the end of input has been reached.
    pub fn advance(&mut self) -> char {
        let next = self
            .peek()
            .expect("cannot advance if end of input is reached");
        self.pos += next.len_utf8();
        next
    }

    /// Returns the current byte offset into the source as a `TextUnit`.
    pub fn get_pos(&self) -> TextUnit {
        // NOTE: the `as u32` cast truncates for offsets that do not fit in 32 bits.
        (self.pos as u32).into()
    }

    // Char parsing methods

    /// Parses what follows `\u`; `start` is the offset of the backslash.
    ///
    /// If a `{` follows, everything up to and including the next `}` is taken,
    /// or everything up to the end of input when no `}` exists.
    fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent {
        // Note: validation of UnicodeEscape will be done elsewhere:
        // * Only hex digits or underscores allowed
        // * Max 6 chars
        // * Within allowed range (must be at most 10FFFF)
        if self.peek() == Some('{') {
            self.advance();

            // Parse anything until we reach `}`
            while let Some(next) = self.peek() {
                self.advance();
                if next == '}' {
                    break;
                }
            }
        }

        let end = self.get_pos();
        CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
    }

    /// Parses what follows `\x`; `start` is the offset of the backslash.
    ///
    /// Takes at most two characters, stopping early at a `'` or at end of input.
    fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent {
        // Note: validation of AsciiCodeEscape will be done elsewhere:
        // * First digit is octal
        // * Second digit is hex
        //
        // The limit is counted in characters rather than bytes, so a multi-byte
        // character cannot step over it and drag the rest of the literal into
        // the escape.
        for _ in 0..2 {
            match self.peek() {
                Some(next) if next != '\'' => {
                    self.advance();
                }
                _ => break,
            }
        }

        let end = self.get_pos();
        CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape)
    }

    /// Parses an escape whose backslash has already been consumed; `start` is
    /// the offset of that backslash.
    ///
    /// `\x` and `\u` are delegated to their own parsers; any other following
    /// character yields a two-character `AsciiEscape`. A backslash at the very
    /// end of input yields an `AsciiEscape` covering just the backslash.
    fn parse_escape(&mut self, start: TextUnit) -> CharComponent {
        // Note: validation of AsciiEscape will be done elsewhere:
        // * The escape sequence is non-empty
        // * The escape sequence is valid
        let next = match self.peek() {
            Some(_) => self.advance(),
            None => {
                // The backslash was consumed, so the range must include it
                // instead of being empty.
                let end = self.get_pos();
                return CharComponent::new(TextRange::from_to(start, end), AsciiEscape);
            }
        };

        match next {
            'x' => self.parse_ascii_code_escape(start),
            'u' => self.parse_unicode_escape(start),
            _ => {
                let end = self.get_pos();
                CharComponent::new(TextRange::from_to(start, end), AsciiEscape)
            }
        }
    }

    /// Parses the next component, or returns `None` at end of input or when the
    /// next character is a `'` (which is left unconsumed).
    pub fn parse_char_component(&mut self) -> Option<CharComponent> {
        let next = self.peek()?;

        // Ignore character close
        if next == '\'' {
            return None;
        }

        let start = self.get_pos();
        self.advance();

        if next == '\\' {
            return Some(self.parse_escape(start));
        }

        let end = self.get_pos();
        Some(CharComponent::new(
            TextRange::from_to(start, end),
            CodePoint,
        ))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the component iterator over `src` to exhaustion and returns whether
    /// a closing quote was seen together with all components.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut component_iterator = super::parse_char_literal(src);
        // `by_ref` keeps the iterator usable so the flag can be read afterwards.
        let components: Vec<_> = component_iterator.by_ref().collect();
        (component_iterator.has_closing_quote, components)
    }

    /// Parses `src`, asserting it is unclosed and has exactly one component.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(!has_closing_quote, "char should not have closing quote");
        assert_eq!(components.len(), 1);
        components[0].clone()
    }

    /// Parses `src`, asserting it is closed and has exactly one component.
    fn closed_char_component(src: &str) -> CharComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        assert_eq!(
            components.len(),
            1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components[0].clone()
    }

    /// Parses `src`, asserting it is closed, and returns all components.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and closing quotes of `src`.
    fn range_closed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32 - 1).into())
    }

    /// Range of everything after the opening quote of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32).into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = closed_char_component(&escape_sequence);
            let expected_range = range_closed(&escape_sequence);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&escape_sequence);
            let expected_range = range_unclosed(&escape_sequence);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_empty_char() {
        let (has_closing_quote, components) = parse("''");
        assert!(has_closing_quote, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let lit_text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CharComponentKind::AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CharComponentKind::CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CharComponentKind::AsciiEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }

    #[test]
    fn test_no_escapes() {
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CharComponentKind::CodePoint);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }
}
Loading