From 969095e9f6be0bb13a69c715c6ee4910814a065b Mon Sep 17 00:00:00 2001 From: Lee Byron Date: Thu, 24 Sep 2015 15:51:32 -0700 Subject: [PATCH] [RFC] Clarify and restrict unicode support This proposal alters the parser grammar to be more specific about what unicode characters are allowed as source, restricts those characters interpreted as white space or line breaks, and clarifies line break behavior relative to error reporting with a non-normative note. Implements https://github.com/facebook/graphql/pull/96 --- src/language/__tests__/lexer.js | 67 +++++++++++++++++++++------ src/language/lexer.js | 81 ++++++++++++++++++++++----------- 2 files changed, 109 insertions(+), 39 deletions(-) diff --git a/src/language/__tests__/lexer.js b/src/language/__tests__/lexer.js index 03ef3dbaa6..30d247a1aa 100644 --- a/src/language/__tests__/lexer.js +++ b/src/language/__tests__/lexer.js @@ -22,6 +22,25 @@ function lexErr(str) { describe('Lexer', () => { + it('disallows uncommon control characters', () => { + + expect(lexErr('\u0007') + ).to.throw( + 'Syntax Error GraphQL (1:1) Invalid character "\\u0007"' + ); + + }); + + it('accepts BOM header', () => { + expect(lexOne('\uFEFF foo') + ).to.deep.equal({ + kind: TokenKind.NAME, + start: 2, + end: 5, + value: 'foo' + }); + }); + it('skips whitespace', () => { expect(lexOne(` @@ -136,53 +155,75 @@ describe('Lexer', () => { it('lex reports useful string errors', () => { + expect( + lexErr('"') + ).to.throw('Syntax Error GraphQL (1:2) Unterminated string'); + expect( lexErr('"no end quote') ).to.throw('Syntax Error GraphQL (1:14) Unterminated string'); expect( - lexErr('"multi\nline"') - ).to.throw('Syntax Error GraphQL (1:7) Unterminated string'); + lexErr('"contains unescaped \u0007 control char"') + ).to.throw( + 'Syntax Error GraphQL (1:21) Invalid character within String: "\\u0007".' 
+ ); expect( - lexErr('"multi\rline"') - ).to.throw('Syntax Error GraphQL (1:7) Unterminated string'); + lexErr('"null-byte is not \u0000 end of file"') + ).to.throw( + 'Syntax Error GraphQL (1:19) Invalid character within String: "\\u0000".' + ); expect( - lexErr('"multi\u2028line"') + lexErr('"multi\nline"') ).to.throw('Syntax Error GraphQL (1:7) Unterminated string'); expect( - lexErr('"multi\u2029line"') + lexErr('"multi\rline"') ).to.throw('Syntax Error GraphQL (1:7) Unterminated string'); expect( lexErr('"bad \\z esc"') - ).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence'); + ).to.throw( + 'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\z.' + ); expect( lexErr('"bad \\x esc"') - ).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence'); + ).to.throw( + 'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\x.' + ); expect( lexErr('"bad \\u1 esc"') - ).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence'); + ).to.throw( + 'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\u1 es.' + ); expect( lexErr('"bad \\u0XX1 esc"') - ).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence'); + ).to.throw( + 'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\u0XX1.' + ); expect( lexErr('"bad \\uXXXX esc"') - ).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence'); + ).to.throw( + 'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uXXXX.' + ); expect( lexErr('"bad \\uFXXX esc"') - ).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence'); + ).to.throw( + 'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uFXXX.' + ); expect( lexErr('"bad \\uXXXF esc"') - ).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence'); + ).to.throw( + 'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uXXXF.' 
+ ); }); it('lexes numbers', () => { diff --git a/src/language/lexer.js b/src/language/lexer.js index 5fcf265941..0f6024831b 100644 --- a/src/language/lexer.js +++ b/src/language/lexer.js @@ -110,7 +110,6 @@ tokenDescription[TokenKind.FLOAT] = 'Float'; tokenDescription[TokenKind.STRING] = 'String'; var charCodeAt = String.prototype.charCodeAt; -var fromCharCode = String.fromCharCode; var slice = String.prototype.slice; /** @@ -125,6 +124,10 @@ function makeToken( return { kind, start, end, value }; } +function printCharCode(code) { + return isNaN(code) ? 'EOF' : JSON.stringify(String.fromCharCode(code)); +} + /** * Gets the next token from the source starting at the given position. * @@ -137,12 +140,22 @@ function readToken(source: Source, fromPosition: number): Token { var bodyLength = body.length; var position = positionAfterWhitespace(body, fromPosition); - var code = charCodeAt.call(body, position); if (position >= bodyLength) { return makeToken(TokenKind.EOF, position, position); } + var code = charCodeAt.call(body, position); + + // SourceCharacter + if (code < 0x0020 && code !== 0x0009 && code !== 0x000A && code !== 0x000D) { + throw syntaxError( + source, + position, + `Invalid character ${printCharCode(code)}.` + ); + } + switch (code) { // ! 
case 33: return makeToken(TokenKind.BANG, position, position + 1); @@ -201,7 +214,7 @@ function readToken(source: Source, fromPosition: number): Token { throw syntaxError( source, position, - `Unexpected character "${fromCharCode(code)}".` + `Unexpected character ${printCharCode(code)}.` ); } @@ -215,14 +228,18 @@ function positionAfterWhitespace(body: string, startPosition: number): number { var position = startPosition; while (position < bodyLength) { var code = charCodeAt.call(body, position); - // Skip whitespace + // Skip Ignored if ( - code === 32 || // space - code === 44 || // comma - code === 160 || // '\xa0' - code === 0x2028 || // line separator - code === 0x2029 || // paragraph separator - code > 8 && code < 14 // whitespace + // BOM + code === 0xFEFF || + // White Space + code === 0x0009 || // tab + code === 0x0020 || // space + // Line Terminator + code === 0x000A || // new line + code === 0x000D || // carriage return + // Comma + code === 0x002C ) { ++position; // Skip comments @@ -230,8 +247,9 @@ function positionAfterWhitespace(body: string, startPosition: number): number { ++position; while ( position < bodyLength && - (code = charCodeAt.call(body, position)) && - code !== 10 && code !== 13 && code !== 0x2028 && code !== 0x2029 + (code = charCodeAt.call(body, position)) !== null && + // SourceCharacter but not LineTerminator + (code > 0x001F || code === 0x0009) && code !== 0x000A && code !== 0x000D ) { ++position; } @@ -265,7 +283,7 @@ function readNumber(source, start, firstCode) { throw syntaxError( source, position, - `Invalid number, unexpected digit after 0: "${fromCharCode(code)}".` + `Invalid number, unexpected digit after 0: ${printCharCode(code)}.` ); } } else { @@ -315,8 +333,7 @@ function readDigits(source, start, firstCode) { throw syntaxError( source, position, - 'Invalid number, expected digit but got: ' + - (code ? `"${fromCharCode(code)}"` : 'EOF') + '.' 
+ `Invalid number, expected digit but got: ${printCharCode(code)}.` ); } @@ -329,15 +346,26 @@ function readString(source, start) { var body = source.body; var position = start + 1; var chunkStart = position; - var code; + var code = 0; var value = ''; while ( position < body.length && - (code = charCodeAt.call(body, position)) && - code !== 34 && - code !== 10 && code !== 13 && code !== 0x2028 && code !== 0x2029 + (code = charCodeAt.call(body, position)) !== null && + // not LineTerminator + code !== 0x000A && code !== 0x000D && + // not Quote (") + code !== 34 ) { + // SourceCharacter + if (code < 0x0020 && code !== 0x0009) { + throw syntaxError( + source, + position, + `Invalid character within String: ${printCharCode(code)}.` + ); + } + ++position; if (code === 92) { // \ value += slice.call(body, chunkStart, position - 1); @@ -351,7 +379,7 @@ function readString(source, start) { case 110: value += '\n'; break; case 114: value += '\r'; break; case 116: value += '\t'; break; - case 117: + case 117: // u var charCode = uniCharCode( charCodeAt.call(body, position + 1), charCodeAt.call(body, position + 2), @@ -362,17 +390,18 @@ function readString(source, start) { throw syntaxError( source, position, - 'Bad character escape sequence.' + `Invalid character escape sequence: ` + + `\\u${body.slice(position + 1, position + 5)}.` ); } - value += fromCharCode(charCode); + value += String.fromCharCode(charCode); position += 4; break; default: throw syntaxError( source, position, - 'Bad character escape sequence.' 
+ `Invalid character escape sequence: \\${String.fromCharCode(code)}.` ); } ++position; @@ -380,7 +409,7 @@ function readString(source, start) { } } - if (code !== 34) { + if (code !== 34) { // quote (") throw syntaxError(source, position, 'Unterminated string.'); } @@ -428,10 +457,10 @@ function readName(source, position) { var body = source.body; var bodyLength = body.length; var end = position + 1; - var code; + var code = 0; while ( end !== bodyLength && - (code = charCodeAt.call(body, end)) && + (code = charCodeAt.call(body, end)) !== null && ( code === 95 || // _ code >= 48 && code <= 57 || // 0-9