Skip to content

[RFC] Clarify and restrict unicode support #186

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 25, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 54 additions & 13 deletions src/language/__tests__/lexer.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,25 @@ function lexErr(str) {

describe('Lexer', () => {

it('disallows uncommon control characters', () => {

// U+0007 (BEL) appears as the very first character of the input, outside of
// any string, and the expected error is reported at line 1, column 1.
// NOTE(review): lexErr is defined outside this view — presumably it wraps
// the lexer call in a thunk so that `.to.throw` can invoke it; confirm.
expect(lexErr('\u0007')
).to.throw(
'Syntax Error GraphQL (1:1) Invalid character "\\u0007"'
);

});

// A leading byte-order mark (U+FEFF) followed by a space must not prevent the
// following name from being lexed.
it('accepts BOM header', () => {
// The input is BOM (index 0), space (index 1), then "foo" at indices 2..4,
// so the expected NAME token spans start 2 to end 5 (end is exclusive).
expect(lexOne('\uFEFF foo')
).to.deep.equal({
kind: TokenKind.NAME,
start: 2,
end: 5,
value: 'foo'
});
});

it('skips whitespace', () => {

expect(lexOne(`
Expand Down Expand Up @@ -136,53 +155,75 @@ describe('Lexer', () => {

it('lex reports useful string errors', () => {

expect(
lexErr('"')
).to.throw('Syntax Error GraphQL (1:2) Unterminated string');

expect(
lexErr('"no end quote')
).to.throw('Syntax Error GraphQL (1:14) Unterminated string');

expect(
lexErr('"multi\nline"')
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');
lexErr('"contains unescaped \u0007 control char"')
).to.throw(
'Syntax Error GraphQL (1:21) Invalid character within String: "\\u0007".'
);

expect(
lexErr('"multi\rline"')
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');
lexErr('"null-byte is not \u0000 end of file"')
).to.throw(
'Syntax Error GraphQL (1:19) Invalid character within String: "\\u0000".'
);

expect(
lexErr('"multi\u2028line"')
lexErr('"multi\nline"')
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');

expect(
lexErr('"multi\u2029line"')
lexErr('"multi\rline"')
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');

expect(
lexErr('"bad \\z esc"')
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
).to.throw(
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\z.'
);

expect(
lexErr('"bad \\x esc"')
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
).to.throw(
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\x.'
);

expect(
lexErr('"bad \\u1 esc"')
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
).to.throw(
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\u1 es.'
);

expect(
lexErr('"bad \\u0XX1 esc"')
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
).to.throw(
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\u0XX1.'
);

expect(
lexErr('"bad \\uXXXX esc"')
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
).to.throw(
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uXXXX.'
);

expect(
lexErr('"bad \\uFXXX esc"')
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
).to.throw(
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uFXXX.'
);

expect(
lexErr('"bad \\uXXXF esc"')
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
).to.throw(
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uXXXF.'
);
});

it('lexes numbers', () => {
Expand Down
81 changes: 55 additions & 26 deletions src/language/lexer.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ tokenDescription[TokenKind.FLOAT] = 'Float';
tokenDescription[TokenKind.STRING] = 'String';

var charCodeAt = String.prototype.charCodeAt;
var fromCharCode = String.fromCharCode;
var slice = String.prototype.slice;

/**
Expand All @@ -125,6 +124,10 @@ function makeToken(
return { kind, start, end, value };
}

/**
 * Formats a character code for display inside a message.
 *
 * A code that is not a number (isNaN) is rendered as the bare word `EOF`.
 * Any other code is converted to its one-character string and passed through
 * JSON.stringify, so the result is wrapped in double quotes and control
 * characters show up as escapes (e.g. "\u0007") rather than raw bytes.
 */
function printCharCode(code) {
  if (isNaN(code)) {
    return 'EOF';
  }
  var character = String.fromCharCode(code);
  return JSON.stringify(character);
}

/**
* Gets the next token from the source starting at the given position.
*
Expand All @@ -137,12 +140,22 @@ function readToken(source: Source, fromPosition: number): Token {
var bodyLength = body.length;

var position = positionAfterWhitespace(body, fromPosition);
var code = charCodeAt.call(body, position);

if (position >= bodyLength) {
return makeToken(TokenKind.EOF, position, position);
}

var code = charCodeAt.call(body, position);

// SourceCharacter
if (code < 0x0020 && code !== 0x0009 && code !== 0x000A && code !== 0x000D) {
throw syntaxError(
source,
position,
`Invalid character ${printCharCode(code)}.`
);
}

switch (code) {
// !
case 33: return makeToken(TokenKind.BANG, position, position + 1);
Expand Down Expand Up @@ -201,7 +214,7 @@ function readToken(source: Source, fromPosition: number): Token {
throw syntaxError(
source,
position,
`Unexpected character "${fromCharCode(code)}".`
`Unexpected character ${printCharCode(code)}.`
);
}

Expand All @@ -215,23 +228,28 @@ function positionAfterWhitespace(body: string, startPosition: number): number {
var position = startPosition;
while (position < bodyLength) {
var code = charCodeAt.call(body, position);
// Skip whitespace
// Skip Ignored
if (
code === 32 || // space
code === 44 || // comma
code === 160 || // '\xa0'
code === 0x2028 || // line separator
code === 0x2029 || // paragraph separator
code > 8 && code < 14 // whitespace
// BOM
code === 0xFEFF ||
// White Space
code === 0x0009 || // tab
code === 0x0020 || // space
// Line Terminator
code === 0x000A || // new line
code === 0x000D || // carriage return
// Comma
code === 0x002C
) {
++position;
// Skip comments
} else if (code === 35) { // #
++position;
while (
position < bodyLength &&
(code = charCodeAt.call(body, position)) &&
code !== 10 && code !== 13 && code !== 0x2028 && code !== 0x2029
(code = charCodeAt.call(body, position)) !== null &&
// SourceCharacter but not LineTerminator
(code > 0x001F || code === 0x0009) && code !== 0x000A && code !== 0x000D
) {
++position;
}
Expand Down Expand Up @@ -265,7 +283,7 @@ function readNumber(source, start, firstCode) {
throw syntaxError(
source,
position,
`Invalid number, unexpected digit after 0: "${fromCharCode(code)}".`
`Invalid number, unexpected digit after 0: ${printCharCode(code)}.`
);
}
} else {
Expand Down Expand Up @@ -315,8 +333,7 @@ function readDigits(source, start, firstCode) {
throw syntaxError(
source,
position,
'Invalid number, expected digit but got: ' +
(code ? `"${fromCharCode(code)}"` : 'EOF') + '.'
`Invalid number, expected digit but got: ${printCharCode(code)}.`
);
}

Expand All @@ -329,15 +346,26 @@ function readString(source, start) {
var body = source.body;
var position = start + 1;
var chunkStart = position;
var code;
var code = 0;
var value = '';

while (
position < body.length &&
(code = charCodeAt.call(body, position)) &&
code !== 34 &&
code !== 10 && code !== 13 && code !== 0x2028 && code !== 0x2029
(code = charCodeAt.call(body, position)) !== null &&
// not LineTerminator
code !== 0x000A && code !== 0x000D &&
// not Quote (")
code !== 34
) {
// SourceCharacter
if (code < 0x0020 && code !== 0x0009) {
throw syntaxError(
source,
position,
`Invalid character within String: ${printCharCode(code)}.`
);
}

++position;
if (code === 92) { // \
value += slice.call(body, chunkStart, position - 1);
Expand All @@ -351,7 +379,7 @@ function readString(source, start) {
case 110: value += '\n'; break;
case 114: value += '\r'; break;
case 116: value += '\t'; break;
case 117:
case 117: // u
var charCode = uniCharCode(
charCodeAt.call(body, position + 1),
charCodeAt.call(body, position + 2),
Expand All @@ -362,25 +390,26 @@ function readString(source, start) {
throw syntaxError(
source,
position,
'Bad character escape sequence.'
`Invalid character escape sequence: ` +
`\\u${body.slice(position + 1, position + 5)}.`
);
}
value += fromCharCode(charCode);
value += String.fromCharCode(charCode);
position += 4;
break;
default:
throw syntaxError(
source,
position,
'Bad character escape sequence.'
`Invalid character escape sequence: \\${String.fromCharCode(code)}.`
);
}
++position;
chunkStart = position;
}
}

if (code !== 34) {
if (code !== 34) { // quote (")
throw syntaxError(source, position, 'Unterminated string.');
}

Expand Down Expand Up @@ -428,10 +457,10 @@ function readName(source, position) {
var body = source.body;
var bodyLength = body.length;
var end = position + 1;
var code;
var code = 0;
while (
end !== bodyLength &&
(code = charCodeAt.call(body, end)) &&
(code = charCodeAt.call(body, end)) !== null &&
(
code === 95 || // _
code >= 48 && code <= 57 || // 0-9
Expand Down