Skip to content

Commit c7e2f0b

Browse files
committed
[RFC] Clarify and restrict unicode support
This proposal alters the parser grammar to be more specific about which Unicode characters are allowed as source, restricts the characters that are interpreted as white space or line breaks, and clarifies line break behavior relative to error reporting with a non-normative note. Implements graphql/graphql-spec#96
1 parent 5d4d531 commit c7e2f0b

File tree

2 files changed

+108
-38
lines changed

2 files changed

+108
-38
lines changed

src/language/__tests__/lexer.js

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,25 @@ function lexErr(str) {
2222

2323
describe('Lexer', () => {
2424

25+
it('disallows uncommon control characters', () => {
26+
27+
expect(lexErr('\u0007')
28+
).to.throw(
29+
'Syntax Error GraphQL (1:1) Invalid character "\\u0007"'
30+
);
31+
32+
});
33+
34+
it('accepts BOM header', () => {
35+
expect(lexOne('\uFEFF foo')
36+
).to.deep.equal({
37+
kind: TokenKind.NAME,
38+
start: 2,
39+
end: 5,
40+
value: 'foo'
41+
});
42+
});
43+
2544
it('skips whitespace', () => {
2645

2746
expect(lexOne(`
@@ -136,53 +155,75 @@ describe('Lexer', () => {
136155

137156
it('lex reports useful string errors', () => {
138157

158+
expect(
159+
lexErr('"')
160+
).to.throw('Syntax Error GraphQL (1:2) Unterminated string');
161+
139162
expect(
140163
lexErr('"no end quote')
141164
).to.throw('Syntax Error GraphQL (1:14) Unterminated string');
142165

143166
expect(
144-
lexErr('"multi\nline"')
145-
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');
167+
lexErr('"contains unescaped \u0007 control char"')
168+
).to.throw(
169+
'Syntax Error GraphQL (1:21) Invalid character within String: "\\u0007".'
170+
);
146171

147172
expect(
148-
lexErr('"multi\rline"')
149-
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');
173+
lexErr('"null-byte is not \u0000 end of file"')
174+
).to.throw(
175+
'Syntax Error GraphQL (1:19) Invalid character within String: "\\u0000".'
176+
);
150177

151178
expect(
152-
lexErr('"multi\u2028line"')
179+
lexErr('"multi\nline"')
153180
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');
154181

155182
expect(
156-
lexErr('"multi\u2029line"')
183+
lexErr('"multi\rline"')
157184
).to.throw('Syntax Error GraphQL (1:7) Unterminated string');
158185

159186
expect(
160187
lexErr('"bad \\z esc"')
161-
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
188+
).to.throw(
189+
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\z.'
190+
);
162191

163192
expect(
164193
lexErr('"bad \\x esc"')
165-
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
194+
).to.throw(
195+
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\x.'
196+
);
166197

167198
expect(
168199
lexErr('"bad \\u1 esc"')
169-
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
200+
).to.throw(
201+
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\u1 es.'
202+
);
170203

171204
expect(
172205
lexErr('"bad \\u0XX1 esc"')
173-
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
206+
).to.throw(
207+
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\u0XX1.'
208+
);
174209

175210
expect(
176211
lexErr('"bad \\uXXXX esc"')
177-
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
212+
).to.throw(
213+
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uXXXX.'
214+
);
178215

179216
expect(
180217
lexErr('"bad \\uFXXX esc"')
181-
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
218+
).to.throw(
219+
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uFXXX.'
220+
);
182221

183222
expect(
184223
lexErr('"bad \\uXXXF esc"')
185-
).to.throw('Syntax Error GraphQL (1:7) Bad character escape sequence');
224+
).to.throw(
225+
'Syntax Error GraphQL (1:7) Invalid character escape sequence: \\uXXXF.'
226+
);
186227
});
187228

188229
it('lexes numbers', () => {

src/language/lexer.js

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@ tokenDescription[TokenKind.FLOAT] = 'Float';
110110
tokenDescription[TokenKind.STRING] = 'String';
111111

112112
var charCodeAt = String.prototype.charCodeAt;
113-
var fromCharCode = String.fromCharCode;
114113
var slice = String.prototype.slice;
115114

116115
/**
@@ -125,6 +124,10 @@ function makeToken(
125124
return { kind, start, end, value };
126125
}
127126

127+
function printCharCode(code) {
128+
return isNaN(code) ? 'EOF' : JSON.stringify(String.fromCharCode(code));
129+
}
130+
128131
/**
129132
* Gets the next token from the source starting at the given position.
130133
*
@@ -137,12 +140,22 @@ function readToken(source: Source, fromPosition: number): Token {
137140
var bodyLength = body.length;
138141

139142
var position = positionAfterWhitespace(body, fromPosition);
140-
var code = charCodeAt.call(body, position);
141143

142144
if (position >= bodyLength) {
143145
return makeToken(TokenKind.EOF, position, position);
144146
}
145147

148+
var code = charCodeAt.call(body, position);
149+
150+
// SourceCharacter
151+
if (code < 0x0020 && code !== 0x0009 && code !== 0x000A && code !== 0x000D) {
152+
throw syntaxError(
153+
source,
154+
position,
155+
`Invalid character ${printCharCode(code)}.`
156+
);
157+
}
158+
146159
switch (code) {
147160
// !
148161
case 33: return makeToken(TokenKind.BANG, position, position + 1);
@@ -201,7 +214,7 @@ function readToken(source: Source, fromPosition: number): Token {
201214
throw syntaxError(
202215
source,
203216
position,
204-
`Unexpected character "${fromCharCode(code)}".`
217+
`Unexpected character ${printCharCode(code)}.`
205218
);
206219
}
207220

@@ -217,21 +230,26 @@ function positionAfterWhitespace(body: string, startPosition: number): number {
217230
var code = charCodeAt.call(body, position);
218231
// Skip whitespace
219232
if (
220-
code === 32 || // space
221-
code === 44 || // comma
222-
code === 160 || // '\xa0'
223-
code === 0x2028 || // line separator
224-
code === 0x2029 || // paragraph separator
225-
code > 8 && code < 14 // whitespace
233+
// BOM
234+
code === 0xFEFF ||
235+
// White Space
236+
code === 0x0009 || // tab
237+
code === 0x0020 || // space
238+
// Line Terminator
239+
code === 0x000A || // new line
240+
code === 0x000D || // carriage return
241+
// Comma
242+
code === 0x002C
226243
) {
227244
++position;
228245
// Skip comments
229246
} else if (code === 35) { // #
230247
++position;
231248
while (
232249
position < bodyLength &&
233-
(code = charCodeAt.call(body, position)) &&
234-
code !== 10 && code !== 13 && code !== 0x2028 && code !== 0x2029
250+
(code = charCodeAt.call(body, position)) !== null &&
251+
// SourceCharacter but not LineTerminator
252+
(code > 0x001F || code === 0x0009) && code !== 0x000A && code !== 0x000D
235253
) {
236254
++position;
237255
}
@@ -265,7 +283,7 @@ function readNumber(source, start, firstCode) {
265283
throw syntaxError(
266284
source,
267285
position,
268-
`Invalid number, unexpected digit after 0: "${fromCharCode(code)}".`
286+
`Invalid number, unexpected digit after 0: ${printCharCode(code)}.`
269287
);
270288
}
271289
} else {
@@ -315,8 +333,7 @@ function readDigits(source, start, firstCode) {
315333
throw syntaxError(
316334
source,
317335
position,
318-
'Invalid number, expected digit but got: ' +
319-
(code ? `"${fromCharCode(code)}"` : 'EOF') + '.'
336+
`Invalid number, expected digit but got: ${printCharCode(code)}.`
320337
);
321338
}
322339

@@ -329,15 +346,26 @@ function readString(source, start) {
329346
var body = source.body;
330347
var position = start + 1;
331348
var chunkStart = position;
332-
var code;
349+
var code = 0;
333350
var value = '';
334351

335352
while (
336353
position < body.length &&
337-
(code = charCodeAt.call(body, position)) &&
338-
code !== 34 &&
339-
code !== 10 && code !== 13 && code !== 0x2028 && code !== 0x2029
354+
(code = charCodeAt.call(body, position)) !== null &&
355+
// not LineTerminator
356+
code !== 0x000A && code !== 0x000D &&
357+
// not Quote (")
358+
code !== 34
340359
) {
360+
// SourceCharacter
361+
if (code < 0x0020 && code !== 0x0009) {
362+
throw syntaxError(
363+
source,
364+
position,
365+
`Invalid character within String: ${printCharCode(code)}.`
366+
);
367+
}
368+
341369
++position;
342370
if (code === 92) { // \
343371
value += slice.call(body, chunkStart, position - 1);
@@ -351,7 +379,7 @@ function readString(source, start) {
351379
case 110: value += '\n'; break;
352380
case 114: value += '\r'; break;
353381
case 116: value += '\t'; break;
354-
case 117:
382+
case 117: // u
355383
var charCode = uniCharCode(
356384
charCodeAt.call(body, position + 1),
357385
charCodeAt.call(body, position + 2),
@@ -362,25 +390,26 @@ function readString(source, start) {
362390
throw syntaxError(
363391
source,
364392
position,
365-
'Bad character escape sequence.'
393+
`Invalid character escape sequence: ` +
394+
`\\u${body.slice(position + 1, position + 5)}.`
366395
);
367396
}
368-
value += fromCharCode(charCode);
397+
value += String.fromCharCode(charCode);
369398
position += 4;
370399
break;
371400
default:
372401
throw syntaxError(
373402
source,
374403
position,
375-
'Bad character escape sequence.'
404+
`Invalid character escape sequence: \\${String.fromCharCode(code)}.`
376405
);
377406
}
378407
++position;
379408
chunkStart = position;
380409
}
381410
}
382411

383-
if (code !== 34) {
412+
if (code !== 34) { // quote (")
384413
throw syntaxError(source, position, 'Unterminated string.');
385414
}
386415

@@ -428,10 +457,10 @@ function readName(source, position) {
428457
var body = source.body;
429458
var bodyLength = body.length;
430459
var end = position + 1;
431-
var code;
460+
var code = 0;
432461
while (
433462
end !== bodyLength &&
434-
(code = charCodeAt.call(body, end)) &&
463+
(code = charCodeAt.call(body, end)) !== null &&
435464
(
436465
code === 95 || // _
437466
code >= 48 && code <= 57 || // 0-9

0 commit comments

Comments
 (0)