Skip to content

Commit 741be04

Browse files
authored
Allow all Unicode characters (#207)
1 parent db26ebc commit 741be04

File tree

5 files changed

+115
-17
lines changed

5 files changed

+115
-17
lines changed

spec/fluent.ebnf

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,14 @@ Identifier ::= [a-zA-Z] [a-zA-Z0-9_-]*
9595

9696
/* Content Characters
9797
*
98-
* Translation content can be written using most Unicode characters, with the
99-
* exception of C0 control characters (but allowing tab), surrogate blocks and
100-
* non-characters (U+FFFE, U+FFFF).
98+
* Translation content can be written using any Unicode character. However,
99+
* some characters are considered special depending on the type of content
100+
* they're in. See text_char and quoted_char for more information.
101+
*
102+
* Some Unicode characters, even if allowed, should be avoided in Fluent
103+
* resources. See spec/recommendations.md.
101104
*/
102-
any_char ::= [\\u{9}\\u{20}-\\u{D7FF}\\u{E000}-\\u{FFFD}]
103-
| [\\u{10000}-\\u{10FFFF}]
105+
any_char ::= [\\u{0}-\\u{10FFFF}]
104106

105107
/* Text elements
106108
*
@@ -113,7 +115,7 @@ any_char ::= [\\u{9}\\u{20}-\\u{D7FF}\\u{E000}-\\u{FFFD}]
113115
*/
114116
special_text_char ::= "{"
115117
| "}"
116-
text_char ::= any_char - special_text_char
118+
text_char ::= any_char - special_text_char - line_end
117119
indented_char ::= text_char - "[" - "*" - "."
118120

119121
/* String literals
@@ -130,7 +132,7 @@ special_quoted_char ::= "\""
130132
| "\\"
131133
special_escape ::= "\\" special_quoted_char
132134
unicode_escape ::= "\\u" /[0-9a-fA-F]{4}/
133-
quoted_char ::= (any_char - special_quoted_char)
135+
quoted_char ::= (any_char - special_quoted_char - line_end)
134136
| special_escape
135137
| unicode_escape
136138

spec/recommendations.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Recommendations for Writing Fluent
2+
3+
## Unicode Characters
4+
5+
Fluent resources can be written using all Unicode characters. The recommended
6+
encoding for Fluent files is UTF-8.
7+
8+
Translation authors and developers are encouraged to avoid characters defined
9+
in the following code point ranges. They are either control characters or
10+
permanently undefined Unicode characters:
11+
12+
[U+0000-U+0008], [U+000B-U+000C], [U+000E-U+001F], [U+007F-U+009F],
13+
[U+FDD0-U+FDEF], [U+FFFE-U+FFFF], [U+1FFFE-U+1FFFF], [U+2FFFE-U+2FFFF], [U+3FFFE-U+3FFFF],
14+
[U+4FFFE-U+4FFFF], [U+5FFFE-U+5FFFF], [U+6FFFE-U+6FFFF], [U+7FFFE-U+7FFFF],
15+
[U+8FFFE-U+8FFFF], [U+9FFFE-U+9FFFF], [U+AFFFE-U+AFFFF], [U+BFFFE-U+BFFFF],
16+
[U+CFFFE-U+CFFFF], [U+DFFFE-U+DFFFF], [U+EFFFE-U+EFFFF], [U+FFFFE-U+FFFFF],
17+
[U+10FFFE-U+10FFFF].

syntax/grammar.mjs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -392,15 +392,16 @@ let Identifier =
392392
/* -------------------------------------------------------------------------- */
393393
/* Content Characters
394394
*
395-
* Translation content can be written using most Unicode characters, with the
396-
* exception of C0 control characters (but allowing tab), surrogate blocks and
397-
* non-characters (U+FFFE, U+FFFF).
395+
* Translation content can be written using any Unicode character. However,
396+
* some characters are considered special depending on the type of content
397+
* they're in. See text_char and quoted_char for more information.
398+
*
399+
* Some Unicode characters, even if allowed, should be avoided in Fluent
400+
* resources. See spec/recommendations.md.
398401
*/
399402

400403
let any_char =
401-
either(
402-
charset("\\u{9}\\u{20}-\\u{D7FF}\\u{E000}-\\u{FFFD}"),
403-
charset("\\u{10000}-\\u{10FFFF}"));
404+
charset("\\u{0}-\\u{10FFFF}");
404405

405406
/* -------------------------------------------------------------------------- */
406407
/* Text elements
@@ -418,10 +419,11 @@ let special_text_char =
418419
string("{"),
419420
string("}"));
420421

421-
let text_char =
422+
let text_char = defer(() =>
422423
and(
424+
not(line_end),
423425
not(special_text_char),
424-
any_char);
426+
any_char));
425427

426428
let indented_char =
427429
and(
@@ -459,13 +461,14 @@ let unicode_escape =
459461
regex(/[0-9a-fA-F]{4}/))
460462
.map(join);
461463

462-
let quoted_char =
464+
let quoted_char = defer(() =>
463465
either(
464466
and(
467+
not(line_end),
465468
not(special_quoted_char),
466469
any_char),
467470
special_escape,
468-
unicode_escape);
471+
unicode_escape));
469472

470473
/* ------- */
471474
/* Numbers */

test/fixtures/any_char.ftl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ↓ BEL, U+0007
2+
control0 = abcdef
3+
4+
# ↓ DEL, U+007F
5+
delete = abcdef
6+
7+
# ↓ BPM, U+0082
8+
control1 = abc‚def

test/fixtures/any_char.json

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"type": "Resource",
3+
"body": [
4+
{
5+
"type": "Message",
6+
"id": {
7+
"type": "Identifier",
8+
"name": "control0"
9+
},
10+
"value": {
11+
"type": "Pattern",
12+
"elements": [
13+
{
14+
"type": "TextElement",
15+
"value": "abc\u0007def"
16+
}
17+
]
18+
},
19+
"attributes": [],
20+
"comment": {
21+
"type": "Comment",
22+
"content": " ↓ BEL, U+0007"
23+
}
24+
},
25+
{
26+
"type": "Message",
27+
"id": {
28+
"type": "Identifier",
29+
"name": "delete"
30+
},
31+
"value": {
32+
"type": "Pattern",
33+
"elements": [
34+
{
35+
"type": "TextElement",
36+
"value": "abcdef"
37+
}
38+
]
39+
},
40+
"attributes": [],
41+
"comment": {
42+
"type": "Comment",
43+
"content": " ↓ DEL, U+007F"
44+
}
45+
},
46+
{
47+
"type": "Message",
48+
"id": {
49+
"type": "Identifier",
50+
"name": "control1"
51+
},
52+
"value": {
53+
"type": "Pattern",
54+
"elements": [
55+
{
56+
"type": "TextElement",
57+
"value": "abc‚def"
58+
}
59+
]
60+
},
61+
"attributes": [],
62+
"comment": {
63+
"type": "Comment",
64+
"content": " ↓ BPM, U+0082"
65+
}
66+
}
67+
]
68+
}

0 commit comments

Comments
 (0)