Skip to content

Commit 516f683

Browse files
authored
Merge pull request #265 from benley/verbatim-strings
Implement verbatim string literals
2 parents 4b0e795 + 43b1802 commit 516f683

File tree

12 files changed

+151
-8
lines changed

12 files changed

+151
-8
lines changed

core/ast.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ struct LiteralNumber : public AST {
473473
/** Represents JSON strings. */
474474
struct LiteralString : public AST {
475475
String value;
476-
enum TokenKind { SINGLE, DOUBLE, BLOCK };
476+
enum TokenKind { SINGLE, DOUBLE, BLOCK, VERBATIM_SINGLE, VERBATIM_DOUBLE };
477477
TokenKind tokenKind;
478478
std::string blockIndent; // Only contains ' ' and '\t'.
479479
std::string blockTermIndent; // Only contains ' ' and '\t'.

core/desugarer.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,9 @@ class Desugarer {
658658
// Nothing to do.
659659

660660
} else if (auto *ast = dynamic_cast<LiteralString*>(ast_)) {
661-
if (ast->tokenKind != LiteralString::BLOCK) {
661+
if ((ast->tokenKind != LiteralString::BLOCK) &&
662+
(ast->tokenKind != LiteralString::VERBATIM_DOUBLE) &&
663+
(ast->tokenKind != LiteralString::VERBATIM_SINGLE)) {
662664
ast->value = jsonnet_string_unescape(ast->location, ast->value);
663665
}
664666
ast->tokenKind = LiteralString::DOUBLE;

core/formatter.cpp

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ class Unparser {
427427
if (bind.functionSugar) {
428428
unparseParams(bind.parenLeftFodder, bind.params, bind.trailingComma,
429429
bind.parenRightFodder);
430-
}
430+
}
431431
fill(bind.opFodder, true, true);
432432
o << "=";
433433
unparse(bind.body, true);
@@ -464,6 +464,30 @@ class Unparser {
464464
}
465465
}
466466
o << ast->blockTermIndent << "|||";
467+
} else if (ast->tokenKind == LiteralString::VERBATIM_DOUBLE) {
468+
o << "@\"";
469+
for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
470+
if (*cp == U'"') {
471+
o << "\"\"";
472+
} else {
473+
std::string utf8;
474+
encode_utf8(*cp, utf8);
475+
o << utf8;
476+
}
477+
}
478+
o << "\"";
479+
} else if (ast->tokenKind == LiteralString::VERBATIM_SINGLE) {
480+
o << "@'";
481+
for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
482+
if (*cp == U'\'') {
483+
o << "''";
484+
} else {
485+
std::string utf8;
486+
encode_utf8(*cp, utf8);
487+
o << utf8;
488+
}
489+
}
490+
o << "'";
467491
}
468492

469493
} else if (dynamic_cast<const LiteralNull*>(ast_)) {
@@ -617,6 +641,8 @@ class EnforceStringStyle : public FmtPass {
617641
void visit(LiteralString *lit)
618642
{
619643
if (lit->tokenKind == LiteralString::BLOCK) return;
644+
if (lit->tokenKind == LiteralString::VERBATIM_DOUBLE) return;
645+
if (lit->tokenKind == LiteralString::VERBATIM_SINGLE) return;
620646
String canonical = jsonnet_string_unescape(lit->location, lit->value);
621647
unsigned num_single = 0, num_double = 0;
622648
for (char32_t c : canonical) {
@@ -1302,7 +1328,7 @@ class FixIndentation {
13021328
Indent new_indent = strong_indent
13031329
? newIndentStrong(first_fodder, indent, new_column)
13041330
: newIndent(first_fodder, indent, new_column);
1305-
1331+
13061332
first = true;
13071333
for (auto &element : ast->elements) {
13081334
if (!first) column++;
@@ -1430,7 +1456,7 @@ class FixIndentation {
14301456
if (bind.functionSugar) {
14311457
params(bind.parenLeftFodder, bind.params, bind.trailingComma,
14321458
bind.parenRightFodder, new_indent);
1433-
}
1459+
}
14341460
fill(bind.opFodder, true, true, new_indent.lineUp);
14351461
column++; // '='
14361462
Indent new_indent2 = newIndent(open_fodder(bind.body), new_indent, column + 1);
@@ -1456,6 +1482,24 @@ class FixIndentation {
14561482
ast->blockTermIndent = std::string(indent.base, ' ');
14571483
column = indent.base; // blockTermIndent
14581484
column += 3; // "|||"
1485+
} else if (ast->tokenKind == LiteralString::VERBATIM_SINGLE) {
1486+
column += 2; // Include start and end quotes
1487+
for (const char32_t *cp = ast->value.c_str() ; ; ++cp) {
1488+
if (*cp == U'\'') {
1489+
column += 2;
1490+
} else {
1491+
column += 1;
1492+
}
1493+
}
1494+
} else if (ast->tokenKind == LiteralString::VERBATIM_DOUBLE) {
1495+
column += 2; // Include start and end quotes
1496+
for (const char32_t *cp = ast->value.c_str() ; ; ++cp) {
1497+
if (*cp == U'"') {
1498+
column += 2;
1499+
} else {
1500+
column += 1;
1501+
}
1502+
}
14591503
}
14601504

14611505
} else if (dynamic_cast<LiteralNull*>(ast_)) {

core/lexer.cpp

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const
9797
}
9898

9999

100-
/**
100+
/**
101101
# Consume all text until the end of the line, return number of newlines after that and indent
102102
*/
103103
static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent,
@@ -507,6 +507,43 @@ Tokens jsonnet_lex(const std::string &filename, const char *input)
507507
}
508508
break;
509509

510+
// Verbatim string literals.
511+
// ' and " quoting is interpreted here, unlike non-verbatim strings
512+
// where it is done later by jsonnet_string_unescape. This is OK
513+
// in this case because no information is lost by resolving the
514+
// repeated quote into a single quote, so we can go back to the
515+
// original form in the formatter.
516+
case '@': {
517+
c++;
518+
if (*c != '"' && *c != '\'') {
519+
std::stringstream ss;
520+
ss << "Couldn't lex verbatim string, junk after '@': " << *c;
521+
throw StaticError(filename, begin, ss.str());
522+
}
523+
const char quot = *c;
524+
c++; // Advance beyond the opening quote.
525+
for (; ; ++c) {
526+
if (*c == '\0') {
527+
throw StaticError(filename, begin, "Unterminated verbatim string");
528+
}
529+
if (*c == quot) {
530+
if (*(c+1) == quot) {
531+
c++;
532+
} else {
533+
break;
534+
}
535+
}
536+
data += *c;
537+
}
538+
c++; // Advance beyond the closing quote.
539+
if (quot == '"') {
540+
kind = Token::VERBATIM_STRING_DOUBLE;
541+
} else {
542+
kind = Token::VERBATIM_STRING_SINGLE;
543+
}
544+
}
545+
break;
546+
510547
// Keywords
511548
default:
512549
if (is_identifier_first(*c)) {
@@ -534,7 +571,7 @@ Tokens jsonnet_lex(const std::string &filename, const char *input)
534571
if (*c == '/' && *(c+1) == '*') {
535572

536573
unsigned margin = c - line_start;
537-
574+
538575
const char *initial_c = c;
539576
c += 2; // Avoid matching /*/: skip the /* before starting the search for */.
540577

core/lexer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ struct Token {
102102
STRING_DOUBLE,
103103
STRING_SINGLE,
104104
STRING_BLOCK,
105+
VERBATIM_STRING_SINGLE,
106+
VERBATIM_STRING_DOUBLE,
105107

106108
// Keywords
107109
ASSERT,
@@ -176,6 +178,8 @@ struct Token {
176178
case OPERATOR: return "OPERATOR";
177179
case STRING_SINGLE: return "STRING_SINGLE";
178180
case STRING_DOUBLE: return "STRING_DOUBLE";
181+
case VERBATIM_STRING_SINGLE: return "VERBATIM_STRING_SINGLE";
182+
case VERBATIM_STRING_DOUBLE: return "VERBATIM_STRING_DOUBLE";
179183
case STRING_BLOCK: return "STRING_BLOCK";
180184

181185
case ASSERT: return "assert";

core/lexer_test.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,38 @@ TEST(Lexer, TestSingleStrings)
140140
"'hi", {}, "single string 'hi:1:1: Unterminated string");
141141
}
142142

143+
TEST(Lexer, TestVerbatimDoubleStrings)
144+
{
145+
testLex("verbatim double string @\"hi\"",
146+
"@\"hi\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi")}, "");
147+
testLex("verbatim double string @\"hi nl\"",
148+
"@\"hi\n\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\n")}, "");
149+
testLex("verbatim double string @\"hi\\\"",
150+
"@\"hi\\\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\\")}, "");
151+
testLex("verbatim double string @\"hi\\\\\"",
152+
"@\"hi\\\\\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\\\\")}, "");
153+
testLex("verbatim double string @\"hi\"\"\"",
154+
"@\"hi\"\"\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\"")}, "");
155+
testLex("verbatim double string @\"\"\"hi\"",
156+
"@\"\"\"hi\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "\"hi")}, "");
157+
}
158+
159+
TEST(Lexer, TestVerbatimSingleStrings)
160+
{
161+
testLex("verbatim single string @'hi'",
162+
"@'hi'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi")}, "");
163+
testLex("verbatim single string @'hi nl'",
164+
"@'hi\n'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\n")}, "");
165+
testLex("verbatim single string @'hi\\'",
166+
"@'hi\\'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\\")}, "");
167+
testLex("verbatim single string @'hi\\\\'",
168+
"@'hi\\\\'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\\\\")}, "");
169+
testLex("verbatim single string @'hi'''",
170+
"@'hi'''", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi'")}, "");
171+
testLex("verbatim single string @'''hi'",
172+
"@'''hi'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "'hi")}, "");
173+
}
174+
143175
TEST(Lexer, TestBlockStringSpaces)
144176
{
145177
const char str[] =

core/parser.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,13 @@ class Parser {
634634
return alloc->make<LiteralString>(
635635
span(tok), tok.fodder, tok.data32(), LiteralString::BLOCK,
636636
tok.stringBlockIndent, tok.stringBlockTermIndent);
637+
case Token::VERBATIM_STRING_SINGLE:
638+
return alloc->make<LiteralString>(
639+
span(tok), tok.fodder, tok.data32(), LiteralString::VERBATIM_SINGLE, "", "");
640+
case Token::VERBATIM_STRING_DOUBLE:
641+
return alloc->make<LiteralString>(
642+
span(tok), tok.fodder, tok.data32(), LiteralString::VERBATIM_DOUBLE, "", "");
643+
637644

638645
case Token::FALSE:
639646
return alloc->make<LiteralBoolean>(span(tok), tok.fodder, false);

doc/language/spec.html

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,17 @@ <h2 id="lexing">Lexing</h2>
123123
<li><i>number</i>: As defined by <a href="http://json.org/">JSON</a> but without the leading
124124
minus.</li>
125125

126-
<li><i>string</i>: Which can have three quoting forms:
126+
<li><i>string</i>: Which can have five quoting forms:
127127

128128
<ul>
129129
<li>Double-quoted, beginning with <code>"</code> and ending with the first subsequent non-quoted
130130
<code>"</code> </li>
131131
<li>Single-quoted, beginning with <code>'</code> and ending with the first subsequent non-quoted
132132
<code>'</code> </li>
133+
<li>Double-quoted verbatim, beginning with <code>@"</code> and ending with the first subsequent
134+
non-quoted <code>"</code> </li>
135+
<li>Single-quoted verbatim, beginning with <code>@'</code> and ending with the first subsequent
136+
non-quoted <code>'</code> </li>
133137
<li>Text block, beginning with <code>|||</code>, followed by optional whitespace and a new-line.
134138
The next line must be prefixed with some non-zero length whitespace <i>W</i>. The block ends at the
135139
first subsequent line that does not begin with <i>W</i>, and it is an error if this line does not
@@ -143,6 +147,11 @@ <h2 id="lexing">Lexing</h2>
143147
characters: <code>"'\bfnrt0</code> which have their standard meanings, as well as
144148
<code>\uXXXX</code> for hexadecimal unicode escapes.</p>
145149

150+
<p>Verbatim strings eschew all of the normal string escaping, including hexadecimal unicode escapes.
151+
Every character in a verbatim string is processed literally, with the exception of doubled
152+
end-quotes. Within a verbatim single-quoted string, <code>''</code> is processed as <code>'</code>,
153+
and within a verbatim double-quoted string, <code>""</code> is processed as <code>"</code>.</p>
154+
146155
<p>In the rest of this specification, the string is assumed to be canonicalized into a sequence of
147156
unicode codepoints with no record of the original quoting form as well and any escape characters
148157
removed. </p>

test_suite/unicode.jsonnet

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,7 @@ local test_chinese = "肉"; // Meat.
3232
std.assertEqual(std.length(test_chinese), 1) &&
3333

3434
std.assertEqual("\u0100", "Ā") &&
35+
std.assertEqual(@"\u0100", "\\u0100") &&
36+
std.assertEqual(@"Ā", "Ā") &&
3537

3638
true

test_suite/unparse.jsonnet

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ limitations under the License.
2222
zero: 0,
2323
string: "'foo\n bar\n\n\"bar\u0005\"\'\t \u0050\b\f\r\\",
2424
string2: '"foo\n bar\n\n\'bar\u0005\"\'\t \u0050\b\f\r\\',
25+
string3: @'"foo\n bar\n\n''bar\u0005\"''\t \u0050\b\f\r\\',
26+
string4: @"'foo\n bar\n\n'bar\u0005""'\t \u0050\b\f\r\\",
2527
"lit_field1": 1,
2628
'lit_field2': 1,
2729
"false": false,

test_suite/unparse.jsonnet.fmt.golden

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ limitations under the License.
2222
zero: 0,
2323
string: "'foo\n bar\n\n\"bar\u0005\"\'\t \u0050\b\f\r\\",
2424
string2: '"foo\n bar\n\n\'bar\u0005\"\'\t \u0050\b\f\r\\',
25+
string3: @'"foo\n bar\n\n''bar\u0005\"''\t \u0050\b\f\r\\',
26+
string4: @"'foo\n bar\n\n'bar\u0005""'\t \u0050\b\f\r\\",
2527
lit_field1: 1,
2628
lit_field2: 1,
2729
"false": false,

test_suite/unparse.jsonnet.golden

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
"small_number": 1e-14,
1010
"string": "'foo\n bar\n\n\"bar\u0005\"'\t P\b\f\r\\",
1111
"string2": "\"foo\n bar\n\n'bar\u0005\"'\t P\b\f\r\\",
12+
"string3": "\"foo\\n bar\\n\\n'bar\\u0005\\\"'\\t \\u0050\\b\\f\\r\\\\",
13+
"string4": "'foo\\n bar\\n\\n'bar\\u0005\"'\\t \\u0050\\b\\f\\r\\\\",
1214
"true": true,
1315
"with\"quote": "\"",
1416
"zero": 0

0 commit comments

Comments
 (0)