Implement verbatim string literals

benley · benley · commit a36fe1047c7e · 2016-11-25T17:14:50.000-05:00
Fixes google#246
diff --git a/core/ast.h b/core/ast.h
@@ -473,7 +473,7 @@ struct LiteralNumber : public AST {
 /** Represents JSON strings. */
 struct LiteralString : public AST {
     String value;
-    enum TokenKind { SINGLE, DOUBLE, BLOCK };
+    enum TokenKind { SINGLE, DOUBLE, BLOCK, VERBATIM_SINGLE, VERBATIM_DOUBLE };
     TokenKind tokenKind;
     std::string blockIndent;  // Only contains ' ' and '\t'.
     std::string blockTermIndent;  // Only contains ' ' and '\t'.
diff --git a/core/desugarer.cpp b/core/desugarer.cpp
@@ -658,7 +658,9 @@ class Desugarer {
             // Nothing to do.
 
         } else if (auto *ast = dynamic_cast<LiteralString*>(ast_)) {
-            if (ast->tokenKind != LiteralString::BLOCK) {
+            if ((ast->tokenKind != LiteralString::BLOCK) &&
+                (ast->tokenKind != LiteralString::VERBATIM_DOUBLE) &&
+                (ast->tokenKind != LiteralString::VERBATIM_SINGLE)) {
                 ast->value = jsonnet_string_unescape(ast->location, ast->value);
             }
             ast->tokenKind = LiteralString::DOUBLE;
diff --git a/core/formatter.cpp b/core/formatter.cpp
@@ -464,6 +464,30 @@ class Unparser {
                     }
                 }
                 o << ast->blockTermIndent << "|||";
+            } else if (ast->tokenKind == LiteralString::VERBATIM_DOUBLE) {
+                o << "@\"";
+                for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
+                    if (*cp == U'"') {
+                        o << "\"\"";
+                    } else {
+                        std::string utf8;
+                        encode_utf8(*cp, utf8);
+                        o << utf8;
+                    }
+                }
+                o << "\"";
+            } else if (ast->tokenKind == LiteralString::VERBATIM_SINGLE) {
+                o << "@'";
+                for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
+                    if (*cp == U'\'') {
+                        o << "''";
+                    } else {
+                        std::string utf8;
+                        encode_utf8(*cp, utf8);
+                        o << utf8;
+                    }
+                }
+                o << "'";
             }
 
         } else if (dynamic_cast<const LiteralNull*>(ast_)) {
@@ -613,6 +637,8 @@ class EnforceStringStyle : public FmtPass {
     void visit(LiteralString *lit)
     {
         if (lit->tokenKind == LiteralString::BLOCK) return;
+        if (lit->tokenKind == LiteralString::VERBATIM_DOUBLE) return;
+        if (lit->tokenKind == LiteralString::VERBATIM_SINGLE) return;
         String canonical = jsonnet_string_unescape(lit->location, lit->value);
         unsigned num_single = 0, num_double = 0;
         for (char32_t c : canonical) {
diff --git a/core/lexer.cpp b/core/lexer.cpp
@@ -507,6 +507,38 @@ Tokens jsonnet_lex(const std::string &filename, const char *input)
             }
             break;
 
+            // Verbatim string literals.
+            case '@': {
+                c++;
+                if (*c != '"' && *c != '\'') {
+                    std::stringstream ss;
+                    ss << "Couldn't lex verbatim string, junk after '@': " << *c;
+                    throw StaticError(filename, begin, ss.str());
+                }
+                const char quot = *c;
+                c++;  // Advance beyond the opening quote.
+                for (; ; ++c) {
+                    if (*c == '\0')  {
+                        throw StaticError(filename, begin, "Unterminated verbatim string");
+                    }
+                    if (*c == quot) {
+                        if (*(c+1) == quot) {
+                           c++;
+                       } else {
+                           break;
+                       }
+                    }
+                    data += *c;
+                }
+                c++;  // Advance beyond the closing quote.
+                if (quot == '"') {
+                    kind = Token::VERBATIM_STRING_DOUBLE;
+                } else {
+                    kind = Token::VERBATIM_STRING_SINGLE;
+                }
+            }
+            break;
+
             // Keywords
             default:
             if (is_identifier_first(*c)) {
diff --git a/core/lexer.h b/core/lexer.h
@@ -102,6 +102,8 @@ struct Token {
         STRING_DOUBLE,
         STRING_SINGLE,
         STRING_BLOCK,
+        VERBATIM_STRING_SINGLE,
+        VERBATIM_STRING_DOUBLE,
 
         // Keywords
         ASSERT,
@@ -176,6 +178,8 @@ struct Token {
             case OPERATOR: return "OPERATOR";
             case STRING_SINGLE: return "STRING_SINGLE";
             case STRING_DOUBLE: return "STRING_DOUBLE";
+            case VERBATIM_STRING_SINGLE: return "VERBATIM_STRING_SINGLE";
+            case VERBATIM_STRING_DOUBLE: return "VERBATIM_STRING_DOUBLE";
             case STRING_BLOCK: return "STRING_BLOCK";
 
             case ASSERT: return "assert";
diff --git a/core/lexer_test.cpp b/core/lexer_test.cpp
@@ -140,6 +140,38 @@ TEST(Lexer, TestSingleStrings)
             "'hi", {}, "single string 'hi:1:1: Unterminated string");
 }
 
+TEST(Lexer, TestVerbatimDoubleStrings)
+{
+    testLex("verbatim double string @\"hi\"",
+            "@\"hi\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi")}, "");
+    testLex("verbatim double string @\"hi nl\"",
+            "@\"hi\n\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\n")}, "");
+    testLex("verbatim double string @\"hi\\\"",
+            "@\"hi\\\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\\")}, "");
+    testLex("verbatim double string @\"hi\\\\\"",
+            "@\"hi\\\\\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\\\\")}, "");
+    testLex("verbatim double string @\"hi\"\"\"",
+            "@\"hi\"\"\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\"")}, "");
+    testLex("verbatim double string @\"\"\"hi\"",
+            "@\"\"\"hi\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "\"hi")}, "");
+}
+
+TEST(Lexer, TestVerbatimSingleStrings)
+{
+    testLex("verbatim single string @'hi'",
+            "@'hi'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi")}, "");
+    testLex("verbatim single string @'hi nl'",
+            "@'hi\n'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\n")}, "");
+    testLex("verbatim single string @'hi\\'",
+            "@'hi\\'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\\")}, "");
+    testLex("verbatim single string @'hi\\\\'",
+            "@'hi\\\\'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\\\\")}, "");
+    testLex("verbatim single string @'hi'''",
+            "@'hi'''", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi'")}, "");
+    testLex("verbatim single string @'''hi'",
+            "@'''hi'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "'hi")}, "");
+}
+
 TEST(Lexer, TestBlockStringSpaces)
 {
     const char str[] =
diff --git a/core/parser.cpp b/core/parser.cpp
@@ -634,6 +634,13 @@ class Parser {
             return alloc->make<LiteralString>(
                 span(tok), tok.fodder, tok.data32(), LiteralString::BLOCK,
                 tok.stringBlockIndent, tok.stringBlockTermIndent);
+            case Token::VERBATIM_STRING_SINGLE:
+            return alloc->make<LiteralString>(
+                span(tok), tok.fodder, tok.data32(), LiteralString::VERBATIM_SINGLE, "", "");
+            case Token::VERBATIM_STRING_DOUBLE:
+            return alloc->make<LiteralString>(
+                span(tok), tok.fodder, tok.data32(), LiteralString::VERBATIM_DOUBLE, "", "");
+
 
             case Token::FALSE:
             return alloc->make<LiteralBoolean>(span(tok), tok.fodder, false);
diff --git a/doc/language/spec.html b/doc/language/spec.html
@@ -130,6 +130,10 @@ <h2 id="lexing">Lexing</h2>
 <code>"</code> </li>
 <li>Single-quoted, beginning with <code>'</code> and ending with the first subsequent non-quoted
 <code>'</code> </li>
+<li>Double-quoted verbatim, beginning with <code>@"</code> and ending with the first subsequent
+non-doubled <code>"</code> </li>
+<li>Single-quoted verbatim, beginning with <code>@'</code> and ending with the first subsequent
+non-doubled <code>'</code> </li>
 <li>Text block, beginning with <code>|||</code>, followed by optional whitespace and a new-line.
 The next line must be prefixed with some non-zero length whitespace <i>W</i>.  The block ends at the
 first subsequent line that does not begin with <i>W</i>, and it is an error if this line does not
@@ -143,6 +147,11 @@ <h2 id="lexing">Lexing</h2>
 characters: <code>"'\bfnrt0</code> which have their standard meanings, as well as
 <code>\uXXXX</code> for hexadecimal unicode escapes.</p>
 
+<p>Verbatim strings eschew all of the normal string escaping, including hexadecimal unicode escapes.
+Every character in a verbatim string is processed literally, with the exception of doubled
+end-quotes.  Within a verbatim single-quoted string, <code>''</code> is processed as <code>'</code>,
+and within a verbatim double-quoted string, <code>""</code> is processed as <code>"</code>.</p>
+
 <p>In the rest of this specification, the string is assumed to be canonicalized into a sequence of
 unicode codepoints with no record of the original quoting form as well and any escape characters
 removed. </p>
diff --git a/test_suite/unparse.jsonnet b/test_suite/unparse.jsonnet
@@ -22,6 +22,8 @@ limitations under the License.
     zero: 0,
     string: "'foo\n bar\n\n\"bar\u0005\"\'\t \u0050\b\f\r\\",
     string2: '"foo\n bar\n\n\'bar\u0005\"\'\t \u0050\b\f\r\\',
+    string3: @'"foo\n bar\n\n''bar\u0005\"''\t \u0050\b\f\r\\',
+    string4: @"'foo\n bar\n\n'bar\u0005""'\t \u0050\b\f\r\\",
     "lit_field1": 1,
     'lit_field2': 1,
     "false": false,
diff --git a/test_suite/unparse.jsonnet.fmt.golden b/test_suite/unparse.jsonnet.fmt.golden
@@ -22,6 +22,8 @@ limitations under the License.
     zero: 0,
     string: "'foo\n bar\n\n\"bar\u0005\"\'\t \u0050\b\f\r\\",
     string2: '"foo\n bar\n\n\'bar\u0005\"\'\t \u0050\b\f\r\\',
+    string3: @'"foo\n bar\n\n''bar\u0005\"''\t \u0050\b\f\r\\',
+    string4: @"'foo\n bar\n\n'bar\u0005""'\t \u0050\b\f\r\\",
     lit_field1: 1,
     lit_field2: 1,
     "false": false,
diff --git a/test_suite/unparse.jsonnet.golden b/test_suite/unparse.jsonnet.golden
@@ -9,6 +9,8 @@
    "small_number": 1e-14,
    "string": "'foo\n bar\n\n\"bar\u0005\"'\t P\b\f\r\\",
    "string2": "\"foo\n bar\n\n'bar\u0005\"'\t P\b\f\r\\",
+   "string3": "\"foo\\n bar\\n\\n'bar\\u0005\\\"'\\t \\u0050\\b\\f\\r\\\\",
+   "string4": "'foo\\n bar\\n\\n'bar\\u0005\"'\\t \\u0050\\b\\f\\r\\\\",
    "true": true,
    "with\"quote": "\"",
    "zero": 0