Merge pull request #265 from benley/verbatim-strings

sparkprime · web-flow · commit 516f68331fa5 · 2016-12-19T20:42:02.000-05:00
Implement verbatim string literals
diff --git a/core/ast.h b/core/ast.h
@@ -473,7 +473,7 @@ struct LiteralNumber : public AST {
 /** Represents JSON strings. */
 struct LiteralString : public AST {
     String value;
-    enum TokenKind { SINGLE, DOUBLE, BLOCK };
+    enum TokenKind { SINGLE, DOUBLE, BLOCK, VERBATIM_SINGLE, VERBATIM_DOUBLE };
     TokenKind tokenKind;
     std::string blockIndent;  // Only contains ' ' and '\t'.
     std::string blockTermIndent;  // Only contains ' ' and '\t'.
diff --git a/core/desugarer.cpp b/core/desugarer.cpp
@@ -658,7 +658,9 @@ class Desugarer {
             // Nothing to do.
 
         } else if (auto *ast = dynamic_cast<LiteralString*>(ast_)) {
-            if (ast->tokenKind != LiteralString::BLOCK) {
+            if ((ast->tokenKind != LiteralString::BLOCK) &&
+                (ast->tokenKind != LiteralString::VERBATIM_DOUBLE) &&
+                (ast->tokenKind != LiteralString::VERBATIM_SINGLE)) {
                 ast->value = jsonnet_string_unescape(ast->location, ast->value);
             }
             ast->tokenKind = LiteralString::DOUBLE;
diff --git a/core/formatter.cpp b/core/formatter.cpp
@@ -427,7 +427,7 @@ class Unparser {
                 if (bind.functionSugar) {
                     unparseParams(bind.parenLeftFodder, bind.params, bind.trailingComma,
                                   bind.parenRightFodder);
-                } 
+                }
                 fill(bind.opFodder, true, true);
                 o << "=";
                 unparse(bind.body, true);
@@ -464,6 +464,30 @@ class Unparser {
                     }
                 }
                 o << ast->blockTermIndent << "|||";
+            } else if (ast->tokenKind == LiteralString::VERBATIM_DOUBLE) {
+                o << "@\"";
+                for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
+                    if (*cp == U'"') {
+                        o << "\"\"";
+                    } else {
+                        std::string utf8;
+                        encode_utf8(*cp, utf8);
+                        o << utf8;
+                    }
+                }
+                o << "\"";
+            } else if (ast->tokenKind == LiteralString::VERBATIM_SINGLE) {
+                o << "@'";
+                for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
+                    if (*cp == U'\'') {
+                        o << "''";
+                    } else {
+                        std::string utf8;
+                        encode_utf8(*cp, utf8);
+                        o << utf8;
+                    }
+                }
+                o << "'";
             }
 
         } else if (dynamic_cast<const LiteralNull*>(ast_)) {
@@ -617,6 +641,8 @@ class EnforceStringStyle : public FmtPass {
     void visit(LiteralString *lit)
     {
         if (lit->tokenKind == LiteralString::BLOCK) return;
+        if (lit->tokenKind == LiteralString::VERBATIM_DOUBLE) return;
+        if (lit->tokenKind == LiteralString::VERBATIM_SINGLE) return;
         String canonical = jsonnet_string_unescape(lit->location, lit->value);
         unsigned num_single = 0, num_double = 0;
         for (char32_t c : canonical) {
@@ -1302,7 +1328,7 @@ class FixIndentation {
             Indent new_indent = strong_indent
                                 ? newIndentStrong(first_fodder, indent, new_column)
                                 : newIndent(first_fodder, indent, new_column);
-                
+
             first = true;
             for (auto &element : ast->elements) {
                 if (!first) column++;
@@ -1430,7 +1456,7 @@ class FixIndentation {
                 if (bind.functionSugar) {
                     params(bind.parenLeftFodder, bind.params, bind.trailingComma,
                            bind.parenRightFodder, new_indent);
-                } 
+                }
                 fill(bind.opFodder, true, true, new_indent.lineUp);
                 column++;  // '='
                 Indent new_indent2 = newIndent(open_fodder(bind.body), new_indent, column + 1);
@@ -1456,6 +1482,24 @@ class FixIndentation {
                 ast->blockTermIndent = std::string(indent.base, ' ');
                 column = indent.base;  // blockTermIndent
                 column += 3;  // "|||"
+            } else if (ast->tokenKind == LiteralString::VERBATIM_SINGLE) {
+                column += 3;  // '@' plus the opening and closing quotes, as the Unparser emits.
+                for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
+                    if (*cp == U'\'') {
+                        column += 2;  // Unparsed as a doubled quote: ''
+                    } else {
+                        column += 1;
+                    }
+                }
+            } else if (ast->tokenKind == LiteralString::VERBATIM_DOUBLE) {
+                column += 3;  // '@' plus the opening and closing quotes, as the Unparser emits.
+                for (const char32_t *cp = ast->value.c_str() ; *cp != U'\0' ; ++cp) {
+                    if (*cp == U'"') {
+                        column += 2;  // Unparsed as a doubled quote: ""
+                    } else {
+                        column += 1;
+                    }
+                }
             }
 
         } else if (dynamic_cast<LiteralNull*>(ast_)) {
diff --git a/core/lexer.cpp b/core/lexer.cpp
@@ -97,7 +97,7 @@ static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const
 }
 
 
-/** 
+/**
 # Consume all text until the end of the line, return number of newlines after that and indent
 */
 static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent,
@@ -507,6 +507,43 @@ Tokens jsonnet_lex(const std::string &filename, const char *input)
             }
             break;
 
+            // Verbatim string literals.
+            // ' and " quoting is interpreted here, unlike non-verbatim strings
+            // where it is done later by jsonnet_string_unescape.  This is OK
+            // in this case because no information is lost by resolving the
+            // repeated quote into a single quote, so we can go back to the
+            // original form in the formatter.
+            case '@': {
+                c++;
+                if (*c != '"' && *c != '\'') {
+                    std::stringstream ss;
+                    ss << "Couldn't lex verbatim string, junk after '@': " << *c;
+                    throw StaticError(filename, begin, ss.str());
+                }
+                const char quot = *c;
+                c++;  // Advance beyond the opening quote.
+                for (; ; ++c) {
+                    if (*c == '\0')  {
+                        throw StaticError(filename, begin, "Unterminated verbatim string");
+                    }
+                    if (*c == quot) {
+                        if (*(c+1) == quot) {
+                           c++;
+                       } else {
+                           break;
+                       }
+                    }
+                    data += *c;
+                }
+                c++;  // Advance beyond the closing quote.
+                if (quot == '"') {
+                    kind = Token::VERBATIM_STRING_DOUBLE;
+                } else {
+                    kind = Token::VERBATIM_STRING_SINGLE;
+                }
+            }
+            break;
+
             // Keywords
             default:
             if (is_identifier_first(*c)) {
@@ -534,7 +571,7 @@ Tokens jsonnet_lex(const std::string &filename, const char *input)
                 if (*c == '/' && *(c+1) == '*') {
 
                     unsigned margin = c - line_start;
- 
+
                     const char *initial_c = c;
                     c += 2;  // Avoid matching /*/: skip the /* before starting the search for */.
 
diff --git a/core/lexer.h b/core/lexer.h
@@ -102,6 +102,8 @@ struct Token {
         STRING_DOUBLE,
         STRING_SINGLE,
         STRING_BLOCK,
+        VERBATIM_STRING_SINGLE,
+        VERBATIM_STRING_DOUBLE,
 
         // Keywords
         ASSERT,
@@ -176,6 +178,8 @@ struct Token {
             case OPERATOR: return "OPERATOR";
             case STRING_SINGLE: return "STRING_SINGLE";
             case STRING_DOUBLE: return "STRING_DOUBLE";
+            case VERBATIM_STRING_SINGLE: return "VERBATIM_STRING_SINGLE";
+            case VERBATIM_STRING_DOUBLE: return "VERBATIM_STRING_DOUBLE";
             case STRING_BLOCK: return "STRING_BLOCK";
 
             case ASSERT: return "assert";
diff --git a/core/lexer_test.cpp b/core/lexer_test.cpp
@@ -140,6 +140,38 @@ TEST(Lexer, TestSingleStrings)
             "'hi", {}, "single string 'hi:1:1: Unterminated string");
 }
 
+TEST(Lexer, TestVerbatimDoubleStrings)
+{
+    testLex("verbatim double string @\"hi\"",
+            "@\"hi\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi")}, "");
+    testLex("verbatim double string @\"hi nl\"",
+            "@\"hi\n\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\n")}, "");
+    testLex("verbatim double string @\"hi\\\"",
+            "@\"hi\\\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\\")}, "");
+    testLex("verbatim double string @\"hi\\\\\"",
+            "@\"hi\\\\\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\\\\")}, "");
+    testLex("verbatim double string @\"hi\"\"\"",
+            "@\"hi\"\"\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "hi\"")}, "");
+    testLex("verbatim double string @\"\"\"hi\"",
+            "@\"\"\"hi\"", {Token(Token::Kind::VERBATIM_STRING_DOUBLE, "\"hi")}, "");
+}
+
+TEST(Lexer, TestVerbatimSingleStrings)
+{
+    testLex("verbatim single string @'hi'",
+            "@'hi'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi")}, "");
+    testLex("verbatim single string @'hi nl'",
+            "@'hi\n'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\n")}, "");
+    testLex("verbatim single string @'hi\\'",
+            "@'hi\\'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\\")}, "");
+    testLex("verbatim single string @'hi\\\\'",
+            "@'hi\\\\'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi\\\\")}, "");
+    testLex("verbatim single string @'hi'''",
+            "@'hi'''", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "hi'")}, "");
+    testLex("verbatim single string @'''hi'",
+            "@'''hi'", {Token(Token::Kind::VERBATIM_STRING_SINGLE, "'hi")}, "");
+}
+
 TEST(Lexer, TestBlockStringSpaces)
 {
     const char str[] =
diff --git a/core/parser.cpp b/core/parser.cpp
@@ -634,6 +634,13 @@ class Parser {
             return alloc->make<LiteralString>(
                 span(tok), tok.fodder, tok.data32(), LiteralString::BLOCK,
                 tok.stringBlockIndent, tok.stringBlockTermIndent);
+            case Token::VERBATIM_STRING_SINGLE:
+            return alloc->make<LiteralString>(
+                span(tok), tok.fodder, tok.data32(), LiteralString::VERBATIM_SINGLE, "", "");
+            case Token::VERBATIM_STRING_DOUBLE:
+            return alloc->make<LiteralString>(
+                span(tok), tok.fodder, tok.data32(), LiteralString::VERBATIM_DOUBLE, "", "");
+
 
             case Token::FALSE:
             return alloc->make<LiteralBoolean>(span(tok), tok.fodder, false);
diff --git a/doc/language/spec.html b/doc/language/spec.html
@@ -123,13 +123,17 @@ <h2 id="lexing">Lexing</h2>
 <li><i>number</i>: As defined by <a href="http://json.org/">JSON</a> but without the leading
 minus.</li>
 
-<li><i>string</i>: Which can have three quoting forms:
+<li><i>string</i>: Which can have five quoting forms:
 
 <ul>
 <li>Double-quoted, beginning with <code>"</code> and ending with the first subsequent non-quoted
 <code>"</code> </li>
 <li>Single-quoted, beginning with <code>'</code> and ending with the first subsequent non-quoted
 <code>'</code> </li>
+<li>Double-quoted verbatim, beginning with <code>@"</code> and ending with the first subsequent
+non-quoted <code>"</code> </li>
+<li>Single-quoted verbatim, beginning with <code>@'</code> and ending with the first subsequent
+non-quoted <code>'</code> </li>
 <li>Text block, beginning with <code>|||</code>, followed by optional whitespace and a new-line.
 The next line must be prefixed with some non-zero length whitespace <i>W</i>.  The block ends at the
 first subsequent line that does not begin with <i>W</i>, and it is an error if this line does not
@@ -143,6 +147,11 @@ <h2 id="lexing">Lexing</h2>
 characters: <code>"'\bfnrt0</code> which have their standard meanings, as well as
 <code>\uXXXX</code> for hexadecimal unicode escapes.</p>
 
+<p>Verbatim strings eschew all of the normal string escaping, including hexadecimal unicode escapes.
+Every character in a verbatim string is processed literally, with the exception of doubled
+end-quotes.  Within a verbatim single-quoted string, <code>''</code> is processed as <code>'</code>,
+and within a verbatim double-quoted string, <code>""</code> is processed as <code>"</code>.</p>
+
 <p>In the rest of this specification, the string is assumed to be canonicalized into a sequence of
 unicode codepoints with no record of the original quoting form as well and any escape characters
 removed. </p>
diff --git a/test_suite/unicode.jsonnet b/test_suite/unicode.jsonnet
@@ -32,5 +32,7 @@ local test_chinese = "肉";  // Meat.
 std.assertEqual(std.length(test_chinese), 1) &&
 
 std.assertEqual("\u0100", "Ā") &&
+std.assertEqual(@"\u0100", "\\u0100") &&
+std.assertEqual(@"Ā", "Ā") &&
 
 true
diff --git a/test_suite/unparse.jsonnet b/test_suite/unparse.jsonnet
@@ -22,6 +22,8 @@ limitations under the License.
     zero: 0,
     string: "'foo\n bar\n\n\"bar\u0005\"\'\t \u0050\b\f\r\\",
     string2: '"foo\n bar\n\n\'bar\u0005\"\'\t \u0050\b\f\r\\',
+    string3: @'"foo\n bar\n\n''bar\u0005\"''\t \u0050\b\f\r\\',
+    string4: @"'foo\n bar\n\n'bar\u0005""'\t \u0050\b\f\r\\",
     "lit_field1": 1,
     'lit_field2': 1,
     "false": false,
diff --git a/test_suite/unparse.jsonnet.fmt.golden b/test_suite/unparse.jsonnet.fmt.golden
@@ -22,6 +22,8 @@ limitations under the License.
     zero: 0,
     string: "'foo\n bar\n\n\"bar\u0005\"\'\t \u0050\b\f\r\\",
     string2: '"foo\n bar\n\n\'bar\u0005\"\'\t \u0050\b\f\r\\',
+    string3: @'"foo\n bar\n\n''bar\u0005\"''\t \u0050\b\f\r\\',
+    string4: @"'foo\n bar\n\n'bar\u0005""'\t \u0050\b\f\r\\",
     lit_field1: 1,
     lit_field2: 1,
     "false": false,
diff --git a/test_suite/unparse.jsonnet.golden b/test_suite/unparse.jsonnet.golden
@@ -9,6 +9,8 @@
    "small_number": 1e-14,
    "string": "'foo\n bar\n\n\"bar\u0005\"'\t P\b\f\r\\",
    "string2": "\"foo\n bar\n\n'bar\u0005\"'\t P\b\f\r\\",
+   "string3": "\"foo\\n bar\\n\\n'bar\\u0005\\\"'\\t \\u0050\\b\\f\\r\\\\",
+   "string4": "'foo\\n bar\\n\\n'bar\\u0005\"'\\t \\u0050\\b\\f\\r\\\\",
    "true": true,
    "with\"quote": "\"",
    "zero": 0