SI-12290: support JDK15 text blocks in Java parser

harpocrates · harpocrates · commit 13d9f67099e3 · 2021-03-26T05:46:24.000-07:00
JDK15 introduced text blocks (JEP 378) for writing multiline strings. This adds support for parsing these strings in the Java parser. The logic for interpreting the literals is a little complicated, but follows section "3.10.6. Text Blocks" of the Java Language Specification. The test cases include examples from there and from the JEP. Fixes scala/bug#12290
diff --git a/src/compiler/scala/tools/nsc/javac/JavaScanners.scala b/src/compiler/scala/tools/nsc/javac/JavaScanners.scala
@@ -239,6 +239,9 @@ trait JavaScanners extends ast.parser.ScannersCommon {
     */
     protected def putChar(c: Char): Unit = { cbuf.append(c) }
 
+    /** Drop the final `n` characters of the literal buffer; does nothing unless `n` is positive */
+    private def popNChars(n: Int): Unit = { if (n >= 1) cbuf.setLength(cbuf.length - n) }
+
     /** Clear buffer and set name */
     private def setName(): Unit = {
       name = newTermName(cbuf.toString())
@@ -322,15 +325,26 @@ trait JavaScanners extends ast.parser.ScannersCommon {
 
               case '\"' =>
                 in.next()
-                while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
-                  getlitch()
-                }
-                if (in.ch == '\"') {
-                  token = STRINGLIT
-                  setName()
-                  in.next()
+                if (in.ch != '\"') { // "..." non-empty string literal
+                  while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
+                    getlitch()
+                  }
+                  if (in.ch == '\"') {
+                    token = STRINGLIT
+                    setName()
+                    in.next()
+                  } else {
+                    syntaxError("unclosed string literal")
+                  }
                 } else {
-                  syntaxError("unclosed string literal")
+                  in.next()
+                  if (in.ch != '\"') { // "" empty string literal
+                    token = STRINGLIT
+                    setName()
+                  } else {
+                    in.next()
+                    getTextBlock()
+                  }
                 }
                 return
 
@@ -691,6 +705,8 @@ trait JavaScanners extends ast.parser.ScannersCommon {
             case '\"' => putChar('\"')
             case '\'' => putChar('\'')
             case '\\' => putChar('\\')
+            case 's'  => putChar(' ')  // specific to text blocks
+            case CR | LF =>            // specific to text blocks
             case _    =>
               syntaxError(in.cpos - 1, "invalid escape character")
               putChar(in.ch)
@@ -702,6 +718,120 @@ trait JavaScanners extends ast.parser.ScannersCommon {
         in.next()
       }
 
+    /** Read a triple-quote delimited text block, starting after the three opening
+      * double quotes. On success `token` is STRINGLIT and `name` holds the content with
+      * the common indentation and trailing white space of each line dropped, line
+      * terminators normalized to `\n` and escapes interpreted by `getlitch`. A missing
+      * opening line terminator or closing delimiter is reported as a syntax error.
+      */
+    private def getTextBlock(): Unit = {
+      // Is the current character a `\` directly followed by a line terminator?
+      def atEscapedNewline: Boolean = in.ch == '\\' && {
+        val lookahead = in.copy
+        lookahead.next()
+        lookahead.ch == CR || lookahead.ch == LF
+      }
+
+      // Open delimiter is followed by optional space, then a newline
+      while (in.ch == ' ' || in.ch == '\t' || in.ch == FF) in.next()
+      if (in.ch != LF && in.ch != CR) { // CR-LF is already normalized into LF by `JavaCharArrayReader`
+        syntaxError("illegal text block open delimiter sequence, missing line terminator")
+        return
+      }
+      in.next()
+
+      /* Do a lookahead scan over the full text block to:
+       *   - compute common white space prefix
+       *   - find the offset where the text block ends
+       */
+      var commonWhiteSpacePrefix = Int.MaxValue
+      var blockEndOffset = 0
+      val backtrackTo = in.copy
+      var blockClosed = false
+      var lineWhiteSpacePrefix = 0
+      var lineIsOnlyWhitespace = true
+      while (!blockClosed && (in.isUnicode || in.ch != SU)) {
+        if (in.ch == '\"') { // Potential end of the block
+          in.next()
+          if (in.ch == '\"') {
+            in.next()
+            if (in.ch == '\"') {
+              blockClosed = true
+              commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
+              blockEndOffset = in.cpos - 2
+            }
+          }
+
+          // Not the end of the block - just a single or double " character
+          if (!blockClosed) lineIsOnlyWhitespace = false
+        } else if (in.ch == CR || in.ch == LF) { // new line in the block
+          in.next()
+          if (!lineIsOnlyWhitespace) commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
+          lineWhiteSpacePrefix = 0
+          lineIsOnlyWhitespace = true
+        } else if (lineIsOnlyWhitespace &&
+                   (in.ch == ' ' || in.ch == '\t' || in.ch == FF)) { // extend white space prefix
+          in.next()
+          lineWhiteSpacePrefix += 1
+        } else if (atEscapedNewline) {
+          // The second pass strips the common prefix from the line after `\<newline>` too,
+          // so that line must be measured here: skip only the backslash and let the
+          // newline branch above see the terminator instead of `getlitch` swallowing it.
+          lineIsOnlyWhitespace = false
+          in.next()
+        } else {
+          lineIsOnlyWhitespace = false
+          getlitch()
+        }
+      }
+      setName() // clear the literal buffer
+
+      // Bail out if the block never did have an end
+      if (!blockClosed) {
+        syntaxError("unclosed text block")
+        return
+      }
+
+      // Second pass: construct the literal string value this time
+      in = backtrackTo
+      while (in.cpos < blockEndOffset) {
+        // Drop the line's leading whitespace
+        var remainingPrefix = commonWhiteSpacePrefix
+        while (remainingPrefix > 0 && in.ch != CR && in.ch != LF && in.cpos < blockEndOffset) {
+          in.next()
+          remainingPrefix -= 1
+        }
+
+        var trailingWhitespaceLength = 0
+        var escapedNewline = false         // Does the line end with `\`?
+        while (in.ch != CR && in.ch != LF && in.cpos < blockEndOffset && !escapedNewline) {
+          // Length of the white space run ending at the current character
+          trailingWhitespaceLength = if (isWhitespace(in.ch)) trailingWhitespaceLength + 1 else 0
+          // Detect if the line is about to end with `\`
+          escapedNewline = atEscapedNewline
+
+          getlitch()
+        }
+
+        // Drop the line's trailing whitespace
+        popNChars(trailingWhitespaceLength)
+
+        // Normalize line terminators
+        if ((in.ch == CR || in.ch == LF) && !escapedNewline) {
+          in.next()
+          putChar('\n')
+        }
+      }
+
+      token = STRINGLIT
+      setName()
+
+      // Trailing """
+      in.next()
+      in.next()
+      in.next()
+    }
+
     /** read fractional part and exponent of floating point number
      *  if one is present.
      */
diff --git a/test/files/run/t12290.check b/test/files/run/t12290.check
@@ -0,0 +1,55 @@
+====
+A text
+
+====
+<html>
+    <body>
+        <p>Hello, world</p>
+    </body>
+</html>
+
+====
+SELECT "EMP_ID", "LAST_NAME" FROM "EMPLOYEE_TB"
+WHERE "CITY" = 'INDIANAPOLIS'
+ORDER BY "EMP_ID", "LAST_NAME";
+
+====
+<html>
+    <body>
+        <p>Hello, world</p>
+    </body>
+</html>
+
+====
+                            <html>
+                                <body>
+                                    <p>Hello, world</p>
+                                </body>
+                            </html>
+
+====
+<html>
+    <body>
+        <p>Hello, world</p>
+    </body>
+
+</html>
+
+====
+<html>
+
+    <body>        <p>Hello ,	world</p>
+    </body>
+</html>
+
+====
+String text = """
+    A text block inside a text block
+""";
+
+====
+foo	bar
+baz
+====
+
+====
diff --git a/test/files/run/t12290/Test.scala b/test/files/run/t12290/Test.scala
@@ -0,0 +1,27 @@
+/* Every Java literal is read back through `valueOf` on its singleton type: the
+ * value printed is the one the Scala compiler took from parsing the Java source,
+ * so the output shows whether the string literals were parsed properly.
+ */
+object Test extends App {
+  // Parsed constants, in the order in which they are printed
+  private val parsedLiterals = List[String](
+    valueOf[TextBlocks.aText.type],
+    valueOf[TextBlocks.html1.type],
+    valueOf[TextBlocks.query.type],
+    valueOf[TextBlocks.html2.type],
+    valueOf[TextBlocks.html3.type],
+    valueOf[TextBlocks.html4.type],
+    valueOf[TextBlocks.html5.type],
+    valueOf[TextBlocks.code.type],
+    valueOf[TextBlocks.simpleString.type],
+    valueOf[TextBlocks.emptyString.type]
+  )
+  private val separator = "===="
+
+  // A separator line precedes every literal and one more ends the output
+  parsedLiterals.foreach { literal =>
+    println(separator)
+    println(literal)
+  }
+  println(separator)
+}
diff --git a/test/files/run/t12290/TextBlocks.java b/test/files/run/t12290/TextBlocks.java
@@ -0,0 +1,68 @@
+class TextBlocks { // string and text block literals whose parsed values Test.scala prints
+
+    final static String aText = """
+      A text
+      """;
+
+    final static String html1 = """
+                                <html>
+                                    <body>
+                                        <p>Hello, world</p>
+                                    </body>
+                                </html>
+                                """;
+
+    // double quotes may appear in a text block without being escaped
+    final static String query = """
+                                SELECT "EMP_ID", "LAST_NAME" FROM "EMPLOYEE_TB"
+                                WHERE "CITY" = 'INDIANAPOLIS'
+                                ORDER BY "EMP_ID", "LAST_NAME";
+                                """;
+
+    // incidental trailing spaces at the end of content lines are stripped
+    final static String html2 = """
+                                <html>   
+                                    <body>
+                                        <p>Hello, world</p>    
+                                    </body> 
+                                </html>   
+                                """;
+
+    // a closing delimiter indented less than the content reduces the stripped indentation
+    final static String html3 = """
+                                <html>
+                                    <body>
+                                        <p>Hello, world</p>
+                                    </body>
+                                </html>
+    """;
+
+    // a blank line does not affect the amount of indentation that is stripped
+    final static String html4 = """
+                                <html>
+                                    <body>
+                                        <p>Hello, world</p>
+                                    </body>
+
+                                </html>
+                                    """;
+
+    // escape sequences, including \s and an escaped line terminator
+    final static String html5 = """
+                                <html>\n
+                                    <body>\
+                                        <p>Hello\s,\tworld</p>
+                                    </body>
+                                </html>
+                                """;
+    final static String code = // an escaped quote lets the content contain a triple quote
+        """
+        String text = \"""
+            A text block inside a text block
+        \""";
+        """;
+
+    final static String simpleString = "foo\tbar\nbaz"; // ordinary string literal with escapes
+
+    final static String emptyString = ""; // two quotes only: an empty string, not a text block opener
+}