Skip to content

Improve support for Unicode supplementary characters in identifiers and string interpolation (as in Scala 2) #16278

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions compiler/src/dotty/tools/dotc/core/NameOps.scala
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,25 @@ object NameOps {
def isVarPattern: Boolean =
testSimple { n =>
n.length > 0 && {
// Does `n` start with a high/low surrogate pair whose combined code point
// is both a letter and lower case?
def isLowerLetterSupplementary: Boolean =
  import Character.{isHighSurrogate, isLowSurrogate, isLetter, isLowerCase, isValidCodePoint, toCodePoint}
  val startsWithPair = isHighSurrogate(n(0)) && n.length > 1 && isLowSurrogate(n(1))
  if startsWithPair then
    val cp = toCodePoint(n(0), n(1))
    isValidCodePoint(cp) && isLetter(cp) && isLowerCase(cp)
  else false
val first = n.head
(((first.isLower && first.isLetter) || first == '_')
&& (n != false_)
&& (n != true_)
&& (n != null_))
((first.isLower && first.isLetter || first == '_' || isLowerLetterSupplementary)
&& n != false_
&& n != true_
&& n != null_)
}
} || name.is(PatMatGivenVarName)

def isOpAssignmentName: Boolean = name match {
case raw.NE | raw.LE | raw.GE | EMPTY =>
false
case name: SimpleName =>
name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.head)
name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.firstCodePoint)
case _ =>
false
}
Expand Down Expand Up @@ -352,6 +358,14 @@ object NameOps {
val unmangled = kinds.foldLeft(name)(_.unmangle(_))
if (unmangled eq name) name else unmangled.unmangle(kinds)
}

/** The first code point of this name's first part.
 *
 *  If the first part starts with a high surrogate immediately followed by a
 *  low surrogate, the pair is combined into a single code point; otherwise
 *  the first char is returned as is.
 */
def firstCodePoint: Int =
  import Character.{isHighSurrogate, isLowSurrogate, isValidCodePoint, toCodePoint}
  val part = name.firstPart
  val lead = part(0)
  val hasPair = isHighSurrogate(lead) && part.length > 1 && isLowSurrogate(part(1))
  if !hasPair then lead.toInt
  else
    val combined = toCodePoint(lead, part(1))
    if isValidCodePoint(combined) then combined else lead.toInt
}

extension (name: TermName) {
Expand Down
2 changes: 1 addition & 1 deletion compiler/src/dotty/tools/dotc/core/Names.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ object Names {
*/
abstract class Designator

/** A name if either a term name or a type name. Term names can be simple
/** A name is either a term name or a type name. Term names can be simple
* or derived. A simple term name is essentially an interned string stored
* in a name table. A derived term name adds a tag, and possibly a number
* or a further simple name to some other name.
Expand Down
61 changes: 26 additions & 35 deletions compiler/src/dotty/tools/dotc/parsing/Scanners.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import config.Feature.{migrateTo3, fewerBracesEnabled}
import config.SourceVersion.`3.0`
import reporting.{NoProfile, Profile, Message}

import java.util.Objects

object Scanners {

/** Offset into source character array */
Expand Down Expand Up @@ -777,19 +779,21 @@ object Scanners {
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
isHighSurrogate(high) && {
var res = false
nextChar()
val low = ch
val low = lookaheadChar()
if isLowSurrogate(low) then
nextChar()
val codepoint = toCodePoint(high, low)
if isValidCodePoint(codepoint) && test(codepoint) then
putChar(high)
putChar(low)
res = true
if isValidCodePoint(codepoint) then
if test(codepoint) then
putChar(high)
putChar(low)
nextChar()
nextChar()
res = true
else
error(em"illegal character '${toUnicode(high)}${toUnicode(low)}'")
else if !strict then
putChar(high)
nextChar()
res = true
else
error(em"illegal character '${toUnicode(high)}' missing low surrogate")
Expand Down Expand Up @@ -889,7 +893,6 @@ object Scanners {
if (ch == '\"') {
if (lookaheadChar() == '\"') {
nextRawChar()
//offset += 3 // first part is positioned at the quote
nextRawChar()
stringPart(multiLine = true)
}
Expand All @@ -900,7 +903,6 @@ object Scanners {
}
}
else {
//offset += 1 // first part is positioned at the quote
stringPart(multiLine = false)
}
}
Expand Down Expand Up @@ -977,30 +979,29 @@ object Scanners {
}
case _ =>
def fetchOther() =
if (ch == '\u21D2') {
if ch == '\u21D2' then
nextChar(); token = ARROW
report.deprecationWarning(em"The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (ch == '\u2190') {
else if ch == '\u2190' then
nextChar(); token = LARROW
report.deprecationWarning(em"The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (Character.isUnicodeIdentifierStart(ch)) {
else if isUnicodeIdentifierStart(ch) then
putChar(ch)
nextChar()
getIdentRest()
}
else if (isSpecial(ch)) {
if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
else if isSpecial(ch) then
putChar(ch)
nextChar()
getOperatorRest()
}
else if isSupplementary(ch, isUnicodeIdentifierStart) then
getIdentRest()
else {
if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
else if isSupplementary(ch, isSpecial) then
getOperatorRest()
else
error(em"illegal character '${toUnicode(ch)}'")
nextChar()
}
fetchOther()
}
}
Expand Down Expand Up @@ -1115,7 +1116,7 @@ object Scanners {
else error(em"unclosed quoted identifier")
}

private def getIdentRest(): Unit = (ch: @switch) match {
@tailrec private def getIdentRest(): Unit = (ch: @switch) match {
case 'A' | 'B' | 'C' | 'D' | 'E' |
'F' | 'G' | 'H' | 'I' | 'J' |
'K' | 'L' | 'M' | 'N' | 'O' |
Expand Down Expand Up @@ -1150,7 +1151,7 @@ object Scanners {
finishNamed()
}

private def getOperatorRest(): Unit = (ch: @switch) match {
@tailrec private def getOperatorRest(): Unit = (ch: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -1161,23 +1162,13 @@ object Scanners {
if nxch == '/' || nxch == '*' then finishNamed()
else { putChar(ch); nextChar(); getOperatorRest() }
case _ =>
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
if isSpecial(ch) then { putChar(ch); nextChar(); getOperatorRest() }
else if isSupplementary(ch, isSpecial) then getOperatorRest()
else finishNamed()
}

private def getIdentOrOperatorRest(): Unit =
if (isIdentifierPart(ch))
getIdentRest()
else ch match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
'|' | '\\' | '/' =>
getOperatorRest()
case _ =>
if (isSpecial(ch)) getOperatorRest()
else finishNamed()
}
if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()

def isSoftModifier: Boolean =
token == IDENTIFIER
Expand Down Expand Up @@ -1500,7 +1491,7 @@ object Scanners {
if (ch == '\'') finishCharLit()
else {
token = op
strVal = if (name != null) name.toString else null
strVal = Objects.toString(name)
litBuf.clear()
}
}
Expand Down
2 changes: 1 addition & 1 deletion compiler/src/dotty/tools/dotc/parsing/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ package object parsing {
def precedence(operator: Name): Int =
if (operator eq nme.ERROR) -1
else {
val firstCh = operator.firstPart.head
val firstCh = operator.firstCodePoint
if (isScalaLetter(firstCh)) 1
else if (operator.isOpAssignmentName) 0
else firstCh match {
Expand Down
57 changes: 35 additions & 22 deletions compiler/src/dotty/tools/dotc/util/Chars.scala
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
package dotty.tools.dotc.util

import scala.annotation.switch
import java.lang.{Character => JCharacter}
import java.lang.Character.LETTER_NUMBER
import java.lang.Character.LOWERCASE_LETTER
import java.lang.Character.OTHER_LETTER
import java.lang.Character.TITLECASE_LETTER
import java.lang.Character.UPPERCASE_LETTER
import Character.{LETTER_NUMBER, LOWERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, UPPERCASE_LETTER}
import Character.{MATH_SYMBOL, OTHER_SYMBOL}
import Character.{isJavaIdentifierPart, isUnicodeIdentifierStart, isUnicodeIdentifierPart}

/** Contains constants and classifier methods for characters */
object Chars {
object Chars:

inline val LF = '\u000A'
inline val FF = '\u000C'
inline val CR = '\u000D'
inline val SU = '\u001A'

type CodePoint = Int

/** Convert a character digit to an Int according to given base,
* -1 if no success
*/
Expand Down Expand Up @@ -59,17 +58,21 @@ object Chars {
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'

/** Can character start an alphanumeric Scala identifier? */
def isIdentifierStart(c: Char): Boolean =
(c == '_') || (c == '$') || JCharacter.isUnicodeIdentifierStart(c)
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)

/** Can character form part of an alphanumeric Scala identifier? */
def isIdentifierPart(c: Char): Boolean =
(c == '$') || JCharacter.isUnicodeIdentifierPart(c)
def isIdentifierPart(c: Char): Boolean = (c == '$') || isUnicodeIdentifierPart(c)
/** Code point overload: is `c` either `$` or a Unicode identifier part? */
def isIdentifierPart(c: CodePoint): Boolean = (c == '$') || isUnicodeIdentifierPart(c)

/** Is character a math or other symbol in Unicode? */
def isSpecial(c: Char): Boolean = {
val chtp = JCharacter.getType(c)
chtp == JCharacter.MATH_SYMBOL.toInt || chtp == JCharacter.OTHER_SYMBOL.toInt
val chtp = Character.getType(c)
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
}
/** Code point overload: is `codePoint` a math or other symbol in Unicode,
 *  i.e. is its `Character.getType` either `MATH_SYMBOL` or `OTHER_SYMBOL`?
 */
def isSpecial(codePoint: CodePoint): Boolean = {
  val chtp = Character.getType(codePoint)
  chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
}

def isValidJVMChar(c: Char): Boolean =
Expand All @@ -78,15 +81,26 @@ object Chars {
def isValidJVMMethodChar(c: Char): Boolean =
!(c == '.' || c == ';' || c =='[' || c == '/' || c == '<' || c == '>')

private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
private final val letterGroups = {
import JCharacter._
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
}
def isScalaLetter(ch: Char): Boolean = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
/** Is `c` a Scala letter?
 *
 *  True when `Character.getType(c)` is one of `LOWERCASE_LETTER`,
 *  `UPPERCASE_LETTER`, `OTHER_LETTER`, `TITLECASE_LETTER` or `LETTER_NUMBER`,
 *  and also for the two characters `$` and `_`.
 */
def isScalaLetter(c: Char): Boolean =
// NOTE(review): `@switch` is written on the argument `c`, not on the match
// scrutinee — confirm this placement is intended.
Character.getType(c: @switch) match {
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
// Any other category counts only if the character is `$` or `_`.
case _ => c == '$' || c == '_'
}
/** Code point overload: is `c` a Scala letter?
 *
 *  True when `Character.getType(c)` is one of `LOWERCASE_LETTER`,
 *  `UPPERCASE_LETTER`, `OTHER_LETTER`, `TITLECASE_LETTER` or `LETTER_NUMBER`,
 *  and also for the code points of `$` and `_`.
 */
def isScalaLetter(c: CodePoint): Boolean =
// NOTE(review): `@switch` is written on the argument `c`, not on the match
// scrutinee — confirm this placement is intended.
Character.getType(c: @switch) match {
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
// Any other category counts only if the code point is `$` or `_`.
case _ => c == '$' || c == '_'
}

/** Can character form part of a Scala operator name? */
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
/** Is `c` one of the ASCII operator characters, or else a character that
 *  `isSpecial` accepts?
 */
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
  case '!' | '#' | '%' | '&' | '*' | '+' |
       '-' | '/' | ':' | '<' | '=' | '>' |
       '?' | '@' | '\\' | '^' | '|' | '~' => true
  case _ => isSpecial(c)
}
def isOperatorPart(c: CodePoint): Boolean = (c: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -95,5 +109,4 @@ object Chars {
}

/** Would the character be encoded by `NameTransformer.encode`? */
def willBeEncoded(c : Char) : Boolean = !JCharacter.isJavaIdentifierPart(c)
}
def willBeEncoded(c: Char): Boolean = !isJavaIdentifierPart(c)
4 changes: 4 additions & 0 deletions tests/pos/surrogates.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,8 @@ class Construction {
def reversed = "xyz\udc00\ud801abc"
}

/** Fixture: declares a member whose whole name is the single emoji `😈`,
 *  a supplementary (non-BMP) character.
 */
class Demon {
val 😈 = 42
}

// was: error: illegal character '\ud801', '\udc00'
32 changes: 32 additions & 0 deletions tests/pos/t1406.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

/** Fixture: uses names beginning with `𐐨` as pattern variables. */
class Identifiers {

// Binds `𐐨XYZ` in a typed pattern and `𐐨` in a bare pattern.
// NOTE(review): relies on `𐐨` being classified as a lower-case letter, so
// that both are variable patterns rather than stable-identifier references — confirm.
def f(x: Any): Boolean = x match {
case 𐐨XYZ: String => true
case 𐐨 => true
}
// Binds `𐐨` with `@` and returns the bound value.
def g(x: Any) = x match {
case 𐐨 @ _ => 𐐨
}
}
/** Fixture: declares each of the characters `𝆗`, `🌀`, `🚀`, `🜀` and `𝓅`
 *  twice, once as a complete method name and once directly after the
 *  `op_` prefix.
 */
class Ops {
def 𝆗 = 42 // was error: illegal character
def op_𝆗 = 42 // was error: illegal character
def 🌀 = 42
def op_🌀 = 42
def 🚀 = 42
def op_🚀 = 42
def 🜀 = 42
def op_🜀 = 42
def 𝓅 = 42
def op_𝓅 = 42
}
/** Fixture: uses the name `𝓅` in and around string literals. */
class Strings {
// Adds a method named `𝓅` to `StringContext`; it ignores its arguments and returns "done".
implicit class Interps(sc: StringContext) {
def 𝓅(parts: Any*) = "done"
}
def 𝓅 = 42
// `$𝓅` inside an `s` interpolation refers to the member `𝓅` above.
def interpolated = s"$𝓅"
// Plain string literal that merely contains the character.
def e = "a 𝓅 b"
// `𝓅` used as the interpolator prefix; presumably resolved through `Interps` — confirm.
def f = 𝓅"one"
}
28 changes: 28 additions & 0 deletions tests/run/t1406b.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@

/** Fixture: wraps an Int and exposes operator-named methods.
 *
 *  Every method except `+` multiplies the two wrapped values; `+` adds them.
 *  The methods therefore differ only in name, so the result of a mixed
 *  expression shows how the operators were grouped.
 */
case class C(n: Int) {
def 𐀀(c: C): C = C(n * c.n) // actually a letter but supplementary 0x10000
def ☀(c: C): C = C(n * c.n) // just a symbol
def ☀=(c: C): C = C(n * c.n) // just a symbol
def 🌀(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
def 🌀=(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
def *(c: C): C = C(n * c.n)
def +(c: C): C = C(n + c.n)
}
/** Runs the precedence checks: with `c` and `d` both wrapping 42, each
 *  assertion pins how a mixed operator expression is grouped.
 */
object Test extends App {
// 42 + 42
val Sum = 84
// 42 * 42
val Product = 1764
// (42 * 42) + 42, i.e. the multiplying operator groups first
val ProductSum = 1806
// 42 * (42 + 42), i.e. `+` groups first
val SumProduct = 3528
val c, d = C(42)
// Compares the expected Int with the value wrapped by `actual`.
def assertEquals(expected: Int, actual: C) = assert(expected == actual.n)
assertEquals(Sum, c + d)
assertEquals(Product, c * d)
assertEquals(Product, c ☀ d)
assertEquals(ProductSum, c * d + d)
assertEquals(ProductSum, c ☀ d + d)
assertEquals(SumProduct, c ☀= d + d) // assignment op is low precedence
assertEquals(SumProduct, c 𐀀 d + d) // the first one, letter should be low precedence
assertEquals(ProductSum, c 🌀d + d) // the second one, cyclone should be high precedence
assertEquals(SumProduct, c 🌀= d + d) // assignment op is low precedence
}