Skip to content

Commit 805dda8

Browse files
authored
Improve support for Unicode supplementary characters in identifiers and string interpolation (as in Scala 2) (#16278)
Fixes #16271
2 parents 6f5bb34 + 22f11cd commit 805dda8

File tree

8 files changed

+146
-64
lines changed

8 files changed

+146
-64
lines changed

compiler/src/dotty/tools/dotc/core/NameOps.scala

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,19 +86,25 @@ object NameOps {
8686
def isVarPattern: Boolean =
8787
testSimple { n =>
8888
n.length > 0 && {
89+
def isLowerLetterSupplementary: Boolean =
90+
import Character.{isHighSurrogate, isLowSurrogate, isLetter, isLowerCase, isValidCodePoint, toCodePoint}
91+
isHighSurrogate(n(0)) && n.length > 1 && isLowSurrogate(n(1)) && {
92+
val codepoint = toCodePoint(n(0), n(1))
93+
isValidCodePoint(codepoint) && isLetter(codepoint) && isLowerCase(codepoint)
94+
}
8995
val first = n.head
90-
(((first.isLower && first.isLetter) || first == '_')
91-
&& (n != false_)
92-
&& (n != true_)
93-
&& (n != null_))
96+
((first.isLower && first.isLetter || first == '_' || isLowerLetterSupplementary)
97+
&& n != false_
98+
&& n != true_
99+
&& n != null_)
94100
}
95101
} || name.is(PatMatGivenVarName)
96102

97103
def isOpAssignmentName: Boolean = name match {
98104
case raw.NE | raw.LE | raw.GE | EMPTY =>
99105
false
100106
case name: SimpleName =>
101-
name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.head)
107+
name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.firstCodePoint)
102108
case _ =>
103109
false
104110
}
@@ -352,6 +358,14 @@ object NameOps {
352358
val unmangled = kinds.foldLeft(name)(_.unmangle(_))
353359
if (unmangled eq name) name else unmangled.unmangle(kinds)
354360
}
361+
362+
def firstCodePoint: Int =
363+
val first = name.firstPart
364+
import Character.{isHighSurrogate, isLowSurrogate, isValidCodePoint, toCodePoint}
365+
if isHighSurrogate(first(0)) && first.length > 1 && isLowSurrogate(first(1)) then
366+
val codepoint = toCodePoint(first(0), first(1))
367+
if isValidCodePoint(codepoint) then codepoint else first(0)
368+
else first(0)
355369
}
356370

357371
extension (name: TermName) {

compiler/src/dotty/tools/dotc/core/Names.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ object Names {
2525
*/
2626
abstract class Designator
2727

28-
/** A name if either a term name or a type name. Term names can be simple
28+
/** A name is either a term name or a type name. Term names can be simple
2929
* or derived. A simple term name is essentially an interned string stored
3030
* in a name table. A derived term name adds a tag, and possibly a number
3131
* or a further simple name to some other name.

compiler/src/dotty/tools/dotc/parsing/Scanners.scala

Lines changed: 26 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import config.Feature.{migrateTo3, fewerBracesEnabled}
2121
import config.SourceVersion.`3.0`
2222
import reporting.{NoProfile, Profile, Message}
2323

24+
import java.util.Objects
25+
2426
object Scanners {
2527

2628
/** Offset into source character array */
@@ -777,19 +779,21 @@ object Scanners {
777779
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
778780
isHighSurrogate(high) && {
779781
var res = false
780-
nextChar()
781-
val low = ch
782+
val low = lookaheadChar()
782783
if isLowSurrogate(low) then
783-
nextChar()
784784
val codepoint = toCodePoint(high, low)
785-
if isValidCodePoint(codepoint) && test(codepoint) then
786-
putChar(high)
787-
putChar(low)
788-
res = true
785+
if isValidCodePoint(codepoint) then
786+
if test(codepoint) then
787+
putChar(high)
788+
putChar(low)
789+
nextChar()
790+
nextChar()
791+
res = true
789792
else
790793
error(em"illegal character '${toUnicode(high)}${toUnicode(low)}'")
791794
else if !strict then
792795
putChar(high)
796+
nextChar()
793797
res = true
794798
else
795799
error(em"illegal character '${toUnicode(high)}' missing low surrogate")
@@ -889,7 +893,6 @@ object Scanners {
889893
if (ch == '\"') {
890894
if (lookaheadChar() == '\"') {
891895
nextRawChar()
892-
//offset += 3 // first part is positioned at the quote
893896
nextRawChar()
894897
stringPart(multiLine = true)
895898
}
@@ -900,7 +903,6 @@ object Scanners {
900903
}
901904
}
902905
else {
903-
//offset += 1 // first part is positioned at the quote
904906
stringPart(multiLine = false)
905907
}
906908
}
@@ -977,30 +979,29 @@ object Scanners {
977979
}
978980
case _ =>
979981
def fetchOther() =
980-
if (ch == '\u21D2') {
982+
if ch == '\u21D2' then
981983
nextChar(); token = ARROW
982984
report.deprecationWarning(em"The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
983-
}
984-
else if (ch == '\u2190') {
985+
else if ch == '\u2190' then
985986
nextChar(); token = LARROW
986987
report.deprecationWarning(em"The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
987-
}
988-
else if (Character.isUnicodeIdentifierStart(ch)) {
988+
else if isUnicodeIdentifierStart(ch) then
989989
putChar(ch)
990990
nextChar()
991991
getIdentRest()
992-
}
993-
else if (isSpecial(ch)) {
992+
if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
993+
else if isSpecial(ch) then
994994
putChar(ch)
995995
nextChar()
996996
getOperatorRest()
997-
}
998997
else if isSupplementary(ch, isUnicodeIdentifierStart) then
999998
getIdentRest()
1000-
else {
999+
if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
1000+
else if isSupplementary(ch, isSpecial) then
1001+
getOperatorRest()
1002+
else
10011003
error(em"illegal character '${toUnicode(ch)}'")
10021004
nextChar()
1003-
}
10041005
fetchOther()
10051006
}
10061007
}
@@ -1115,7 +1116,7 @@ object Scanners {
11151116
else error(em"unclosed quoted identifier")
11161117
}
11171118

1118-
private def getIdentRest(): Unit = (ch: @switch) match {
1119+
@tailrec private def getIdentRest(): Unit = (ch: @switch) match {
11191120
case 'A' | 'B' | 'C' | 'D' | 'E' |
11201121
'F' | 'G' | 'H' | 'I' | 'J' |
11211122
'K' | 'L' | 'M' | 'N' | 'O' |
@@ -1150,7 +1151,7 @@ object Scanners {
11501151
finishNamed()
11511152
}
11521153

1153-
private def getOperatorRest(): Unit = (ch: @switch) match {
1154+
@tailrec private def getOperatorRest(): Unit = (ch: @switch) match {
11541155
case '~' | '!' | '@' | '#' | '%' |
11551156
'^' | '*' | '+' | '-' | '<' |
11561157
'>' | '?' | ':' | '=' | '&' |
@@ -1161,23 +1162,13 @@ object Scanners {
11611162
if nxch == '/' || nxch == '*' then finishNamed()
11621163
else { putChar(ch); nextChar(); getOperatorRest() }
11631164
case _ =>
1164-
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
1165+
if isSpecial(ch) then { putChar(ch); nextChar(); getOperatorRest() }
1166+
else if isSupplementary(ch, isSpecial) then getOperatorRest()
11651167
else finishNamed()
11661168
}
11671169

11681170
private def getIdentOrOperatorRest(): Unit =
1169-
if (isIdentifierPart(ch))
1170-
getIdentRest()
1171-
else ch match {
1172-
case '~' | '!' | '@' | '#' | '%' |
1173-
'^' | '*' | '+' | '-' | '<' |
1174-
'>' | '?' | ':' | '=' | '&' |
1175-
'|' | '\\' | '/' =>
1176-
getOperatorRest()
1177-
case _ =>
1178-
if (isSpecial(ch)) getOperatorRest()
1179-
else finishNamed()
1180-
}
1171+
if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()
11811172

11821173
def isSoftModifier: Boolean =
11831174
token == IDENTIFIER
@@ -1500,7 +1491,7 @@ object Scanners {
15001491
if (ch == '\'') finishCharLit()
15011492
else {
15021493
token = op
1503-
strVal = if (name != null) name.toString else null
1494+
strVal = Objects.toString(name)
15041495
litBuf.clear()
15051496
}
15061497
}

compiler/src/dotty/tools/dotc/parsing/package.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ package object parsing {
1717
def precedence(operator: Name): Int =
1818
if (operator eq nme.ERROR) -1
1919
else {
20-
val firstCh = operator.firstPart.head
20+
val firstCh = operator.firstCodePoint
2121
if (isScalaLetter(firstCh)) 1
2222
else if (operator.isOpAssignmentName) 0
2323
else firstCh match {

compiler/src/dotty/tools/dotc/util/Chars.scala

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
11
package dotty.tools.dotc.util
22

33
import scala.annotation.switch
4-
import java.lang.{Character => JCharacter}
5-
import java.lang.Character.LETTER_NUMBER
6-
import java.lang.Character.LOWERCASE_LETTER
7-
import java.lang.Character.OTHER_LETTER
8-
import java.lang.Character.TITLECASE_LETTER
9-
import java.lang.Character.UPPERCASE_LETTER
4+
import Character.{LETTER_NUMBER, LOWERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, UPPERCASE_LETTER}
5+
import Character.{MATH_SYMBOL, OTHER_SYMBOL}
6+
import Character.{isJavaIdentifierPart, isUnicodeIdentifierStart, isUnicodeIdentifierPart}
107

118
/** Contains constants and classifier methods for characters */
12-
object Chars {
9+
object Chars:
1310

1411
inline val LF = '\u000A'
1512
inline val FF = '\u000C'
1613
inline val CR = '\u000D'
1714
inline val SU = '\u001A'
1815

16+
type CodePoint = Int
17+
1918
/** Convert a character digit to an Int according to given base,
2019
* -1 if no success
2120
*/
@@ -59,17 +58,21 @@ object Chars {
5958
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
6059

6160
/** Can character start an alphanumeric Scala identifier? */
62-
def isIdentifierStart(c: Char): Boolean =
63-
(c == '_') || (c == '$') || JCharacter.isUnicodeIdentifierStart(c)
61+
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
62+
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
6463

6564
/** Can character form part of an alphanumeric Scala identifier? */
66-
def isIdentifierPart(c: Char): Boolean =
67-
(c == '$') || JCharacter.isUnicodeIdentifierPart(c)
65+
def isIdentifierPart(c: Char): Boolean = (c == '$') || isUnicodeIdentifierPart(c)
66+
def isIdentifierPart(c: CodePoint) = (c == '$') || isUnicodeIdentifierPart(c)
6867

6968
/** Is character a math or other symbol in Unicode? */
7069
def isSpecial(c: Char): Boolean = {
71-
val chtp = JCharacter.getType(c)
72-
chtp == JCharacter.MATH_SYMBOL.toInt || chtp == JCharacter.OTHER_SYMBOL.toInt
70+
val chtp = Character.getType(c)
71+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
72+
}
73+
def isSpecial(codePoint: CodePoint) = {
74+
val chtp = Character.getType(codePoint)
75+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
7376
}
7477

7578
def isValidJVMChar(c: Char): Boolean =
@@ -78,15 +81,26 @@ object Chars {
7881
def isValidJVMMethodChar(c: Char): Boolean =
7982
!(c == '.' || c == ';' || c =='[' || c == '/' || c == '<' || c == '>')
8083

81-
private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
82-
private final val letterGroups = {
83-
import JCharacter._
84-
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
85-
}
86-
def isScalaLetter(ch: Char): Boolean = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
84+
def isScalaLetter(c: Char): Boolean =
85+
Character.getType(c: @switch) match {
86+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
87+
case _ => c == '$' || c == '_'
88+
}
89+
def isScalaLetter(c: CodePoint): Boolean =
90+
Character.getType(c: @switch) match {
91+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
92+
case _ => c == '$' || c == '_'
93+
}
8794

8895
/** Can character form part of a Scala operator name? */
89-
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
96+
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
97+
case '~' | '!' | '@' | '#' | '%' |
98+
'^' | '*' | '+' | '-' | '<' |
99+
'>' | '?' | ':' | '=' | '&' |
100+
'|' | '/' | '\\' => true
101+
case c => isSpecial(c)
102+
}
103+
def isOperatorPart(c: CodePoint): Boolean = (c: @switch) match {
90104
case '~' | '!' | '@' | '#' | '%' |
91105
'^' | '*' | '+' | '-' | '<' |
92106
'>' | '?' | ':' | '=' | '&' |
@@ -95,5 +109,4 @@ object Chars {
95109
}
96110

97111
/** Would the character be encoded by `NameTransformer.encode`? */
98-
def willBeEncoded(c : Char) : Boolean = !JCharacter.isJavaIdentifierPart(c)
99-
}
112+
def willBeEncoded(c: Char): Boolean = !isJavaIdentifierPart(c)

tests/pos/surrogates.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,8 @@ class Construction {
2525
def reversed = "xyz\udc00\ud801abc"
2626
}
2727

28+
class Demon {
29+
val 😈 = 42
30+
}
31+
2832
// was: error: illegal character '\ud801', '\udc00'

tests/pos/t1406.scala

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
class Identifiers {
3+
4+
def f(x: Any): Boolean = x match {
5+
case 𐐨XYZ: String => true
6+
case 𐐨 => true
7+
}
8+
def g(x: Any) = x match {
9+
case 𐐨 @ _ => 𐐨
10+
}
11+
}
12+
class Ops {
13+
def 𝆗 = 42 // was error: illegal character
14+
def op_𝆗 = 42 // was error: illegal character
15+
def 🌀 = 42
16+
def op_🌀 = 42
17+
def 🚀 = 42
18+
def op_🚀 = 42
19+
def 🜀 = 42
20+
def op_🜀 = 42
21+
def 𝓅 = 42
22+
def op_𝓅 = 42
23+
}
24+
class Strings {
25+
implicit class Interps(sc: StringContext) {
26+
def 𝓅(parts: Any*) = "done"
27+
}
28+
def 𝓅 = 42
29+
def interpolated = s"$𝓅"
30+
def e = "a 𝓅 b"
31+
def f = 𝓅"one"
32+
}

tests/run/t1406b.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
2+
case class C(n: Int) {
3+
def 𐀀(c: C): C = C(n * c.n) // actually a letter but supplementary 0x10000
4+
def ☀(c: C): C = C(n * c.n) // just a symbol
5+
def ☀=(c: C): C = C(n * c.n) // just a symbol
6+
def 🌀(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
7+
def 🌀=(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
8+
def *(c: C): C = C(n * c.n)
9+
def +(c: C): C = C(n + c.n)
10+
}
11+
object Test extends App {
12+
val Sum = 84
13+
val Product = 1764
14+
val ProductSum = 1806
15+
val SumProduct = 3528
16+
val c, d = C(42)
17+
def assertEquals(expected: Int, actual: C) = assert(expected == actual.n)
18+
assertEquals(Sum, c + d)
19+
assertEquals(Product, c * d)
20+
assertEquals(Product, c ☀ d)
21+
assertEquals(ProductSum, c * d + d)
22+
assertEquals(ProductSum, c ☀ d + d)
23+
assertEquals(SumProduct, c ☀= d + d) // assignment op is low precedence
24+
assertEquals(SumProduct, c 𐀀 d + d) // the first one, letter should be low precedence
25+
assertEquals(ProductSum, c 🌀d + d) // the second one, cyclone should be high precedence
26+
assertEquals(SumProduct, c 🌀= d + d) // assignment op is low precedence
27+
}
28+

0 commit comments

Comments
 (0)