Commit 04c51f7

Merge pull request google#5 from sparkprime/lexer_changes
Port lexer changes from google/jsonnet 0c96da7 to 27ddf2c. Fix google#1
2 parents: 2282fdf + c3f136d

File tree: 3 files changed (+48, -31 lines)

lexer.go

Lines changed: 41 additions & 29 deletions
@@ -55,7 +55,6 @@ const (
 	tokenBraceR
 	tokenBracketL
 	tokenBracketR
-	tokenColon
 	tokenComma
 	tokenDollar
 	tokenDot
@@ -101,7 +100,6 @@ var tokenKindStrings = []string{
 	tokenBraceR: "\"}\"",
 	tokenBracketL: "\"[\"",
 	tokenBracketR: "\"]\"",
-	tokenColon: "\":\"",
 	tokenComma: "\",\"",
 	tokenDollar: "\"$\"",
 	tokenDot: "\".\"",
@@ -197,7 +195,7 @@ func isIdentifier(r rune) bool {
 
 func isSymbol(r rune) bool {
 	switch r {
-	case '&', '|', '^', '=', '<', '>', '*', '/', '%', '#':
+	case '!', '$', ':', '~', '+', '-', '&', '|', '^', '=', '<', '>', '*', '/', '%':
 		return true
 	}
 	return false
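
For reference, the widened symbol set can be summarized in a standalone predicate (isOperatorRune is a made-up name, not code from this repository): '#' no longer counts as a symbol, while '!', '$', ':', '~', '+' and '-' now do, so those runes flow through the generic operator path in lexSymbol instead of being special-cased elsewhere.

package main

import (
	"fmt"
	"strings"
)

// isOperatorRune mirrors the new isSymbol set shown above.
func isOperatorRune(r rune) bool {
	return strings.ContainsRune("!$:~+-&|^=<>*/%", r)
}

func main() {
	for _, r := range "#!$:" {
		fmt.Printf("%q -> %v\n", r, isOperatorRune(r)) // '#' is false; the others are true
	}
}
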
@@ -533,7 +531,7 @@ func (l *lexer) lexIdentifier() {
 }
 
 // lexSymbol will lex a token that starts with a symbol. This could be a
-// comment, block quote or an operator. This function assumes that the next
+// C or C++ comment, block quote or an operator. This function assumes that the next
 // rune to be served by the lexer will be the first rune of the new token.
 func (l *lexer) lexSymbol() error {
 	r := l.next()
@@ -550,16 +548,6 @@ func (l *lexer) lexSymbol() error {
 		return nil
 	}
 
-	if r == '#' {
-		l.resetTokenStart() // Throw out the leading #
-		for r = l.next(); r != lexEOF && r != '\n'; r = l.next() {
-		}
-		// Leave the '\n' in the lexer to be fodder for the next round
-		l.backup()
-		l.addCommentFodder(fodderCommentHash)
-		return nil
-	}
-
 	if r == '/' && l.peek() == '*' {
 		commentStartLoc := l.tokenStartLoc
 		l.next() // consume the '*'
@@ -640,10 +628,39 @@ func (l *lexer) lexSymbol() error {
 
 	// Assume any string of symbols is a single operator.
 	for r = l.next(); isSymbol(r); r = l.next() {
-
+		// Not allowed // in operators
+		if r == '/' && strings.HasPrefix(l.input[l.pos:], "/") {
+			break
+		}
+		// Not allowed /* in operators
+		if r == '/' && strings.HasPrefix(l.input[l.pos:], "*") {
+			break
+		}
+		// Not allowed ||| in operators
+		if r == '|' && strings.HasPrefix(l.input[l.pos:], "||") {
+			break
+		}
 	}
+
 	l.backup()
-	l.emitToken(tokenOperator)
+
+	// Operators are not allowed to end with + - ~ ! unless they are one rune long.
+	// So, wind it back if we need to, but stop at the first rune.
+	// This relies on the hack that all operator symbols are ASCII and thus there is
+	// no need to treat this substring as general UTF-8.
+	for r = rune(l.input[l.pos - 1]); l.pos > l.tokenStart + 1; l.pos-- {
+		switch r {
+		case '+', '-', '~', '!':
+			continue
+		}
+		break
+	}
+
+	if l.input[l.tokenStart:l.pos] == "$" {
+		l.emitToken(tokenDollar)
+	} else {
+		l.emitToken(tokenOperator)
+	}
 	return nil
 }
 
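The two loops added above implement a longest-match rule with a wind-back step: symbol runes are consumed greedily, but an operator may not end in '+', '-', '~' or '!' unless it is a single rune, so such trailing runes are handed back to the lexer. A minimal standalone sketch of that rule (splitOperator is a made-up name, not code from this repository):

package main

import (
	"fmt"
	"strings"
)

// splitOperator trims trailing '+', '-', '~' and '!' runes from a greedily
// matched operator, stopping once only one rune is left. The trimmed suffix
// is what the real lexer would hand back to be lexed as further tokens.
func splitOperator(s string) (op, rest string) {
	op = s
	for len(op) > 1 && strings.ContainsRune("+-~!", rune(op[len(op)-1])) {
		rest = op[len(op)-1:] + rest
		op = op[:len(op)-1]
	}
	return op, rest
}

func main() {
	fmt.Println(splitOperator("->"))  // "->"   : '>' may end an operator
	fmt.Println(splitOperator("<-"))  // "<" "-": '-' is wound back and lexed on its own
	fmt.Println(splitOperator(":::")) // ":::"  : stays a single operator
}

This is why the tests below expect "->" to stay one operator while "<-" lexes as "<" followed by "-".
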
@@ -665,12 +682,8 @@ func lex(fn string, input string) (tokens, error) {
 			l.emitToken(tokenBracketL)
 		case ']':
 			l.emitToken(tokenBracketR)
-		case ':':
-			l.emitToken(tokenColon)
 		case ',':
 			l.emitToken(tokenComma)
-		case '$':
-			l.emitToken(tokenDollar)
 		case '.':
 			l.emitToken(tokenDot)
 		case '(':
@@ -680,15 +693,6 @@ func lex(fn string, input string) (tokens, error) {
 		case ';':
 			l.emitToken(tokenSemicolon)
 
-		// Operators
-		case '!':
-			if l.peek() == '=' {
-				_ = l.next()
-			}
-			l.emitToken(tokenOperator)
-		case '~', '+', '-':
-			l.emitToken(tokenOperator)
-
 		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 			l.backup()
 			err = l.lexNumber()
@@ -733,6 +737,14 @@ func lex(fn string, input string) (tokens, error) {
 					r = l.next()
 				}
 			}
+		case '#':
+			l.resetTokenStart() // Throw out the leading #
+			for r = l.next(); r != lexEOF && r != '\n'; r = l.next() {
+			}
+			// Leave the '\n' in the lexer to be fodder for the next round
+			l.backup()
+			l.addCommentFodder(fodderCommentHash)
+
 		default:
 			if isIdentifierFirst(r) {
 				l.backup()

lexer_test.go

Lines changed: 6 additions & 1 deletion
@@ -38,7 +38,12 @@ var lexTests = []lexTest{
 	{"brace R", "}", tokens{{kind: tokenBraceR, data: "}"}}, ""},
 	{"bracket L", "[", tokens{{kind: tokenBracketL, data: "["}}, ""},
 	{"bracket R", "]", tokens{{kind: tokenBracketR, data: "]"}}, ""},
-	{"colon", ":", tokens{{kind: tokenColon, data: ":"}}, ""},
+	{"colon", ":", tokens{{kind: tokenOperator, data: ":"}}, ""},
+	{"colon2", "::", tokens{{kind: tokenOperator, data: "::"}}, ""},
+	{"colon3", ":::", tokens{{kind: tokenOperator, data: ":::"}}, ""},
+	{"arrow right", "->", tokens{{kind: tokenOperator, data: "->"}}, ""},
+	{"less than minus", "<-", tokens{{kind: tokenOperator, data: "<"},
+		{kind: tokenOperator, data: "-"}}, ""},
 	{"comma", ",", tokens{{kind: tokenComma, data: ","}}, ""},
 	{"dollar", "$", tokens{{kind: tokenDollar, data: "$"}}, ""},
 	{"dot", ".", tokens{{kind: tokenDot, data: "."}}, ""},

parser.go

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ func (p *parser) parse(prec precedence) (astNode, error) {
 			return nil, err
 		}
 		var msg astNode
-		if p.peek().kind == tokenColon {
+		if p.peek().kind == tokenOperator && p.peek().data == ":" {
 			p.pop()
 			msg, err = p.parse(maxPrecedence)
 			if err != nil {
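
With the dedicated colon token kind gone, the parser now matches the generic operator token by its data. If that check recurs, it could be factored into a small helper along these lines (purely illustrative, not part of this commit; the token type and field names are taken from the test file above):

// isColonToken reports whether t is the ":" operator token.
// Hypothetical helper, not in the repository.
func isColonToken(t token) bool {
	return t.kind == tokenOperator && t.data == ":"
}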
