diff --git a/src/kinds.jl b/src/kinds.jl
index a99f3b1d..70c35982 100644
--- a/src/kinds.jl
+++ b/src/kinds.jl
@@ -26,6 +26,7 @@ const _kind_names =
     "ErrorInvalidUTF8"
     "ErrorInvisibleChar"
    "ErrorUnknownCharacter"
+    "ErrorBidiFormatting"
     # Generic error
     "error"
     "END_ERRORS"
@@ -1049,6 +1050,7 @@ const _nonunique_kind_names = Set([
     K"ErrorInvalidUTF8"
     K"ErrorInvisibleChar"
     K"ErrorUnknownCharacter"
+    K"ErrorBidiFormatting"
     K"ErrorInvalidOperator"
 
     K"Integer"
@@ -1098,6 +1100,7 @@ const _token_error_descriptions = Dict{Kind, String}(
     K"ErrorInvalidUTF8"=>"invalid UTF-8 character",
     K"ErrorInvisibleChar"=>"invisible character",
     K"ErrorUnknownCharacter"=>"unknown unicode character",
+    K"ErrorBidiFormatting"=>"unbalanced bidirectional unicode formatting",
     K"ErrorInvalidOperator" => "invalid operator",
     K"Error**" => "use `x^y` instead of `x**y` for exponentiation, and `x...` instead of `**x` for splatting",
     K"error" => "unknown error token",
diff --git a/src/parse_stream.jl b/src/parse_stream.jl
index e8ebbfd5..94feba03 100644
--- a/src/parse_stream.jl
+++ b/src/parse_stream.jl
@@ -949,6 +949,8 @@ function validate_tokens(stream::ParseStream)
             # Emit messages for non-generic token errors
             msg = if k in KSet"ErrorInvalidUTF8 ErrorInvisibleChar ErrorUnknownCharacter"
                 "$(_token_error_descriptions[k]) $(repr(text[fbyte]))"
+            elseif k == K"ErrorBidiFormatting"
+                "$(_token_error_descriptions[k]) $(repr(text[fbyte:prevind(text, nbyte)]))"
             else
                 _token_error_descriptions[k]
             end
diff --git a/src/parser.jl b/src/parser.jl
index d388a49f..e2691e73 100644
--- a/src/parser.jl
+++ b/src/parser.jl
@@ -3282,6 +3282,9 @@ function parse_string(ps::ParseState, raw::Bool)
                 first_chunk = false
                 n_valid_chunks += 1
             end
+        elseif k == K"ErrorInvalidInterpolationTerminator" || k == K"ErrorBidiFormatting"
+            # Treat these errors as string chunks
+            bump(ps)
         else
             break
         end
@@ -3381,6 +3384,8 @@ function parse_atom(ps::ParseState, check_identifiers=true)
         else
             if k == K"Char"
                 bump(ps)
+            elseif is_error(k)
+                bump(ps)
             else
                 # FIXME: This case is actually a tokenization error.
                 # Make a best-effort attempt to workaround this for now by
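
The three hunks above thread the new `K"ErrorBidiFormatting"` kind through the kind tables, the diagnostic messages, and the parser's recovery paths. End to end, an unbalanced bidi control now surfaces as an ordinary parse diagnostic that quotes the whole offending chunk rather than a single character. A sketch of the observable behaviour (shape taken from the diagnostics tests further down; REPL output paraphrased, not verbatim):

    using JuliaSyntax

    # Throws JuliaSyntax.ParseError; the diagnostic message is
    # `unbalanced bidirectional unicode formatting "X \u202a X"`.
    JuliaSyntax.parsestmt(JuliaSyntax.GreenNode, "\"X \u202a X\"")
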
diff --git a/src/tokenize.jl b/src/tokenize.jl
index 6f37e85e..86e21d31 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -2,7 +2,7 @@ module Tokenize
 
 export tokenize, untokenize, Tokens
 
-using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
+using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
 
 import ..JuliaSyntax: kind, is_literal, is_error, is_contextual_keyword, is_word_operator
 
@@ -382,9 +382,6 @@ end
 
 Returns the next character and increments the current position.
 """
-function readchar end
-
-
 function readchar(l::Lexer)
     c = readchar(l.io)
     l.chars = (l.chars[2], l.chars[3], l.chars[4], c)
@@ -446,17 +443,6 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
     return tok
 end
 
-"""
-    emit_error(l::Lexer, err::Kind)
-
-Returns an `K"error"` token with error `err` and starts a new `RawToken`.
-"""
-function emit_error(l::Lexer, err::Kind)
-    @assert is_error(err)
-    return emit(l, err)
-end
-
-
 """
     next_token(l::Lexer)
 
@@ -551,13 +537,33 @@ function _next_token(l::Lexer, c)
     elseif (k = get(_unicode_ops, c, K"error")) != K"error"
         return emit(l, k)
     else
-        emit_error(l,
+        emit(l,
             !isvalid(c)          ? K"ErrorInvalidUTF8"   :
             is_invisible_char(c) ? K"ErrorInvisibleChar" :
             K"ErrorUnknownCharacter")
     end
 end
 
+# UAX #9: Unicode Bidirectional Algorithm
+# https://unicode.org/reports/tr9/
+# Very partial implementation - just enough to check correct nesting in strings
+# and multiline comments.
+function update_bidi_state((embedding_nesting, isolate_nesting), c)
+    if c == '\n'
+        embedding_nesting = 0
+        isolate_nesting = 0
+    elseif c == '\U202A' || c == '\U202B' || c == '\U202D' || c == '\U202E' # LRE RLE LRO RLO
+        embedding_nesting += 1
+    elseif c == '\U202C' # PDF
+        embedding_nesting -= 1
+    elseif c == '\U2066' || c == '\U2067' || c == '\U2068' # LRI RLI FSI
+        isolate_nesting += 1
+    elseif c == '\U2069' # PDI
+        isolate_nesting -= 1
+    end
+    return (embedding_nesting, isolate_nesting)
+end
+
 # We're inside a string; possibly reading the string characters, or maybe in
 # Julia code within an interpolation.
 function lex_string_chunk(l)
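
`update_bidi_state` folds a pair of nesting counters over the characters of a chunk: the explicit embedding/override controls (LRE, RLE, LRO, RLO) and the isolate initiators (LRI, RLI, FSI) increment their respective counter, the terminators PDF and PDI decrement, and a newline clears both, roughly following UAX #9's rule that these scopes do not outlive a paragraph. A chunk is flagged when the counters fail to return to their initial state. To exercise the classification in isolation (`bidi_balanced` is a hypothetical wrapper for illustration, not part of this patch, and assumes `update_bidi_state` is in scope; it is internal to `JuliaSyntax.Tokenize`):

    # Fold the counters over a string; balanced iff both end at zero.
    bidi_balanced(s) = foldl(update_bidi_state, s; init=(0, 0)) == (0, 0)

    bidi_balanced("plain text")         # true  - no bidi controls at all
    bidi_balanced("\u202a abc \u202c")  # true  - LRE ... PDF
    bidi_balanced("\u202a abc")         # false - unterminated embedding
    bidi_balanced("abc \u2069")         # false - stray PDI leaves a -1 residue
    bidi_balanced("\u202a abc \n")      # true  - newline resets both counters

Note that any nonzero residue, including a negative one from a stray terminator, marks the chunk unbalanced, which matches the `bidi_state != init_bidi_state` checks below.
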
@@ -565,6 +571,9 @@ function lex_string_chunk(l)
     if state.paren_depth > 0
         # Read normal Julia code inside an interpolation but track nesting of
         # parentheses.
+        # TODO: This stateful tracking should probably, somehow, be done by the
+        # parser instead? Especially for recovery of unbalanced parens inside
+        # interpolations?
         c = readchar(l)
         if c == '('
             l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
@@ -598,7 +607,7 @@ function lex_string_chunk(l)
             # Only allow certain characters after interpolated vars
             # https://github.com/JuliaLang/julia/pull/25234
             readchar(l)
-            return emit_error(l, K"ErrorInvalidInterpolationTerminator")
+            return emit(l, K"ErrorInvalidInterpolationTerminator")
         end
         if pc == EOF_CHAR
             return emit(l, K"EndMarker")
@@ -637,6 +646,8 @@ function lex_string_chunk(l)
         end
     end
     # Read a chunk of string characters
+    init_bidi_state = (0,0)
+    bidi_state = init_bidi_state
     if state.raw
         # Raw strings treat all characters as literals with the exception that
         # the closing quotes can be escaped with an odd number of \ characters.
@@ -647,7 +658,10 @@ function lex_string_chunk(l)
             elseif state.triplestr && (pc == '\n' || pc == '\r')
                 # triple quoted newline splitting
                 readchar(l)
-                if pc == '\r' && peekchar(l) == '\n'
+                if pc == '\n'
+                    bidi_state = init_bidi_state
+                elseif pc == '\r' && peekchar(l) == '\n'
+                    bidi_state = init_bidi_state
                     readchar(l)
                 end
                 break
@@ -663,6 +677,7 @@ function lex_string_chunk(l)
                     readchar(l)
                 end
             end
+            bidi_state = update_bidi_state(bidi_state, c)
         end
     else
         while true
@@ -672,16 +687,22 @@ function lex_string_chunk(l)
             elseif state.triplestr && (pc == '\n' || pc == '\r')
                 # triple quoted newline splitting
                 readchar(l)
-                if pc == '\r' && peekchar(l) == '\n'
+                if pc == '\n'
+                    bidi_state = init_bidi_state
+                elseif pc == '\r' && peekchar(l) == '\n'
                     readchar(l)
+                    bidi_state = init_bidi_state
                 end
                 break
             elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
                 break
             elseif pc == '\\'
                 # Escaped newline
-                pc2 = dpeekchar(l)[2]
+                _, pc2, pc3 = peekchar3(l)
                 if pc2 == '\r' || pc2 == '\n'
+                    if pc2 == '\n' || pc3 == '\n'
+                        bidi_state = init_bidi_state
+                    end
                     break
                 end
             end
@@ -689,12 +710,16 @@ function lex_string_chunk(l)
             c = readchar(l)
             if c == '\\'
                 c = readchar(l)
                 c == EOF_CHAR && break
-                continue
             end
+            bidi_state = update_bidi_state(bidi_state, c)
         end
     end
-    return emit(l, state.delim == '"' ? K"String" :
-                state.delim == '`' ? K"CmdString" : K"Char")
+    outk = state.delim == '\'' ? K"Char" :
+           bidi_state != init_bidi_state ? K"ErrorBidiFormatting" :
+           state.delim == '"'  ? K"String" :
+           state.delim == '`'  ? K"CmdString" :
+           (@assert(state.delim in KSet"' \" `"); K"error")
+    return emit(l, outk)
 end
 
 # Lex whitespace, a whitespace char `c` has been consumed
@@ -725,13 +750,16 @@ function lex_comment(l::Lexer)
         end
     else
         c = readchar(l) # consume the '='
+        init_bidi_state = (0,0)
+        bidi_state = init_bidi_state
        skip = true  # true => c was part of the prev comment marker pair
         nesting = 1
         while true
             if c == EOF_CHAR
-                return emit_error(l, K"ErrorEofMultiComment")
+                return emit(l, K"ErrorEofMultiComment")
             end
             nc = readchar(l)
+            bidi_state = update_bidi_state(bidi_state, nc)
             if skip
                 skip = false
             else
@@ -742,7 +770,9 @@ function lex_comment(l::Lexer)
                     nesting -= 1
                     skip = true
                     if nesting == 0
-                        return emit(l, K"Comment")
+                        outk = bidi_state == init_bidi_state ?
+                               K"Comment" : K"ErrorBidiFormatting"
+                        return emit(l, outk)
                     end
                 end
             end
@@ -791,12 +821,12 @@ function lex_less(l::Lexer)
     elseif dpeekchar(l) == ('-', '-')
         readchar(l); readchar(l)
         if accept(l, '-')
-            return emit_error(l, K"ErrorInvalidOperator")
+            return emit(l, K"ErrorInvalidOperator")
         else
             if accept(l, '>')
                 return emit(l, K"<-->")
             elseif accept(l, '-')
-                return emit_error(l, K"ErrorInvalidOperator")
+                return emit(l, K"ErrorInvalidOperator")
             else
                 return emit(l, K"<--")
             end
@@ -879,7 +909,7 @@ function lex_minus(l::Lexer)
         if accept(l, '>')
             return emit(l, K"-->")
         else
-            return emit_error(l, K"ErrorInvalidOperator") # "--" is an invalid operator
+            return emit(l, K"ErrorInvalidOperator") # "--" is an invalid operator
         end
     elseif !l.dotop && accept(l, '>')
         return emit(l, K"->")
@@ -891,7 +921,7 @@ end
 
 function lex_star(l::Lexer)
     if accept(l, '*')
-        return emit_error(l, K"Error**") # "**" is an invalid operator use ^
+        return emit(l, K"Error**") # "**" is an invalid operator use ^
     elseif accept(l, '=')
         return emit(l, K"*=")
     end
@@ -952,15 +982,15 @@ function lex_digit(l::Lexer, kind)
             elseif kind === K"Float"
                 # If we enter the function with kind == K"Float" then a '.' has been parsed.
                 readchar(l)
-                return emit_error(l, K"ErrorInvalidNumericConstant")
+                return emit(l, K"ErrorInvalidNumericConstant")
             elseif is_dottable_operator_start_char(ppc)
                 readchar(l)
-                return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
+                return emit(l, K"ErrorAmbiguousNumericConstant") # `1.+`
             end
             readchar(l)
 
             kind = K"Float"
-            accept(l, '_') && return emit_error(l, K"ErrorInvalidNumericConstant") # `1._`
+            accept(l, '_') && return emit(l, K"ErrorInvalidNumericConstant") # `1._`
             had_fraction_digs = accept_number(l, isdigit)
             pc, ppc = dpeekchar(l)
             if (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
@@ -971,18 +1001,18 @@ function lex_digit(l::Lexer, kind)
                 pc,ppc = dpeekchar(l)
                 if pc === '.' && !is_dottable_operator_start_char(ppc)
                     readchar(l)
-                    return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
+                    return emit(l, K"ErrorInvalidNumericConstant") # `1.e1.`
                 end
             else
-                return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
+                return emit(l, K"ErrorInvalidNumericConstant") # `1.e`
             end
         elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
             readchar(l)
-            return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
+            return emit(l, K"ErrorInvalidNumericConstant") # `1.1.`
         elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
                 pc == '(' || pc == '[' || pc == '{' ||
                 pc == '@' || pc == '`' || pc == '"')
-            return emit_error(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
+            return emit(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
         end
     elseif (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
         kind = pc == 'f' ? K"Float32" : K"Float"
@@ -992,10 +1022,10 @@ function lex_digit(l::Lexer, kind)
             pc,ppc = dpeekchar(l)
             if pc === '.' && !is_dottable_operator_start_char(ppc)
                 accept(l, '.')
-                return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
+                return emit(l, K"ErrorInvalidNumericConstant") # `1e1.`
             end
         else
-            return emit_error(l, K"ErrorInvalidNumericConstant") # `1e+`
+            return emit(l, K"ErrorInvalidNumericConstant") # `1e+`
        end
     elseif position(l) - startpos(l) == 1 && l.chars[1] == '0'
         kind == K"Integer"
@@ -1015,10 +1045,10 @@ function lex_digit(l::Lexer, kind)
                     kind = K"Float"
                     accept(l, "+-−")
                     if !accept_number(l, isdigit) || !had_digits
-                        return emit_error(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
+                        return emit(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
                     end
                 elseif isfloat
-                    return emit_error(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
+                    return emit(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
                 end
                 is_bin_oct_hex_int = !isfloat
             elseif pc == 'b'
@@ -1038,7 +1068,7 @@ function lex_digit(l::Lexer, kind)
                 accept_batch(l, c->isdigit(c) || is_identifier_start_char(c))
                 # `0x` `0xg` `0x_` `0x-`
                 # `0b123` `0o78p` `0xenomorph` `0xaα`
-                return emit_error(l, K"ErrorInvalidNumericConstant")
+                return emit(l, K"ErrorInvalidNumericConstant")
             end
         end
     end
@@ -1132,7 +1162,7 @@ function lex_dot(l::Lexer)
     else
         if is_dottable_operator_start_char(peekchar(l))
             readchar(l)
-            return emit_error(l, K"ErrorInvalidOperator")
+            return emit(l, K"ErrorInvalidOperator")
         else
             return emit(l, K"..")
         end
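
With the lexer changes above, each string, command-string, and multi-line comment chunk is classified independently: `lex_string_chunk` and `lex_comment` downgrade the chunk's kind to `K"ErrorBidiFormatting"` when the bidi counters carry a residue, while `Char` literals are exempted by the `state.delim == '\''` test. At the token level this looks as follows (shape taken from the tokenizer tests at the end of this patch, using the `toks` helper defined there; trailing EndMarker omitted):

    toks("\"X \u202a X\"")  # ["\"" => K"\"", "X \u202a X" => K"ErrorBidiFormatting", "\"" => K"\""]
    toks("#= \u202a =#")    # ["#= \u202a =#" => K"ErrorBidiFormatting"]
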
diff --git a/test/diagnostics.jl b/test/diagnostics.jl
index 87e2abea..c2fd5c1f 100644
--- a/test/diagnostics.jl
+++ b/test/diagnostics.jl
@@ -19,6 +19,13 @@ end
         Diagnostic(2, 1+sizeof(string(c)), :error, "invisible character $(repr(c))")
     end
     @test diagnostic(":⥻") == Diagnostic(2, 4, :error, "unknown unicode character '⥻'")
+
+    @test diagnostic("\"X \u202a X\"") == Diagnostic(2, 8, :error, "unbalanced bidirectional unicode formatting \"X \\u202a X\"")
+    @test diagnostic("#= \u202a =#") == Diagnostic(1, 9, :error, "unbalanced bidirectional unicode formatting \"#= \\u202a =#\"")
+    @test diagnostic("\"X \u202a \$xx\u202c\"", allow_multiple=true) == [
+        Diagnostic(2, 7, :error, "unbalanced bidirectional unicode formatting \"X \\u202a \"")
+        Diagnostic(11, 13, :error, "unbalanced bidirectional unicode formatting \"\\u202c\"")
+    ]
 end
 
 @testset "parser errors" begin
diff --git a/test/parser.jl b/test/parser.jl
index 736201cf..88dff3ad 100644
--- a/test/parser.jl
+++ b/test/parser.jl
@@ -1004,3 +1004,24 @@ end
     @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a \u2212= b") == "(-= a b)"
     @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a .\u2212= b") == "(.-= a b)"
 end
+
+@testset "Unbalanced bidirectional unicode" begin
+    # https://trojansource.codes
+    @test_throws JuliaSyntax.ParseError parsestmt(GreenNode, """
+    function checkUserAccess(u::User)
+        if u.accessLevel != "user\u202e \u2066# users are not allowed\u2069\u2066"
+            return true
+        end
+        return false
+    end
+    """)
+
+    @test_throws JuliaSyntax.ParseError parsestmt(GreenNode, """
+    function checkUserAccess(u::User)
+        #=\u202e \u2066if (u.isAdmin)\u2069 \u2066 begin admins only =#
+            return true
+        #= end admin only \u202e \u2066end\u2069 \u2066=#
+        return false
+    end
+    """)
+end
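
These two test programs follow the classic trojan-source pattern: rendered in a bidi-aware editor, the override and isolate characters make the live `return true` appear to sit inside the string or comment, while the lexer sees it as code. Outside the test suite, the new token kind also makes such content easy to screen for (`bidi_errors` is a hypothetical helper sketched on the existing Tokenize API, not part of this patch):

    using JuliaSyntax: @K_str, kind
    using JuliaSyntax.Tokenize: tokenize, untokenize

    # Collect the source text of every token flagged as bidi-unbalanced.
    bidi_errors(src::AbstractString) =
        [untokenize(t, src) for t in tokenize(src) if kind(t) == K"ErrorBidiFormatting"]

    bidi_errors("\"X \u202a X\"")  # 1-element Vector: ["X \u202a X"]
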
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 52b97b05..de80ed1a 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -39,6 +39,14 @@ if VERSION < v"1.6"
     using JuliaSyntax: isnothing, only, peek
 end
 
+function toks(str)
+    ts = [JuliaSyntax.Tokenize.untokenize(t, str)=>kind(t)
+          for t in JuliaSyntax.Tokenize.tokenize(str)]
+    @test ts[end] == (""=>K"EndMarker")
+    pop!(ts)
+    ts
+end
+
 function remove_macro_linenums!(ex)
     if Meta.isexpr(ex, :macrocall)
         ex.args[2] = nothing
diff --git a/test/tokenize.jl b/test/tokenize.jl
index baaa08d0..4e5926e6 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -17,17 +17,12 @@ using JuliaSyntax.Tokenize:
     untokenize,
     RawToken
 
+using ..Main: toks
+
 tok(str, i = 1) = collect(tokenize(str))[i]
 
 strtok(str) = untokenize.(collect(tokenize(str)), str)
 
-function toks(str)
-    ts = [untokenize(t, str)=>kind(t) for t in tokenize(str)]
-    @test ts[end] == (""=>K"EndMarker")
-    pop!(ts)
-    ts
-end
-
 function onlytok(str)
     ts = collect(tokenize(str))
     (length(ts) == 2 && ts[2].kind == K"EndMarker") ||
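
Moving `toks` into test_utils.jl (now with fully qualified `JuliaSyntax.Tokenize` calls, since it no longer lives next to the tokenizer imports) lets other test files reuse it via `using ..Main: toks`. The helper pairs each token's source text with its kind and asserts, then strips, the zero-width trailing EndMarker, e.g.:

    toks("x+y")  # ["x" => K"Identifier", "+" => K"+", "y" => K"Identifier"]
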
@@ -993,6 +988,133 @@ end
     end
 end
 
+@testset "unbalanced bidirectional unicode" begin
+    open_embedding = ['\U202A', '\U202B', '\U202D', '\U202E']
+    close_embedding = '\U202C'
+    open_isolate = ['\U2066', '\U2067', '\U2068']
+    close_isolate = '\U2069'
+    close_all = '\n'
+
+    all_bidi_codes = [open_embedding; close_embedding; open_isolate; close_isolate]
+
+    bidi_pairs = [Iterators.product(open_embedding, [close_embedding, close_all])...,
+                  Iterators.product(open_isolate, [close_isolate, close_all])...]
+
+    @testset "delimiter $kd" for (kd, chunk_kind) in [
+        (K"\"", K"String"),
+        (K"\"\"\"", K"String"),
+        (K"`", K"CmdString"),
+        (K"```", K"CmdString")
+    ]
+        d = string(kd)
+        @testset "Single unbalanced codes" begin
+            for c in all_bidi_codes
+                @test toks("$d$c$d") ==
+                    [d=>kd, "$c"=>K"ErrorBidiFormatting", d=>kd]
+                @test toks("pfx$d$c$d") ==
+                    ["pfx"=>K"Identifier", d=>kd, "$c"=>K"ErrorBidiFormatting", d=>kd]
+            end
+        end
+        @testset "Balanced pairs" begin
+            for (openc, closec) in bidi_pairs
+                str = "$(openc)##$(closec)"
+                @test toks("$d$str$d") ==
+                    [d=>kd, str=>chunk_kind, d=>kd]
+                @test toks("pfx$d$str$d") ==
+                    ["pfx"=>K"Identifier", d=>kd, str=>chunk_kind, d=>kd]
+            end
+        end
+    end
+
+    @testset "multi line comments" begin
+        @testset "Single unbalanced codes" begin
+            for c in all_bidi_codes
+                comment = "#=$c=#"
+                @test toks(comment) == [comment=>K"ErrorBidiFormatting"]
+            end
+        end
+        @testset "Balanced pairs" begin
+            for (openc, closec) in bidi_pairs
+                str = "#=$(openc)zz$(closec)=#"
+                @test toks(str) == [str=>K"Comment"]
+            end
+        end
+    end
+
+    @testset "extended balanced/unbalanced bidi state" begin
+        @testset "delimiter $kd" for (kd, chunk_kind) in [
+            (K"\"", K"String"),
+            (K"\"\"\"", K"String"),
+            (K"`", K"CmdString"),
+            (K"```", K"CmdString")
+        ]
+            d = string(kd)
+            for balanced in [# Balanced pairs
+                             "\u202a\u202bzz\u202c\u202c"
+                             "\u2066\u2067zz\u2069\u2069"
+                             # Newline is complete bidi state reset
+                             "\u202a\u2067zz\n"
+                             "\u202a\u202azz\n"
+                             # \r\n and \n terminate a line
+                             "\u202azz\r\n"
+                            ]
+                @test toks("$d$balanced$d") == [
+                    d=>kd
+                    balanced=>chunk_kind
+                    d=>kd
+                ]
+            end
+            for unbalanced in ["\u202azz\u202c\u202c"
+                               "\u202a\u202bzz\u202c"
+                               # \r does not terminate a bidi line
+                               "\u202azz\r"
+                              ]
+                @test toks("$d$unbalanced$d") == [
+                    d=>kd
+                    unbalanced=>K"ErrorBidiFormatting"
+                    d=>kd
+                ]
+            end
+        end
+    end
+
+    # Interpolations reset bidi state
+    @test toks("\"\u202a\$zz\n\"") == [
+        "\""=>K"\""
+        "\u202a"=>K"ErrorBidiFormatting"
+        "\$"=>K"$"
+        "zz"=>K"Identifier"
+        "\n"=>K"String"
+        "\""=>K"\""
+    ]
+
+    @testset "newline escaping" begin
+        @test toks("\"a\u202a\\\n\"") == [
+            "\""=>K"\""
+            "a\u202a"=>K"String"
+            "\\\n"=>K"Whitespace"
+            "\""=>K"\""
+        ]
+        @test toks("\"a\u202a\\\r\n\"") == [
+            "\""=>K"\""
+            "a\u202a"=>K"String"
+            "\\\r\n"=>K"Whitespace"
+            "\""=>K"\""
+        ]
+        @test toks("\"a\u202a\\\r\"") == [
+            "\""=>K"\""
+            "a\u202a"=>K"ErrorBidiFormatting"
+            "\\\r"=>K"Whitespace"
+            "\""=>K"\""
+        ]
+    end
+
+    @testset "delimiter '" begin
+        for c in all_bidi_codes
+            @test toks("'$c'") == ["'"=>K"'", "$c"=>K"Char", "'"=>K"'"]
+        end
+    end
+end
+
 @testset "dotop miscellanea" begin
     @test strtok("a .-> b")  == ["a", " ", ".-", ">", " ", "b", ""]
     @test strtok(".>: b")    == [".>:", " ", "b", ""]
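
Note the deliberate carve-out exercised by the final `@testset "delimiter '"` above: bidi control characters are still accepted inside `Char` literals, because the `state.delim == '\''` branch in `lex_string_chunk` short-circuits ahead of the bidi check. A single quoted codepoint is presumably considered harmless here, since there is no multi-character run of source text for it to visually reorder:

    toks("'\u202a'")  # ["'" => K"'", "\u202a" => K"Char", "'" => K"'"]
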