diff --git a/diff/diff_test.go b/diff/diff_test.go index 3cd800f..dd5eb51 100644 --- a/diff/diff_test.go +++ b/diff/diff_test.go @@ -704,6 +704,69 @@ func TestParseMultiFileDiffHeaders(t *testing.T) { }, }, }, + { + filename: "complicated_filenames.diff", + wantDiffs: []*FileDiff{ + { + OrigName: "/dev/null", + NewName: "b/new empty file with spaces", + Extended: []string{ + "diff --git a/new empty file with spaces b/new empty file with spaces", + "new file mode 100644", + "index 0000000..e69de29", + }, + }, + { + OrigName: "/dev/null", + NewName: "b/new file with text", + Extended: []string{ + "diff --git a/new file with text b/new file with text", + "new file mode 100644", + "index 0000000..c3ed4be", + }, + }, + { + OrigName: "a/existing file with spaces", + NewName: "b/new file with spaces", + Extended: []string{ + "diff --git a/existing file with spaces b/new file with spaces", + "similarity index 100%", + "copy from existing file with spaces", + "copy to new file with spaces", + }, + }, + { + OrigName: "a/existing file with spaces", + NewName: "b/new, complicated\nfilenøme", + Extended: []string{ + `diff --git a/existing file with spaces "b/new, complicated\nfilen\303\270me"`, + "similarity index 100%", + "copy from existing file with spaces", + `copy to "new, complicated\nfilen\303\270me"`, + }, + }, + { + OrigName: "a/existing file with spaces", + NewName: `b/new "complicated" filename`, + Extended: []string{ + `diff --git a/existing file with spaces "b/new \"complicated\" filename"`, + "similarity index 100%", + "copy from existing file with spaces", + `copy to "new \"complicated\" filename"`, + }, + }, + { + OrigName: `a/existing "complicated" filename`, + NewName: "b/new, simpler filename", + Extended: []string{ + `diff --git "a/existing \"complicated\" filename" b/new, simpler filename`, + "similarity index 100%", + `copy from "existing \"complicated\" filename"`, + "copy to new, simpler filename", + }, + }, + }, + }, } for _, test := range tests { t.Run(test.filename, func(t *testing.T) { diff --git a/diff/parse.go b/diff/parse.go index ac89747..ae2ede4 100644 --- a/diff/parse.go +++ b/diff/parse.go @@ -372,6 +372,102 @@ func (r *FileDiffReader) ReadExtendedHeaders() ([]string, error) { } } +// readQuotedFilename extracts a quoted filename from the beginning of a string, +// returning the unquoted filename and any remaining text after the filename. +func readQuotedFilename(text string) (value string, remainder string, err error) { + if text == "" || text[0] != '"' { + return "", "", fmt.Errorf(`string must start with a '"': %s`, text) + } + + // The end quote is the first quote NOT preceeded by an uneven number of backslashes. + numberOfBackslashes := 0 + for i, c := range text { + if c == '"' && i > 0 && numberOfBackslashes%2 == 0 { + value, err = strconv.Unquote(text[:i+1]) + remainder = text[i+1:] + return + } else if c == '\\' { + numberOfBackslashes++ + } else { + numberOfBackslashes = 0 + } + } + return "", "", fmt.Errorf(`end of string found while searching for '"': %s`, text) +} + +// parseDiffGitArgs extracts the two filenames from a 'diff --git' line. +// Returns false on syntax error, true if syntax is valid. Even with a +// valid syntax, it may be impossible to extract filenames; if so, the +// function returns ("", "", true). +func parseDiffGitArgs(diffArgs string) (string, string, bool) { + length := len(diffArgs) + if length < 3 { + return "", "", false + } + + if diffArgs[0] != '"' && diffArgs[length-1] != '"' { + // Both filenames are unquoted. + firstSpace := strings.IndexByte(diffArgs, ' ') + if firstSpace <= 0 || firstSpace == length-1 { + return "", "", false + } + + secondSpace := strings.IndexByte(diffArgs[firstSpace+1:], ' ') + if secondSpace == -1 { + if diffArgs[firstSpace+1] == '"' { + // The second filename begins with '"', but doesn't end with one. + return "", "", false + } + return diffArgs[:firstSpace], diffArgs[firstSpace+1:], true + } + + // One or both filenames contain a space, but the names are + // unquoted. Here, the 'diff --git' syntax is ambiguous, and + // we have to obtain the filenames elsewhere (e.g. from the + // hunk headers or extended headers). HOWEVER, if the file + // is newly created and empty, there IS no other place to + // find the filename. In this case, the two filenames are + // identical (except for the leading 'a/' prefix), and we have + // to handle that case here. + first := diffArgs[:length/2] + second := diffArgs[length/2+1:] + if len(first) >= 3 && length%2 == 1 && first[1] == '/' && first[1:] == second[1:] { + return first, second, true + } + + // The syntax is (unfortunately) valid, but we could not extract + // the filenames. + return "", "", true + } + + if diffArgs[0] == '"' { + first, remainder, err := readQuotedFilename(diffArgs) + if err != nil || len(remainder) < 2 || remainder[0] != ' ' { + return "", "", false + } + if remainder[1] == '"' { + second, remainder, err := readQuotedFilename(remainder[1:]) + if remainder != "" || err != nil { + return "", "", false + } + return first, second, true + } + return first, remainder[1:], true + } + + // In this case, second argument MUST be quoted (or it's a syntax error) + i := strings.IndexByte(diffArgs, '"') + if i == -1 || i+2 >= length || diffArgs[i-1] != ' ' { + return "", "", false + } + + second, remainder, err := readQuotedFilename(diffArgs[i:]) + if remainder != "" || err != nil { + return "", "", false + } + return diffArgs[:i-1], second, true +} + // handleEmpty detects when FileDiff was an empty diff and will not have any hunks // that follow. It updates fd fields from the parsed extended headers. func handleEmpty(fd *FileDiff) (wasEmpty bool) { @@ -388,6 +484,10 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) { return lineHasPrefix(idx1, prefix1) && lineHasPrefix(idx2, prefix2) } + isCopy := (lineCount == 4 && linesHavePrefixes(2, "copy from ", 3, "copy to ")) || + (lineCount == 6 && linesHavePrefixes(2, "copy from ", 3, "copy to ") && lineHasPrefix(5, "Binary files ")) || + (lineCount == 6 && linesHavePrefixes(1, "old mode ", 2, "new mode ") && linesHavePrefixes(4, "copy from ", 5, "copy to ")) + isRename := (lineCount == 4 && linesHavePrefixes(2, "rename from ", 3, "rename to ")) || (lineCount == 6 && linesHavePrefixes(2, "rename from ", 3, "rename to ") && lineHasPrefix(5, "Binary files ")) || (lineCount == 6 && linesHavePrefixes(1, "old mode ", 2, "new mode ") && linesHavePrefixes(4, "rename from ", 5, "rename to ")) @@ -402,22 +502,12 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) { isBinaryPatch := lineCount == 3 && lineHasPrefix(2, "Binary files ") || lineCount > 3 && lineHasPrefix(2, "GIT binary patch") - if !isModeChange && !isRename && !isBinaryPatch && !isNewFile && !isDeletedFile { + if !isModeChange && !isCopy && !isRename && !isBinaryPatch && !isNewFile && !isDeletedFile { return false } - names := strings.SplitN(fd.Extended[0][len("diff --git "):], " ", 2) - - var err error - fd.OrigName, err = strconv.Unquote(names[0]) - if err != nil { - fd.OrigName = names[0] - } - fd.NewName, err = strconv.Unquote(names[1]) - if err != nil { - fd.NewName = names[1] - } - + var success bool + fd.OrigName, fd.NewName, success = parseDiffGitArgs(fd.Extended[0][len("diff --git "):]) if isNewFile { fd.OrigName = "/dev/null" } @@ -426,7 +516,38 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) { fd.NewName = "/dev/null" } - return true + // For ambiguous 'diff --git' lines, try to reconstruct filenames using extended headers. + if success && (isCopy || isRename) && fd.OrigName == "" && fd.NewName == "" { + diffArgs := fd.Extended[0][len("diff --git "):] + + tryReconstruct := func(header string, prefix string, whichFile int, result *string) { + if !strings.HasPrefix(header, prefix) { + return + } + rawFilename := header[len(prefix):] + + // extract the filename prefix (e.g. "a/") from the 'diff --git' line. + var prefixLetterIndex int + if whichFile == 1 { + prefixLetterIndex = 0 + } else if whichFile == 2 { + prefixLetterIndex = len(diffArgs) - len(rawFilename) - 2 + } + if prefixLetterIndex < 0 || diffArgs[prefixLetterIndex+1] != '/' { + return + } + + *result = diffArgs[prefixLetterIndex:prefixLetterIndex+2] + rawFilename + } + + for _, header := range fd.Extended { + tryReconstruct(header, "copy from ", 1, &fd.OrigName) + tryReconstruct(header, "copy to ", 2, &fd.NewName) + tryReconstruct(header, "rename from ", 1, &fd.OrigName) + tryReconstruct(header, "rename to ", 2, &fd.NewName) + } + } + return success } var ( diff --git a/diff/parse_test.go b/diff/parse_test.go new file mode 100644 index 0000000..ad00c2f --- /dev/null +++ b/diff/parse_test.go @@ -0,0 +1,93 @@ +package diff + +import ( + "testing" +) + +func TestReadQuotedFilename_Success(t *testing.T) { + tests := []struct { + input, value, remainder string + }{ + {input: `""`, value: "", remainder: ""}, + {input: `"aaa"`, value: "aaa", remainder: ""}, + {input: `"aaa" bbb`, value: "aaa", remainder: " bbb"}, + {input: `"aaa" "bbb" ccc`, value: "aaa", remainder: ` "bbb" ccc`}, + {input: `"\""`, value: "\"", remainder: ""}, + {input: `"uh \"oh\""`, value: "uh \"oh\"", remainder: ""}, + {input: `"uh \\"oh\\""`, value: "uh \\", remainder: `oh\\""`}, + {input: `"uh \\\"oh\\\""`, value: "uh \\\"oh\\\"", remainder: ""}, + } + for _, tc := range tests { + value, remainder, err := readQuotedFilename(tc.input) + if err != nil { + t.Errorf("readQuotedFilename(`%s`): expected success, got '%s'", tc.input, err) + } else if value != tc.value || remainder != tc.remainder { + t.Errorf("readQuotedFilename(`%s`): expected `%s` and `%s`, got `%s` and `%s`", tc.input, tc.value, tc.remainder, value, remainder) + } + } +} + +func TestReadQuotedFilename_Error(t *testing.T) { + tests := []string{ + // Doesn't start with a quote + ``, + `foo`, + ` "foo"`, + // Missing end quote + `"`, + `"\"`, + // "\x" is not a valid Go string literal escape + `"\xxx"`, + } + for _, input := range tests { + _, _, err := readQuotedFilename(input) + if err == nil { + t.Errorf("readQuotedFilename(`%s`): expected error", input) + } + } +} + +func TestParseDiffGitArgs_Success(t *testing.T) { + tests := []struct { + input, first, second string + }{ + {input: `aaa bbb`, first: "aaa", second: "bbb"}, + {input: `"aaa" bbb`, first: "aaa", second: "bbb"}, + {input: `aaa "bbb"`, first: "aaa", second: "bbb"}, + {input: `"aaa" "bbb"`, first: "aaa", second: "bbb"}, + {input: `1/a 2/z`, first: "1/a", second: "2/z"}, + {input: `1/hello world 2/hello world`, first: "1/hello world", second: "2/hello world"}, + {input: `"new\nline" and spaces`, first: "new\nline", second: "and spaces"}, + {input: `a/existing file with spaces "b/new, complicated\nfilen\303\270me"`, first: "a/existing file with spaces", second: "b/new, complicated\nfilen\303\270me"}, + } + for _, tc := range tests { + first, second, success := parseDiffGitArgs(tc.input) + if !success { + t.Errorf("`diff --git %s`: expected success", tc.input) + } else if first != tc.first || second != tc.second { + t.Errorf("`diff --git %s`: expected `%s` and `%s`, got `%s` and `%s`", tc.input, tc.first, tc.second, first, second) + } + } +} + +func TestParseDiffGitArgs_Unsuccessful(t *testing.T) { + tests := []string{ + ``, + `hello_world.txt`, + `word `, + ` word`, + `"a/bad_quoting b/bad_quoting`, + `a/bad_quoting "b/bad_quoting`, + `a/bad_quoting b/bad_quoting"`, + `"a/bad_quoting b/bad_quoting"`, + `"a/bad""b/bad"`, + `"a/bad" "b/bad" "c/bad"`, + `a/bad "b/bad" "c/bad"`, + } + for _, input := range tests { + first, second, success := parseDiffGitArgs(input) + if success { + t.Errorf("`diff --git %s`: expected unsuccessful; got `%s` and `%s`", input, first, second) + } + } +} diff --git a/diff/testdata/complicated_filenames.diff b/diff/testdata/complicated_filenames.diff new file mode 100644 index 0000000..29521bd --- /dev/null +++ b/diff/testdata/complicated_filenames.diff @@ -0,0 +1,26 @@ +diff --git a/new empty file with spaces b/new empty file with spaces +new file mode 100644 +index 0000000..e69de29 +diff --git a/new file with text b/new file with text +new file mode 100644 +index 0000000..c3ed4be +--- /dev/null ++++ b/new file with text +@@ -0,0 +1 @@ ++new file with text +diff --git a/existing file with spaces b/new file with spaces +similarity index 100% +copy from existing file with spaces +copy to new file with spaces +diff --git a/existing file with spaces "b/new, complicated\nfilen\303\270me" +similarity index 100% +copy from existing file with spaces +copy to "new, complicated\nfilen\303\270me" +diff --git a/existing file with spaces "b/new \"complicated\" filename" +similarity index 100% +copy from existing file with spaces +copy to "new \"complicated\" filename" +diff --git "a/existing \"complicated\" filename" b/new, simpler filename +similarity index 100% +copy from "existing \"complicated\" filename" +copy to new, simpler filename