Skip to content

Commit eff651f

Browse files
committed
fix: use indivisible line hashes
Revert implementation of diffLines to use runes to fix sergi#140. In order to not regress sergi#89, skip invalid utf8 runes when munging lines.
1 parent facec63 commit eff651f

File tree

3 files changed

+165
-116
lines changed

3 files changed

+165
-116
lines changed

diffmatchpatch/diff.go

+54-31
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ import (
2222
"unicode/utf8"
2323
)
2424

25+
// LineMap is a mapping from a line hash to its text.
26+
type LineMap map[rune]string
27+
2528
// Operation defines the operation of a diff item.
2629
type Operation int8
2730

@@ -34,8 +37,6 @@ const (
3437
DiffInsert Operation = 1
3538
// DiffEqual item represents an equal diff.
3639
DiffEqual Operation = 0
37-
//IndexSeparator is used to seperate the array indexes in an index string
38-
IndexSeparator = ","
3940
)
4041

4142
// Diff represents one diff operation
@@ -83,12 +84,16 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff {
8384

8485
// DiffMain finds the differences between two texts.
8586
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
87+
//
88+
// Note: if checklines is true, the limitation noted in DiffLinesToChars applies
8689
func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff {
8790
return dmp.DiffMainRunes([]rune(text1), []rune(text2), checklines)
8891
}
8992

9093
// DiffMainRunes finds the differences between two rune sequences.
9194
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
95+
//
96+
// Note: if checklines is true, the limitation noted in DiffLinesToRunes applies
9297
func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff {
9398
var deadline time.Time
9499
if dmp.DiffTimeout > 0 {
@@ -391,29 +396,34 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int,
391396

392397
// DiffLinesToChars splits two texts into a list of strings, and reduces the texts to a string of hashes where each Unicode character represents one line.
393398
// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes.
394-
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) {
395-
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
396-
return chars1, chars2, lineArray
399+
//
400+
// Note: since we hash lines to runes, there is an upper limit to the number of
401+
// unique lines this algorithm can handle. That limit is 1,112,063 unique
402+
// lines.
403+
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, LineMap) {
404+
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
405+
return chars1, chars2, lineMap
397406
}
398407

399408
// DiffLinesToRunes splits two texts into a list of runes.
400-
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
401-
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
402-
return []rune(chars1), []rune(chars2), lineArray
409+
//
410+
// Note: since we hash lines to runes, there is an upper limit to the number of
411+
// unique lines this algorithm can handle. That limit is 1,112,063 unique
412+
// lines.
413+
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, LineMap) {
414+
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
415+
return []rune(chars1), []rune(chars2), lineMap
403416
}
404417

405418
// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
406-
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
419+
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineMap LineMap) []Diff {
407420
hydrated := make([]Diff, 0, len(diffs))
408421
for _, aDiff := range diffs {
409-
chars := strings.Split(aDiff.Text, IndexSeparator)
410-
text := make([]string, len(chars))
422+
runes := []rune(aDiff.Text)
423+
text := make([]string, len(runes))
411424

412-
for i, r := range chars {
413-
i1, err := strconv.Atoi(r)
414-
if err == nil {
415-
text[i] = lineArray[i1]
416-
}
425+
for i, r := range runes {
426+
text[i] = lineMap[r]
417427
}
418428

419429
aDiff.Text = strings.Join(text, "")
@@ -1309,24 +1319,29 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di
13091319
}
13101320

13111321
// diffLinesToStrings splits two texts into a list of strings. Each string represents one line.
1312-
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) {
1313-
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
1314-
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
1322+
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, LineMap) {
1323+
lineMap := LineMap{} // e.g. lineMap[4] == 'Hello\n'
13151324

1316-
lineHash := make(map[string]int)
1317-
//Each string has the index of lineArray which it points to
1318-
strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash)
1319-
strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash)
1325+
lineHash := make(map[string]rune)
1326+
//Each string has the index of lineMap which it points to
1327+
runes1 := dmp.diffLinesToRunesMunge(text1, lineMap, lineHash)
1328+
runes2 := dmp.diffLinesToRunesMunge(text2, lineMap, lineHash)
13201329

1321-
return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray
1330+
return string(runes1), string(runes2), lineMap
13221331
}
13231332

1324-
// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string.
1325-
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 {
1333+
// Code points in the surrogate range are not valid for UTF-8.
1334+
const (
1335+
surrogateMin = 0xD800
1336+
surrogateMax = 0xDFFF
1337+
)
1338+
1339+
// diffLinesToRunesMunge splits a text into lines, records each previously unseen line in lineMap and lineHash, and reduces the text to a []rune with one rune per line.
1340+
func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineMap LineMap, lineHash map[string]rune) []rune {
13261341
// Walk the text, pulling out a substring for each line. text.split('\n') would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
13271342
lineStart := 0
13281343
lineEnd := -1
1329-
strs := []uint32{}
1344+
var strs []rune
13301345

13311346
for lineEnd < len(text)-1 {
13321347
lineEnd = indexOf(text, "\n", lineStart)
@@ -1340,11 +1355,19 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str
13401355
lineValue, ok := lineHash[line]
13411356

13421357
if ok {
1343-
strs = append(strs, uint32(lineValue))
1358+
strs = append(strs, lineValue)
13441359
} else {
1345-
*lineArray = append(*lineArray, line)
1346-
lineHash[line] = len(*lineArray) - 1
1347-
strs = append(strs, uint32(len(*lineArray)-1))
1360+
nextRune := rune(len(lineMap) + 1)
1361+
if nextRune >= surrogateMin {
1362+
// Skip invalid utf8 runes, if needed.
1363+
nextRune += surrogateMax - surrogateMin + 1
1364+
}
1365+
if nextRune > utf8.MaxRune {
1366+
panic("too many unique lines to use rune hashing")
1367+
}
1368+
lineMap[nextRune] = line
1369+
lineHash[line] = nextRune
1370+
strs = append(strs, nextRune)
13481371
}
13491372
}
13501373

diffmatchpatch/diff_test.go

+111-67
Original file line numberDiff line numberDiff line change
@@ -308,18 +308,16 @@ func TestDiffLinesToChars(t *testing.T) {
308308

309309
ExpectedChars1 string
310310
ExpectedChars2 string
311-
ExpectedLines []string
311+
ExpectedLines LineMap
312312
}
313313

314314
dmp := New()
315315

316316
for i, tc := range []TestCase{
317-
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
318-
{"a", "b", "1", "2", []string{"", "a", "b"}},
317+
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", map[rune]string{1: "alpha\r\n", 2: "beta\r\n", 3: "\r\n"}},
318+
{"a", "b", "\u0001", "\u0002", map[rune]string{1: "a", 2: "b"}},
319319
// Omit final newline.
320-
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
321-
// Same lines in Text1 and Text2
322-
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
320+
{"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", map[rune]string{1: "alpha\n", 2: "beta\n", 3: "alpha"}},
323321
} {
324322
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
325323
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
@@ -329,28 +327,28 @@ func TestDiffLinesToChars(t *testing.T) {
329327

330328
// More than 256 to reveal any 8-bit limitations.
331329
n := 300
332-
lineList := []string{
333-
"", // Account for the initial empty element of the lines array.
334-
}
335-
var charList []string
330+
var lines []string
331+
lineMap := LineMap{}
332+
var charList []rune
336333
for x := 1; x < n+1; x++ {
337-
lineList = append(lineList, strconv.Itoa(x)+"\n")
338-
charList = append(charList, strconv.Itoa(x))
334+
line := strconv.Itoa(x) + "\n"
335+
lines = append(lines, line)
336+
lineMap[rune(x)] = line
337+
charList = append(charList, rune(x))
339338
}
340-
lines := strings.Join(lineList, "")
341-
chars := strings.Join(charList[:], ",")
342-
assert.Equal(t, n, len(strings.Split(chars, ",")))
339+
chars := string(charList)
340+
assert.Equal(t, n, utf8.RuneCountInString(chars))
343341

344-
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
342+
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(strings.Join(lines, ""), "")
345343
assert.Equal(t, chars, actualChars1)
346344
assert.Equal(t, "", actualChars2)
347-
assert.Equal(t, lineList, actualLines)
345+
assert.Equal(t, lineMap, actualLines)
348346
}
349347

350348
func TestDiffCharsToLines(t *testing.T) {
351349
type TestCase struct {
352350
Diffs []Diff
353-
Lines []string
351+
Lines map[rune]string
354352

355353
Expected []Diff
356354
}
@@ -360,10 +358,10 @@ func TestDiffCharsToLines(t *testing.T) {
360358
for i, tc := range []TestCase{
361359
{
362360
Diffs: []Diff{
363-
{DiffEqual, "1,2,1"},
364-
{DiffInsert, "2,1,2"},
361+
{DiffEqual, "\u0001\u0002\u0001"},
362+
{DiffInsert, "\u0002\u0001\u0002"},
365363
},
366-
Lines: []string{"", "alpha\n", "beta\n"},
364+
Lines: map[rune]string{1: "alpha\n", 2: "beta\n"},
367365

368366
Expected: []Diff{
369367
{DiffEqual, "alpha\nbeta\nalpha\n"},
@@ -377,19 +375,19 @@ func TestDiffCharsToLines(t *testing.T) {
377375

378376
// More than 256 to reveal any 8-bit limitations.
379377
n := 300
380-
lineList := []string{
381-
"", // Account for the initial empty element of the lines array.
382-
}
383-
charList := []string{}
378+
var lines []string
379+
lineMap := LineMap{}
380+
charList := []rune{}
384381
for x := 1; x <= n; x++ {
385-
lineList = append(lineList, strconv.Itoa(x)+"\n")
386-
charList = append(charList, strconv.Itoa(x))
382+
line := strconv.Itoa(x) + "\n"
383+
lines = append(lines, line)
384+
lineMap[rune(x)] = line
385+
charList = append(charList, rune(x))
387386
}
388387
assert.Equal(t, n, len(charList))
389-
chars := strings.Join(charList[:], ",")
390388

391-
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
392-
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
389+
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineMap)
390+
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lines, "")}}, actual)
393391
}
394392

395393
func TestDiffCleanupMerge(t *testing.T) {
@@ -826,43 +824,6 @@ func TestDiffCleanupSemantic(t *testing.T) {
826824
{DiffDelete, " deal"},
827825
},
828826
},
829-
{
830-
"Taken from python / CPP library",
831-
[]Diff{
832-
{DiffInsert, "星球大戰:新的希望 "},
833-
{DiffEqual, "star wars: "},
834-
{DiffDelete, "episodio iv - un"},
835-
{DiffEqual, "a n"},
836-
{DiffDelete, "u"},
837-
{DiffEqual, "e"},
838-
{DiffDelete, "va"},
839-
{DiffInsert, "w"},
840-
{DiffEqual, " "},
841-
{DiffDelete, "es"},
842-
{DiffInsert, "ho"},
843-
{DiffEqual, "pe"},
844-
{DiffDelete, "ranza"},
845-
},
846-
[]Diff{
847-
{DiffInsert, "星球大戰:新的希望 "},
848-
{DiffEqual, "star wars: "},
849-
{DiffDelete, "episodio iv - una nueva esperanza"},
850-
{DiffInsert, "a new hope"},
851-
},
852-
},
853-
{
854-
"panic",
855-
[]Diff{
856-
{DiffInsert, "킬러 인 "},
857-
{DiffEqual, "리커버리"},
858-
{DiffDelete, " 보이즈"},
859-
},
860-
[]Diff{
861-
{DiffInsert, "킬러 인 "},
862-
{DiffEqual, "리커버리"},
863-
{DiffDelete, " 보이즈"},
864-
},
865-
},
866827
} {
867828
actual := dmp.DiffCleanupSemantic(tc.Diffs)
868829
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %s", i, tc.Name))
@@ -1531,3 +1492,86 @@ func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) {
15311492
diffs = dmp.DiffCharsToLines(diffs, linearray)
15321493
}
15331494
}
1495+
1496+
func TestLineDiff(t *testing.T) {
1497+
t.Run("VeryLarge", func(t *testing.T) {
1498+
var beforeBuf, afterBuf bytes.Buffer
1499+
1500+
for i := 0; i <= surrogateMax+1; i++ {
1501+
beforeBuf.WriteString(fmt.Sprintf("%d\n", i))
1502+
afterBuf.WriteString(fmt.Sprintf("%d\n", i/2))
1503+
}
1504+
1505+
before, after := beforeBuf.String(), afterBuf.String()
1506+
1507+
diff := New().DiffMain(before, after, true)
1508+
checkDiffText(t, before, after, diff)
1509+
})
1510+
1511+
t.Run("Chars", func(t *testing.T) {
1512+
before := `1
1513+
2
1514+
3
1515+
4
1516+
5
1517+
6
1518+
7
1519+
8
1520+
9
1521+
`
1522+
after := `10
1523+
`
1524+
1525+
dmp := New()
1526+
txt1, txt2, lines := dmp.DiffLinesToChars(string(before), string(after))
1527+
diff := dmp.DiffMain(txt1, txt2, false)
1528+
diff = dmp.DiffCharsToLines(diff, lines)
1529+
1530+
checkDiffText(t, before, after, diff)
1531+
})
1532+
1533+
t.Run("Runes", func(t *testing.T) {
1534+
before := `1
1535+
2
1536+
3
1537+
4
1538+
5
1539+
6
1540+
7
1541+
8
1542+
9
1543+
`
1544+
after := `10
1545+
`
1546+
1547+
dmp := New()
1548+
txt1, txt2, lines := dmp.DiffLinesToRunes(string(before), string(after))
1549+
diff := dmp.DiffMainRunes(txt1, txt2, false)
1550+
diff = dmp.DiffCharsToLines(diff, lines)
1551+
1552+
checkDiffText(t, before, after, diff)
1553+
})
1554+
}
1555+
1556+
func checkDiffText(t *testing.T, before, after string, diff []Diff) {
1557+
t.Helper()
1558+
var foundBefore, foundAfter string
1559+
for _, d := range diff {
1560+
switch d.Type {
1561+
case DiffEqual:
1562+
foundBefore += d.Text
1563+
foundAfter += d.Text
1564+
case DiffDelete:
1565+
foundBefore += d.Text
1566+
case DiffInsert:
1567+
foundAfter += d.Text
1568+
}
1569+
}
1570+
1571+
if foundBefore != before {
1572+
t.Errorf("Expected before %q; found %q", before, foundBefore)
1573+
}
1574+
if foundAfter != after {
1575+
t.Errorf("Expected after %q; found %q", after, foundAfter)
1576+
}
1577+
}

0 commit comments

Comments
 (0)