Skip to content

Commit 4019a12

Browse files
committed
fix: use indivisible line hashes
Revert implementation of diffLines to use runes to fix sergi#140. In order to not regress sergi#89, skip invalid utf8 runes when munging lines.
1 parent facec63 commit 4019a12

File tree

3 files changed

+165
-79
lines changed

3 files changed

+165
-79
lines changed

diffmatchpatch/diff.go

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ import (
2222
"unicode/utf8"
2323
)
2424

25+
// LineMap is a mapping from a line hash to its text.
26+
type LineMap map[rune]string
27+
2528
// Operation defines the operation of a diff item.
2629
type Operation int8
2730

@@ -34,8 +37,6 @@ const (
3437
DiffInsert Operation = 1
3538
// DiffEqual item represents an equal diff.
3639
DiffEqual Operation = 0
37-
//IndexSeparator is used to separate the array indexes in an index string
38-
IndexSeparator = ","
3940
)
4041

4142
// Diff represents one diff operation
@@ -83,12 +84,16 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff {
8384

8485
// DiffMain finds the differences between two texts.
8586
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
87+
//
88+
// Note: if checklines is true, the limitation noted in DiffLinesToChars applies
8689
func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff {
8790
return dmp.DiffMainRunes([]rune(text1), []rune(text2), checklines)
8891
}
8992

9093
// DiffMainRunes finds the differences between two rune sequences.
9194
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
95+
//
96+
// Note: if checklines is true, the limitation noted in DiffLinesToRunes applies
9297
func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff {
9398
var deadline time.Time
9499
if dmp.DiffTimeout > 0 {
@@ -391,29 +396,34 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int,
391396

392397
// DiffLinesToChars splits two texts into a list of strings, and reduces the texts to a string of hashes where each Unicode character represents one line.
393398
// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes.
394-
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) {
395-
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
396-
return chars1, chars2, lineArray
399+
//
400+
// Note: since we hash lines to runes, there is an upper limit to the number of
401+
// unique lines this algorithm can handle. That limit is 1,112,063 unique
402+
// lines.
403+
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, LineMap) {
404+
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
405+
return chars1, chars2, lineMap
397406
}
398407

399408
// DiffLinesToRunes splits two texts into a list of runes.
400-
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
401-
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
402-
return []rune(chars1), []rune(chars2), lineArray
409+
//
410+
// Note: since we hash lines to runes, there is an upper limit to the number of
411+
// unique lines this algorithm can handle. That limit is 1,112,063 unique
412+
// lines.
413+
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, LineMap) {
414+
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
415+
return []rune(chars1), []rune(chars2), lineMap
403416
}
404417

405418
// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
406-
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
419+
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineMap LineMap) []Diff {
407420
hydrated := make([]Diff, 0, len(diffs))
408421
for _, aDiff := range diffs {
409-
chars := strings.Split(aDiff.Text, IndexSeparator)
410-
text := make([]string, len(chars))
422+
runes := []rune(aDiff.Text)
423+
text := make([]string, len(runes))
411424

412-
for i, r := range chars {
413-
i1, err := strconv.Atoi(r)
414-
if err == nil {
415-
text[i] = lineArray[i1]
416-
}
425+
for i, r := range runes {
426+
text[i] = lineMap[r]
417427
}
418428

419429
aDiff.Text = strings.Join(text, "")
@@ -1309,24 +1319,29 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di
13091319
}
13101320

13111321
// diffLinesToStrings splits two texts into a list of strings. Each string represents one line.
1312-
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) {
1313-
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
1314-
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
1322+
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, LineMap) {
1323+
lineMap := LineMap{} // e.g. lineMap[4] == 'Hello\n'
13151324

1316-
lineHash := make(map[string]int)
1317-
//Each string has the index of lineArray which it points to
1318-
strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash)
1319-
strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash)
1325+
lineHash := make(map[string]rune)
1326+
//Each string has the index of lineMap which it points to
1327+
runes1 := dmp.diffLinesToRunesMunge(text1, lineMap, lineHash)
1328+
runes2 := dmp.diffLinesToRunesMunge(text2, lineMap, lineHash)
13201329

1321-
return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray
1330+
return string(runes1), string(runes2), lineMap
13221331
}
13231332

1324-
// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string.
1325-
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 {
1333+
// Code points in the surrogate range are not valid for UTF-8.
1334+
const (
1335+
surrogateMin = 0xD800
1336+
surrogateMax = 0xDFFF
1337+
)
1338+
1339+
// diffLinesToRunesMunge splits a text into lines, records each previously unseen line in lineMap and lineHash under a new rune, and returns the sequence of rune hashes, one per line.
1340+
func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineMap LineMap, lineHash map[string]rune) []rune {
13261341
// Walk the text, pulling out a substring for each line. text.split('\n') would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
13271342
lineStart := 0
13281343
lineEnd := -1
1329-
strs := []uint32{}
1344+
var strs []rune
13301345

13311346
for lineEnd < len(text)-1 {
13321347
lineEnd = indexOf(text, "\n", lineStart)
@@ -1340,11 +1355,19 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str
13401355
lineValue, ok := lineHash[line]
13411356

13421357
if ok {
1343-
strs = append(strs, uint32(lineValue))
1358+
strs = append(strs, lineValue)
13441359
} else {
1345-
*lineArray = append(*lineArray, line)
1346-
lineHash[line] = len(*lineArray) - 1
1347-
strs = append(strs, uint32(len(*lineArray)-1))
1360+
nextRune := rune(len(lineMap) + 1)
1361+
if nextRune >= surrogateMin {
1362+
// Skip invalid utf8 runes, if needed.
1363+
nextRune += surrogateMax - surrogateMin + 1
1364+
}
1365+
if nextRune > utf8.MaxRune {
1366+
panic("too many unique lines to use rune hashing")
1367+
}
1368+
lineMap[nextRune] = line
1369+
lineHash[line] = nextRune
1370+
strs = append(strs, nextRune)
13481371
}
13491372
}
13501373

diffmatchpatch/diff_test.go

Lines changed: 111 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -308,18 +308,16 @@ func TestDiffLinesToChars(t *testing.T) {
308308

309309
ExpectedChars1 string
310310
ExpectedChars2 string
311-
ExpectedLines []string
311+
ExpectedLines LineMap
312312
}
313313

314314
dmp := New()
315315

316316
for i, tc := range []TestCase{
317-
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
318-
{"a", "b", "1", "2", []string{"", "a", "b"}},
317+
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", map[rune]string{1: "alpha\r\n", 2: "beta\r\n", 3: "\r\n"}},
318+
{"a", "b", "\u0001", "\u0002", map[rune]string{1: "a", 2: "b"}},
319319
// Omit final newline.
320-
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
321-
// Same lines in Text1 and Text2
322-
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
320+
{"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", map[rune]string{1: "alpha\n", 2: "beta\n", 3: "alpha"}},
323321
} {
324322
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
325323
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
@@ -329,28 +327,28 @@ func TestDiffLinesToChars(t *testing.T) {
329327

330328
// More than 256 to reveal any 8-bit limitations.
331329
n := 300
332-
lineList := []string{
333-
"", // Account for the initial empty element of the lines array.
334-
}
335-
var charList []string
330+
var lines []string
331+
lineMap := LineMap{}
332+
var charList []rune
336333
for x := 1; x < n+1; x++ {
337-
lineList = append(lineList, strconv.Itoa(x)+"\n")
338-
charList = append(charList, strconv.Itoa(x))
334+
line := strconv.Itoa(x) + "\n"
335+
lines = append(lines, line)
336+
lineMap[rune(x)] = line
337+
charList = append(charList, rune(x))
339338
}
340-
lines := strings.Join(lineList, "")
341-
chars := strings.Join(charList[:], ",")
342-
assert.Equal(t, n, len(strings.Split(chars, ",")))
339+
chars := string(charList)
340+
assert.Equal(t, n, utf8.RuneCountInString(chars))
343341

344-
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
342+
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(strings.Join(lines, ""), "")
345343
assert.Equal(t, chars, actualChars1)
346344
assert.Equal(t, "", actualChars2)
347-
assert.Equal(t, lineList, actualLines)
345+
assert.Equal(t, lineMap, actualLines)
348346
}
349347

350348
func TestDiffCharsToLines(t *testing.T) {
351349
type TestCase struct {
352350
Diffs []Diff
353-
Lines []string
351+
Lines map[rune]string
354352

355353
Expected []Diff
356354
}
@@ -360,10 +358,10 @@ func TestDiffCharsToLines(t *testing.T) {
360358
for i, tc := range []TestCase{
361359
{
362360
Diffs: []Diff{
363-
{DiffEqual, "1,2,1"},
364-
{DiffInsert, "2,1,2"},
361+
{DiffEqual, "\u0001\u0002\u0001"},
362+
{DiffInsert, "\u0002\u0001\u0002"},
365363
},
366-
Lines: []string{"", "alpha\n", "beta\n"},
364+
Lines: map[rune]string{1: "alpha\n", 2: "beta\n"},
367365

368366
Expected: []Diff{
369367
{DiffEqual, "alpha\nbeta\nalpha\n"},
@@ -377,19 +375,19 @@ func TestDiffCharsToLines(t *testing.T) {
377375

378376
// More than 256 to reveal any 8-bit limitations.
379377
n := 300
380-
lineList := []string{
381-
"", // Account for the initial empty element of the lines array.
382-
}
383-
charList := []string{}
378+
var lines []string
379+
lineMap := LineMap{}
380+
charList := []rune{}
384381
for x := 1; x <= n; x++ {
385-
lineList = append(lineList, strconv.Itoa(x)+"\n")
386-
charList = append(charList, strconv.Itoa(x))
382+
line := strconv.Itoa(x) + "\n"
383+
lines = append(lines, line)
384+
lineMap[rune(x)] = line
385+
charList = append(charList, rune(x))
387386
}
388387
assert.Equal(t, n, len(charList))
389-
chars := strings.Join(charList[:], ",")
390388

391-
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
392-
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
389+
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineMap)
390+
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lines, "")}}, actual)
393391
}
394392

395393
func TestDiffCleanupMerge(t *testing.T) {
@@ -1531,3 +1529,86 @@ func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) {
15311529
diffs = dmp.DiffCharsToLines(diffs, linearray)
15321530
}
15331531
}
1532+
1533+
func TestLineDiff(t *testing.T) {
1534+
t.Run("VeryLarge", func(t *testing.T) {
1535+
var beforeBuf, afterBuf bytes.Buffer
1536+
1537+
for i := 0; i <= surrogateMax+1; i++ {
1538+
beforeBuf.WriteString(fmt.Sprintf("%d\n", i))
1539+
afterBuf.WriteString(fmt.Sprintf("%d\n", i/2))
1540+
}
1541+
1542+
before, after := beforeBuf.String(), afterBuf.String()
1543+
1544+
diff := New().DiffMain(before, after, true)
1545+
checkDiffText(t, before, after, diff)
1546+
})
1547+
1548+
t.Run("Chars", func(t *testing.T) {
1549+
before := `1
1550+
2
1551+
3
1552+
4
1553+
5
1554+
6
1555+
7
1556+
8
1557+
9
1558+
`
1559+
after := `10
1560+
`
1561+
1562+
dmp := New()
1563+
txt1, txt2, lines := dmp.DiffLinesToChars(string(before), string(after))
1564+
diff := dmp.DiffMain(txt1, txt2, false)
1565+
diff = dmp.DiffCharsToLines(diff, lines)
1566+
1567+
checkDiffText(t, before, after, diff)
1568+
})
1569+
1570+
t.Run("Runes", func(t *testing.T) {
1571+
before := `1
1572+
2
1573+
3
1574+
4
1575+
5
1576+
6
1577+
7
1578+
8
1579+
9
1580+
`
1581+
after := `10
1582+
`
1583+
1584+
dmp := New()
1585+
txt1, txt2, lines := dmp.DiffLinesToRunes(string(before), string(after))
1586+
diff := dmp.DiffMainRunes(txt1, txt2, false)
1587+
diff = dmp.DiffCharsToLines(diff, lines)
1588+
1589+
checkDiffText(t, before, after, diff)
1590+
})
1591+
}
1592+
1593+
func checkDiffText(t *testing.T, before, after string, diff []Diff) {
1594+
t.Helper()
1595+
var foundBefore, foundAfter string
1596+
for _, d := range diff {
1597+
switch d.Type {
1598+
case DiffEqual:
1599+
foundBefore += d.Text
1600+
foundAfter += d.Text
1601+
case DiffDelete:
1602+
foundBefore += d.Text
1603+
case DiffInsert:
1604+
foundAfter += d.Text
1605+
}
1606+
}
1607+
1608+
if foundBefore != before {
1609+
t.Errorf("Expected before %q; found %q", before, foundBefore)
1610+
}
1611+
if foundAfter != after {
1612+
t.Errorf("Expected after %q; found %q", after, foundAfter)
1613+
}
1614+
}

diffmatchpatch/stringutil.go

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
package diffmatchpatch
1010

1111
import (
12-
"strconv"
1312
"strings"
1413
"unicode/utf8"
1514
)
@@ -87,20 +86,3 @@ func runesIndex(r1, r2 []rune) int {
8786
}
8887
return -1
8988
}
90-
91-
func intArrayToString(ns []uint32) string {
92-
if len(ns) == 0 {
93-
return ""
94-
}
95-
96-
indexSeparator := IndexSeparator[0]
97-
98-
// Appr. 3 chars per num plus the comma.
99-
b := []byte{}
100-
for _, n := range ns {
101-
b = strconv.AppendInt(b, int64(n), 10)
102-
b = append(b, indexSeparator)
103-
}
104-
b = b[:len(b)-1]
105-
return string(b)
106-
}

0 commit comments

Comments
 (0)