Skip to content

Commit c23354f

Browse files
committed
Merge pull request #7 from op/unicode-boundaries
Various unicode fixes
2 parents 77f7e99 + 1d4c130 commit c23354f

File tree

2 files changed

+77
-18
lines changed

2 files changed

+77
-18
lines changed

diffmatchpatch/dmp.go

+35-18
Original file line number | Diff line number | Diff line change
@@ -566,25 +566,39 @@ func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []
566566
// DiffCommonPrefix determines the common prefix length of two strings.
567567
func (dmp *DiffMatchPatch) DiffCommonPrefix(text1, text2 string) int {
568568
n := min(len(text1), len(text2))
569-
for i := 0; i < n; i++ {
570-
if text1[i] != text2[i] {
569+
i := 0
570+
for i < n {
571+
_, sz := utf8.DecodeRuneInString(text1[i:])
572+
if sz > n-i {
571573
return i
572574
}
575+
for j := 0; j < sz; j++ {
576+
if text1[i+j] != text2[i+j] {
577+
return i
578+
}
579+
}
580+
i += sz
573581
}
574-
return n
582+
return i
575583
}
576584

577585
// DiffCommonSuffix determines the common suffix length of two strings.
578586
func (dmp *DiffMatchPatch) DiffCommonSuffix(text1, text2 string) int {
579-
text1_length := len(text1)
580-
text2_length := len(text2)
581-
n := min(text1_length, text2_length)
582-
for i := 1; i <= n; i++ {
583-
if text1[text1_length-i] != text2[text2_length-i] {
584-
return i - 1
587+
n := min(len(text1), len(text2))
588+
i := 0
589+
for i < n {
590+
_, sz := utf8.DecodeLastRuneInString(text1[:len(text1)-i])
591+
if sz > n-i {
592+
return i
593+
}
594+
for j := 0; j < sz; j++ {
595+
if text1[len(text1)-1-i-j] != text2[len(text2)-1-i-j] {
596+
return i
597+
}
585598
}
599+
i += sz
586600
}
587-
return n
601+
return i
588602
// Binary search.
589603
// Performance analysis: http://neil.fraser.name/news/2007/10/09/
590604
/*
@@ -901,16 +915,15 @@ func (dmp *DiffMatchPatch) DiffCleanupSemanticLossless(diffs []Diff) []Diff {
901915
return 6
902916
}
903917

904-
_one := []rune(one)
905-
_two := []rune(two)
906-
907918
// Each port of this function behaves slightly differently due to
908919
// subtle differences in each language's definition of things like
909920
// 'whitespace'. Since this function's purpose is largely cosmetic,
910921
// the choice has been made to use each language's native features
911922
// rather than force total conformity.
912-
char1 := string(_one[len(one)-1])
913-
char2 := string(_two[0])
923+
rune1, _ := utf8.DecodeLastRuneInString(one)
924+
rune2, _ := utf8.DecodeRuneInString(two)
925+
char1 := string(rune1)
926+
char2 := string(rune2)
914927

915928
nonAlphaNumeric1 := nonAlphaNumericRegex_.MatchString(char1)
916929
nonAlphaNumeric2 := nonAlphaNumericRegex_.MatchString(char2)
@@ -968,10 +981,14 @@ func (dmp *DiffMatchPatch) DiffCleanupSemanticLossless(diffs []Diff) []Diff {
968981
bestScore := diffCleanupSemanticScore_(equality1, edit) +
969982
diffCleanupSemanticScore_(edit, equality2)
970983

971-
for len(edit) != 0 && len(equality2) != 0 && edit[0] == equality2[0] {
984+
for len(edit) != 0 && len(equality2) != 0 {
985+
_, sz := utf8.DecodeRuneInString(edit)
986+
if edit[:sz] != equality2[:sz] {
987+
break
988+
}
972989
equality1 += string(edit[0])
973-
edit = edit[1:] + string(equality2[0])
974-
equality2 = equality2[1:]
990+
edit = edit[sz:] + string(equality2[0])
991+
equality2 = equality2[sz:]
975992
score := diffCleanupSemanticScore_(equality1, edit) +
976993
diffCleanupSemanticScore_(edit, equality2)
977994
// The >= encourages trailing rather than leading whitespace on

diffmatchpatch/dmp_test.go

+42
Original file line number | Diff line number | Diff line change
@@ -437,6 +437,32 @@ func Test_diffCleanupSemanticLossless(t *testing.T) {
437437
Diff{DiffEqual, "The xxx."},
438438
Diff{DiffInsert, " The zzz."},
439439
Diff{DiffEqual, " The yyy."}}, diffs)
440+
441+
// UTF-8 strings.
442+
diffs = []Diff{
443+
Diff{DiffEqual, "The ♕. The "},
444+
Diff{DiffInsert, "♔. The "},
445+
Diff{DiffEqual, "♖."}}
446+
447+
dmp.DiffCleanupSemanticLossless(diffs)
448+
449+
assertDiffEqual(t, []Diff{
450+
Diff{DiffEqual, "The ♕."},
451+
Diff{DiffInsert, " The ♔."},
452+
Diff{DiffEqual, " The ♖."}}, diffs)
453+
454+
// Rune boundaries.
455+
diffs = []Diff{
456+
Diff{DiffEqual, "♕♕"},
457+
Diff{DiffInsert, "♔♔"},
458+
Diff{DiffEqual, "♖♖"}}
459+
460+
dmp.DiffCleanupSemanticLossless(diffs)
461+
462+
assertDiffEqual(t, []Diff{
463+
Diff{DiffEqual, "♕♕"},
464+
Diff{DiffInsert, "♔♔"},
465+
Diff{DiffEqual, "♖♖"}}, diffs)
440466
}
441467

442468
func Test_diffCleanupSemantic(t *testing.T) {
@@ -1306,3 +1332,19 @@ func Benchmark_DiffMain(bench *testing.B) {
13061332
dmp.DiffMain(a, b, true)
13071333
}
13081334
}
1335+
1336+
func Benchmark_DiffCommonPrefix(b *testing.B) {
1337+
dmp := New()
1338+
a := "ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ"
1339+
for i := 0; i < b.N; i++ {
1340+
dmp.DiffCommonPrefix(a, a)
1341+
}
1342+
}
1343+
1344+
func Benchmark_DiffCommonSuffix(b *testing.B) {
1345+
dmp := New()
1346+
a := "ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ"
1347+
for i := 0; i < b.N; i++ {
1348+
dmp.DiffCommonSuffix(a, a)
1349+
}
1350+
}

0 commit comments

Comments
 (0)