internal/diff/lcs: introduce line diffs
WIP, not ready for review.
diff --git a/internal/diff/lcs/old.go b/internal/diff/lcs/old.go
index 7b7c5cc..e941b86 100644
--- a/internal/diff/lcs/old.go
+++ b/internal/diff/lcs/old.go
@@ -27,9 +27,22 @@
// DiffRunes returns the differences between two rune sequences.
func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) }
+// DiffLines returns the differencesw between two string sequences.
+func DiffLines(a, b []string) []Diff { return diff(linesSeqs{a, b}) }
+
+// A limit on how deeply the LCS algorithm should search. The value is just a guess.
+var maxDiffs = 100
+
+// set maxDiffs and return the old value.
+// (This may be a bad idea, as there could be concurrent calls to diff
+// using different values. The window is old=MaxDiffs(new); Diff(...); MaxDiffs(old))
+func MaxDiffs(n int) int {
+ old := maxDiffs
+ maxDiffs = n
+ return old
+}
+
func diff(seqs sequences) []Diff {
- // A limit on how deeply the LCS algorithm should search. The value is just a guess.
- const maxDiffs = 100
diff, _ := compute(seqs, twosided, maxDiffs/2)
return diff
}
diff --git a/internal/diff/lcs/old_test.go b/internal/diff/lcs/old_test.go
index 7381954..4c4ef31 100644
--- a/internal/diff/lcs/old_test.go
+++ b/internal/diff/lcs/old_test.go
@@ -256,4 +256,11 @@
compute(runesSeqs{srcRunes, dstRunes}, twosided, len(srcRunes)+len(dstRunes))
}
})
+ srcLines := strings.Split(src, "\n")
+ dstLines := strings.Split(dst, "\n")
+ b.Run("lines", func(b *testing.B) {
+ for b.Loop() {
+ compute(linesSeqs{srcLines, dstLines}, twosided, len(srcLines)+len(dstLines))
+ }
+ })
}
diff --git a/internal/diff/lcs/sequence.go b/internal/diff/lcs/sequence.go
index 811bb216..bb5b7c5 100644
--- a/internal/diff/lcs/sequence.go
+++ b/internal/diff/lcs/sequence.go
@@ -29,20 +29,30 @@
func (s bytesSeqs) lengths() (int, int) { return len(s.a), len(s.b) }
func (s bytesSeqs) commonPrefixLen(ai, aj, bi, bj int) int {
- return commonPrefixLenBytes(s.a[ai:aj:aj], s.b[bi:bj:bj])
+ return commonPrefixLen(s.a[ai:aj:aj], s.b[bi:bj:bj])
}
func (s bytesSeqs) commonSuffixLen(ai, aj, bi, bj int) int {
- return commonSuffixLenBytes(s.a[ai:aj:aj], s.b[bi:bj:bj])
+ return commonSuffixLen(s.a[ai:aj:aj], s.b[bi:bj:bj])
}
type runesSeqs struct{ a, b []rune }
func (s runesSeqs) lengths() (int, int) { return len(s.a), len(s.b) }
func (s runesSeqs) commonPrefixLen(ai, aj, bi, bj int) int {
- return commonPrefixLenRunes(s.a[ai:aj:aj], s.b[bi:bj:bj])
+ return commonPrefixLen(s.a[ai:aj:aj], s.b[bi:bj:bj])
}
func (s runesSeqs) commonSuffixLen(ai, aj, bi, bj int) int {
- return commonSuffixLenRunes(s.a[ai:aj:aj], s.b[bi:bj:bj])
+ return commonSuffixLen(s.a[ai:aj:aj], s.b[bi:bj:bj])
+}
+
+type linesSeqs struct{ a, b []string }
+
+func (s linesSeqs) lengths() (int, int) { return len(s.a), len(s.b) }
+func (s linesSeqs) commonPrefixLen(ai, aj, bi, bj int) int {
+ return commonPrefixLen(s.a[ai:aj], s.b[bi:bj])
+}
+func (s linesSeqs) commonSuffixLen(ai, aj, bi, bj int) int {
+ return commonSuffixLen(s.a[ai:aj], s.b[bi:bj])
}
// TODO(adonovan): optimize these functions using ideas from:
@@ -52,8 +62,8 @@
// TODO(adonovan): factor using generics when available,
// but measure performance impact.
-// commonPrefixLen* returns the length of the common prefix of a[ai:aj] and b[bi:bj].
-func commonPrefixLenBytes(a, b []byte) int {
+// commonPrefixLen returns the length of the common prefix of a[ai:aj] and b[bi:bj].
+func commonPrefixLen[T comparable](a, b []T) int {
n := min(len(a), len(b))
i := 0
for i < n && a[i] == b[i] {
@@ -61,14 +71,7 @@
}
return i
}
-func commonPrefixLenRunes(a, b []rune) int {
- n := min(len(a), len(b))
- i := 0
- for i < n && a[i] == b[i] {
- i++
- }
- return i
-}
+
func commonPrefixLenString(a, b string) int {
n := min(len(a), len(b))
i := 0
@@ -78,8 +81,8 @@
return i
}
-// commonSuffixLen* returns the length of the common suffix of a[ai:aj] and b[bi:bj].
-func commonSuffixLenBytes(a, b []byte) int {
+// commonSuffixLen returns the length of the common suffix of a[ai:aj] and b[bi:bj].
+func commonSuffixLen[T comparable](a, b []T) int {
n := min(len(a), len(b))
i := 0
for i < n && a[len(a)-1-i] == b[len(b)-1-i] {
@@ -87,14 +90,7 @@
}
return i
}
-func commonSuffixLenRunes(a, b []rune) int {
- n := min(len(a), len(b))
- i := 0
- for i < n && a[len(a)-1-i] == b[len(b)-1-i] {
- i++
- }
- return i
-}
+
func commonSuffixLenString(a, b string) int {
n := min(len(a), len(b))
i := 0
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
> return commonPrefixLenString(s.a[ai:aj], s.b[bi:bj])
Suggestion: commonPrefixLen[string]
> // DiffLines returns the differencesw between two string sequences.
Comment: delete
> // TODO(adonovan): factor using generics when available,
> // but measure performance impact.
Comment: delete
> if len(newedits) != len(edits) {
Comment: I assume this is a cross check. Do we expect the algorithms to always produce the same number of edits, but not identical edits? Or is this a quick sanity check to avoid allocation in the common case when they are equal?
log.Printf("%d != %d %v, %v", len(newedits), len(edits), newlocs, locs)
> return edits // PJW: work in progress, do not submit
Comment: This looks unfinished.
> // Lines computes lines differenes between two strings.
Suggestion: "differences".
Better: Lines computes differences between two strings. All edits are at line boundaries.
> offsets := []int{0}
Comment: Use bytes.Count to scan the file and preallocate the correct array size?
> for i := 0; i < len(text); i++ {
> 	if text[i] == '\n' { // keep the newlines
> 		lines = append(lines, text[offsets[len(offsets)-1]:i+1])
> 		offsets = append(offsets, i+1)
> 	}
> }
Comment: Use bytes.Index as it uses vector instructions to compare 32 bytes at a time.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
> return commonPrefixLenString(s.a[ai:aj], s.b[bi:bj])
Peter Weinberger suggested: commonPrefixLen[string]
That doesn't work. commonPrefixLen takes []T, which does not match string.
> // DiffLines returns the differencesw between two string sequences.
Peter Weinberger: delete
Done
> // TODO(adonovan): factor using generics when available,
> // but measure performance impact.
Peter Weinberger: delete
Done
I assume this is a cross check. Do we expect the algorithms to always produce the same number of edits, but not identical edits? Or is this a quick sanity check to avoid allocation in the common case when they are equal?
it's now gone.
This whole routine would go away once the code review for my other CL (https://go-review.git.corp.google.com/c/tools/+/724660) is finished.
log.Printf("%d != %d %v, %v", len(newedits), len(edits), newlocs, locs)
> return edits // PJW: work in progress, do not submit
Peter Weinberger: This looks unfinished.
Done
Suggestion: "differences".
Better: Lines computes differences between two strings. All edits are at line boundaries.
Done
Use bytes.Count to scan the file and preallocate the correct array size?
I don't think it is worth it. The resulting logic would have to check for 0 and the savings is at best a tiny fraction of all the other processing.
> for i := 0; i < len(text); i++ {
> 	if text[i] == '\n' { // keep the newlines
> 		lines = append(lines, text[offsets[len(offsets)-1]:i+1])
> 		offsets = append(offsets, i+1)
> 	}
> }
Peter Weinberger: Use bytes.Index as it uses vector instructions to compare 32 bytes at a time.
I couldn't see a simple way of writing that code so that it works if the last line does not end with \n, as in some of the tests.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |