Skip to content

Commit 58fedf6

Browse files
committed
internal/diffp: add patience diff from main repo
This is the main repo's internal/diff, renamed because there is a different internal/diff already in x/tools. Change-Id: I7b6da718e46c9fa23931908b520b9f39c178206b Reviewed-on: https://go-review.googlesource.com/c/tools/+/491915 Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> gopls-CI: kokoro <noreply+kokoro@google.com> Reviewed-by: David Chase <drchase@google.com>
1 parent 7b684a9 commit 58fedf6

File tree

14 files changed

+633
-0
lines changed

14 files changed

+633
-0
lines changed

internal/diffp/diff.go

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Package diffp implements a basic diff algorithm equivalent to patience diff.
6+
// It is a copy of internal/diff from the main Go repo, renamed to diffp to avoid
7+
// conflict with the existing golang.org/x/tools/internal/diff.
8+
package diffp
9+
10+
import (
11+
"bytes"
12+
"fmt"
13+
"sort"
14+
"strings"
15+
)
16+
17+
// A pair carries one value for the x (old) side of a diff and one for
// the y (new) side, most often a pair of line indexes.
type pair struct {
	x, y int
}
20+
21+
// Diff returns an anchored diff of the two texts old and new
// in the “unified diff” format. If old and new are identical,
// Diff returns a nil slice (no output).
//
// Unix diff implementations typically look for a diff with
// the smallest number of lines inserted and removed,
// which can in the worst case take time quadratic in the
// number of lines in the texts. As a result, many implementations
// either can be made to run for a long time or cut off the search
// after a predetermined amount of work.
//
// In contrast, this implementation looks for a diff with the
// smallest number of “unique” lines inserted and removed,
// where unique means a line that appears just once in both old and new.
// We call this an “anchored diff” because the unique lines anchor
// the chosen matching regions. An anchored diff is usually clearer
// than a standard diff, because the algorithm does not try to
// reuse unrelated blank lines or closing braces.
// The algorithm also guarantees to run in O(n log n) time
// instead of the standard O(n²) time.
//
// Some systems call this approach a “patience diff,” named for
// the “patience sorting” algorithm, itself named for a solitaire card game.
// We avoid that name for two reasons. First, the name has been used
// for a few different variants of the algorithm, so it is imprecise.
// Second, the name is frequently interpreted as meaning that you have
// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
// when in fact the algorithm is faster than the standard one.
func Diff(oldName string, old []byte, newName string, new []byte) []byte {
	if bytes.Equal(old, new) {
		return nil
	}
	x := lines(old)
	y := lines(new)

	// Print diff header.
	var out bytes.Buffer
	fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
	fmt.Fprintf(&out, "--- %s\n", oldName)
	fmt.Fprintf(&out, "+++ %s\n", newName)

	// Loop over matches to consider,
	// expanding each match to include surrounding lines,
	// and then printing diff chunks.
	// To avoid setup/teardown cases outside the loop,
	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
	// in the sequence of matches.
	var (
		done  pair     // printed up to x[:done.x] and y[:done.y]
		chunk pair     // start lines of current chunk
		count pair     // number of lines from each side in current chunk
		ctext []string // lines for current chunk
	)
	for _, m := range tgs(x, y) {
		if m.x < done.x {
			// Already handled scanning forward from earlier match.
			continue
		}

		// Expand matching lines as far as possible,
		// establishing that x[start.x:end.x] == y[start.y:end.y].
		// Note that on the first (or last) iteration we may (or definitely do)
		// have an empty match: start.x==end.x and start.y==end.y.
		start := m
		for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
			start.x--
			start.y--
		}
		end := m
		for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
			end.x++
			end.y++
		}

		// Emit the mismatched lines before start into this chunk.
		// (No effect on first sentinel iteration, when start = {0,0}.)
		for _, s := range x[done.x:start.x] {
			ctext = append(ctext, "-"+s)
			count.x++
		}
		for _, s := range y[done.y:start.y] {
			ctext = append(ctext, "+"+s)
			count.y++
		}

		// If we're not at EOF and have too few common lines,
		// the chunk includes all the common lines and continues.
		const C = 3 // number of context lines
		if (end.x < len(x) || end.y < len(y)) &&
			(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
			for _, s := range x[start.x:end.x] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = end
			continue
		}

		// End chunk with common lines for context.
		if len(ctext) > 0 {
			// At most C common lines of trailing context.
			n := end.x - start.x
			if n > C {
				n = C
			}
			for _, s := range x[start.x : start.x+n] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = pair{start.x + n, start.y + n}

			// Format and emit chunk.
			// Convert line numbers to 1-indexed.
			// Special case: empty file shows up as 0,0 not 1,0.
			if count.x > 0 {
				chunk.x++
			}
			if count.y > 0 {
				chunk.y++
			}
			fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
			for _, s := range ctext {
				out.WriteString(s)
			}
			count.x = 0
			count.y = 0
			ctext = ctext[:0]
		}

		// If we reached EOF, we're done.
		if end.x >= len(x) && end.y >= len(y) {
			break
		}

		// Otherwise start a new chunk.
		// (end.x-start.x >= C here, so chunk indexes cannot go negative.)
		chunk = pair{end.x - C, end.y - C}
		for _, s := range x[chunk.x:end.x] {
			ctext = append(ctext, " "+s)
			count.x++
			count.y++
		}
		done = end
	}

	return out.Bytes()
}
168+
169+
// lines splits the file x into its lines, each retaining its trailing
// newline. When the file's final line has no newline, one is appended
// together with a warning about the missing newline.
func lines(x []byte) []string {
	parts := strings.SplitAfter(string(x), "\n")
	last := len(parts) - 1
	if parts[last] != "" {
		// The file did not end in a newline. Attach the same message
		// that BSD/GNU diff use (including the leading backslash).
		parts[last] += "\n\\ No newline at end of file\n"
		return parts
	}
	// Drop the empty element produced by the final newline.
	return parts[:last]
}
183+
184+
// tgs returns the pairs of indexes of the longest common subsequence
// of unique lines in x and y, where a unique line is one that appears
// once in x and once in y.
//
// The longest common subsequence algorithm is as described in
// Thomas G. Szymanski, “A Special Case of the Maximal Common
// Subsequence Problem,” Princeton TR #170 (January 1975),
// available at https://research.swtch.com/tgs170.pdf.
func tgs(x, y []string) []pair {
	// Count the number of times each string appears in x and y.
	// We only care about 0, 1, many, counted as 0, -1, -2
	// for the x side and 0, -4, -8 for the y side.
	// Using negative numbers now lets us distinguish positive line numbers later.
	m := make(map[string]int)
	for _, s := range x {
		if c := m[s]; c > -2 {
			m[s] = c - 1
		}
	}
	for _, s := range y {
		if c := m[s]; c > -8 {
			m[s] = c - 4
		}
	}

	// Now unique strings can be identified by m[s] = -1+-4.
	//
	// Gather the indexes of those strings in x and y, building:
	//	xi[i] = increasing indexes of unique strings in x.
	//	yi[i] = increasing indexes of unique strings in y.
	//	inv[i] = index j such that x[xi[i]] = y[yi[j]].
	var xi, yi, inv []int
	for i, s := range y {
		if m[s] == -1+-4 {
			// Repurpose the map entry to record this string's index in yi.
			m[s] = len(yi)
			yi = append(yi, i)
		}
	}
	for i, s := range x {
		if j, ok := m[s]; ok && j >= 0 {
			xi = append(xi, i)
			inv = append(inv, j)
		}
	}

	// Apply Algorithm A from Szymanski's paper.
	// In those terms, A = J = inv and B = [0, n).
	// We add sentinel pairs {0,0}, and {len(x),len(y)}
	// to the returned sequence, to help the processing loop.
	J := inv
	n := len(xi)
	T := make([]int, n) // patience-sorting piles: smallest J value with chain length k+1
	L := make([]int, n) // L[i] = length of the longest increasing chain ending at J[i]
	for i := range T {
		T[i] = n + 1
	}
	for i := 0; i < n; i++ {
		k := sort.Search(n, func(k int) bool {
			return T[k] >= J[i]
		})
		T[k] = J[i]
		L[i] = k + 1
	}
	// k = length of the longest increasing subsequence of J.
	k := 0
	for _, v := range L {
		if k < v {
			k = v
		}
	}
	// Walk backward through L to recover one longest chain,
	// filling seq from the end toward the front.
	seq := make([]pair, 2+k)
	seq[1+k] = pair{len(x), len(y)} // sentinel at end
	lastj := n
	for i := n - 1; i >= 0; i-- {
		if L[i] == k && J[i] < lastj {
			seq[k] = pair{xi[i], yi[J[i]]}
			k--
		}
	}
	seq[0] = pair{0, 0} // sentinel at start
	return seq
}

internal/diffp/diff_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package diffp
6+
7+
import (
8+
"bytes"
9+
"path/filepath"
10+
"testing"
11+
12+
"golang.org/x/tools/txtar"
13+
)
14+
15+
// clean strips the txtar markers used by the testdata files:
// a "$" just before a newline marks intentional trailing spaces,
// and a final "^D\n" marks a file without a trailing newline.
func clean(text []byte) []byte {
	unmarked := bytes.ReplaceAll(text, []byte("$\n"), []byte("\n"))
	return bytes.TrimSuffix(unmarked, []byte("^D\n"))
}
20+
21+
func Test(t *testing.T) {
22+
files, _ := filepath.Glob("testdata/*.txt")
23+
if len(files) == 0 {
24+
t.Fatalf("no testdata")
25+
}
26+
27+
for _, file := range files {
28+
t.Run(filepath.Base(file), func(t *testing.T) {
29+
a, err := txtar.ParseFile(file)
30+
if err != nil {
31+
t.Fatal(err)
32+
}
33+
if len(a.Files) != 3 || a.Files[2].Name != "diff" {
34+
t.Fatalf("%s: want three files, third named \"diff\"", file)
35+
}
36+
diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data))
37+
want := clean(a.Files[2].Data)
38+
if !bytes.Equal(diffs, want) {
39+
t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file,
40+
diffs, want, Diff("have", diffs, "want", want))
41+
}
42+
})
43+
}
44+
}

internal/diffp/testdata/allnew.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- old --
2+
-- new --
3+
a
4+
b
5+
c
6+
-- diff --
7+
diff old new
8+
--- old
9+
+++ new
10+
@@ -0,0 +1,3 @@
11+
+a
12+
+b
13+
+c

internal/diffp/testdata/allold.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- old --
2+
a
3+
b
4+
c
5+
-- new --
6+
-- diff --
7+
diff old new
8+
--- old
9+
+++ new
10+
@@ -1,3 +0,0 @@
11+
-a
12+
-b
13+
-c

internal/diffp/testdata/basic.txt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.”
2+
https://www.cs.dartmouth.edu/~doug/diff.pdf
3+
4+
-- old --
5+
a
6+
b
7+
c
8+
d
9+
e
10+
f
11+
g
12+
-- new --
13+
w
14+
a
15+
b
16+
x
17+
y
18+
z
19+
e
20+
-- diff --
21+
diff old new
22+
--- old
23+
+++ new
24+
@@ -1,7 +1,7 @@
25+
+w
26+
a
27+
b
28+
-c
29+
-d
30+
+x
31+
+y
32+
+z
33+
e
34+
-f
35+
-g

0 commit comments

Comments
 (0)