Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix line diff by using rune index without a separator #136

Merged
merged 2 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions diffmatchpatch/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ const (
DiffInsert Operation = 1
// DiffEqual item represents an equal diff.
DiffEqual Operation = 0
//IndexSeparator is used to seperate the array indexes in an index string
IndexSeparator = ","
)

// Diff represents one diff operation
Expand Down Expand Up @@ -406,14 +404,11 @@ func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
hydrated := make([]Diff, 0, len(diffs))
for _, aDiff := range diffs {
chars := strings.Split(aDiff.Text, IndexSeparator)
text := make([]string, len(chars))
runes := []rune(aDiff.Text)
text := make([]string, len(runes))

for i, r := range chars {
i1, err := strconv.Atoi(r)
if err == nil {
text[i] = lineArray[i1]
}
for i, r := range runes {
text[i] = lineArray[runeToInt(r)]
}

aDiff.Text = strings.Join(text, "")
Expand Down
58 changes: 44 additions & 14 deletions diffmatchpatch/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,12 +314,12 @@ func TestDiffLinesToChars(t *testing.T) {
dmp := New()

for i, tc := range []TestCase{
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "1", "2", []string{"", "a", "b"}},
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\x01\x02\x03\x03", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "\x01", "\x02", []string{"", "a", "b"}},
// Omit final newline.
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
{"alpha\nbeta\nalpha", "", "\x01\x02\x03", "", []string{"", "alpha\n", "beta\n", "alpha"}},
// Same lines in Text1 and Text2
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "\x01\x02\x03", "\x01\x04\x03\x05", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
} {
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
Expand All @@ -332,14 +332,13 @@ func TestDiffLinesToChars(t *testing.T) {
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
var charList []string
var charList []rune
for x := 1; x < n+1; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
charList = append(charList, rune(x))
}
lines := strings.Join(lineList, "")
chars := strings.Join(charList[:], ",")
assert.Equal(t, n, len(strings.Split(chars, ",")))
chars := string(charList)

actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
assert.Equal(t, chars, actualChars1)
Expand All @@ -360,8 +359,8 @@ func TestDiffCharsToLines(t *testing.T) {
for i, tc := range []TestCase{
{
Diffs: []Diff{
{DiffEqual, "1,2,1"},
{DiffInsert, "2,1,2"},
{DiffEqual, "\x01\x02\x01"},
{DiffInsert, "\x02\x01\x02"},
},
Lines: []string{"", "alpha\n", "beta\n"},

Expand All @@ -380,13 +379,12 @@ func TestDiffCharsToLines(t *testing.T) {
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
charList := []string{}
charList := []rune{}
for x := 1; x <= n; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
charList = append(charList, rune(x))
}
assert.Equal(t, n, len(charList))
chars := strings.Join(charList[:], ",")
chars := string(charList)

actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
Expand Down Expand Up @@ -1468,6 +1466,38 @@ func TestMassiveRuneDiffConversion(t *testing.T) {
assert.NotEmpty(t, diffs)
}

func TestDiffPartialLineIndex(t *testing.T) {
dmp := New()
t1, t2, tt := dmp.DiffLinesToChars(
`line 1
line 2
line 3
line 4
line 5
line 6
line 7
line 8
line 9
line 10 text1`,
`line 1
line 2
line 3
line 4
line 5
line 6
line 7
line 8
line 9
line 10 text2`)
diffs := dmp.DiffMain(t1, t2, false)
diffs = dmp.DiffCharsToLines(diffs, tt)
assert.Equal(t, []Diff{
Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"},
Diff{DiffDelete, "line 10 text1"},
Diff{DiffInsert, "line 10 text2"},
}, diffs)
}

func BenchmarkDiffMain(bench *testing.B) {
s1 := "`Twas brillig, and the slithy toves\nDid gyre and gimble in the wabe:\nAll mimsy were the borogoves,\nAnd the mome raths outgrabe.\n"
s2 := "I am the very model of a modern major general,\nI've information vegetable, animal, and mineral,\nI know the kings of England, and I quote the fights historical,\nFrom Marathon to Waterloo, in order categorical.\n"
Expand Down
100 changes: 92 additions & 8 deletions diffmatchpatch/stringutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,16 @@
package diffmatchpatch

import (
"strconv"
"fmt"
"strings"
"unicode/utf8"
)

const UNICODE_INVALID_RANGE_START = 0xD800
const UNICODE_INVALID_RANGE_END = 0xDFFF
const UNICODE_INVALID_RANGE_DELTA = UNICODE_INVALID_RANGE_END - UNICODE_INVALID_RANGE_START + 1
const UNICODE_RANGE_MAX = 0x10FFFF

// unescaper unescapes selected chars for compatibility with JavaScript's encodeURI.
// In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc.
var unescaper = strings.NewReplacer(
Expand Down Expand Up @@ -93,14 +98,93 @@ func intArrayToString(ns []uint32) string {
return ""
}

indexSeparator := IndexSeparator[0]

// Appr. 3 chars per num plus the comma.
b := []byte{}
b := []rune{}
for _, n := range ns {
b = strconv.AppendInt(b, int64(n), 10)
b = append(b, indexSeparator)
b = append(b, intToRune(n))
}
b = b[:len(b)-1]
return string(b)
}

// These constants define the number of bits representable
// in 1,2,3,4 byte utf8 sequences, respectively.
const ONE_BYTE_BITS = 7
const TWO_BYTE_BITS = 11
const THREE_BYTE_BITS = 16
const FOUR_BYTE_BITS = 21

// Helper for getting a sequence of bits from an integer.
func getBits(i uint32, cnt byte, from byte) byte {
return byte((i >> from) & ((1 << cnt) - 1))
}

// Converts an integer in the range 0~1112060 into a rune.
// Based on the ranges table in https://en.wikipedia.org/wiki/UTF-8
func intToRune(i uint32) rune {
if i < (1 << ONE_BYTE_BITS) {
return rune(i)
}

if i < (1 << TWO_BYTE_BITS) {
r, size := utf8.DecodeRune([]byte{0b11000000 | getBits(i, 5, 6), 0b10000000 | getBits(i, 6, 0)})
if size != 2 || r == utf8.RuneError {
panic(fmt.Sprintf("Error encoding an int %d with size 2, got rune %v and size %d", size, r, i))
}
return r
}

// Last -3 here needed because for some reason 3rd to last codepoint 65533 in this range
// was returning utf8.RuneError during encoding.
if i < ((1 << THREE_BYTE_BITS) - UNICODE_INVALID_RANGE_DELTA - 3) {
if i >= UNICODE_INVALID_RANGE_START {
i += UNICODE_INVALID_RANGE_DELTA
}

r, size := utf8.DecodeRune([]byte{0b11100000 | getBits(i, 4, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
if size != 3 || r == utf8.RuneError {
panic(fmt.Sprintf("Error encoding an int %d with size 3, got rune %v and size %d", size, r, i))
}
return r
}

if i < (1<<FOUR_BYTE_BITS - UNICODE_INVALID_RANGE_DELTA - 3) {
i += UNICODE_INVALID_RANGE_DELTA + 3
r, size := utf8.DecodeRune([]byte{0b11110000 | getBits(i, 3, 18), 0b10000000 | getBits(i, 6, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
if size != 4 || r == utf8.RuneError {
panic(fmt.Sprintf("Error encoding an int %d with size 4, got rune %v and size %d", size, r, i))
}
return r
}
panic(fmt.Sprintf("The integer %d is too large for runeToInt()", i))
}

// Converts a rune generated by intToRune back to an integer
func runeToInt(r rune) uint32 {
i := uint32(r)
if i < (1 << ONE_BYTE_BITS) {
return i
}

bytes := []byte{0, 0, 0, 0}

size := utf8.EncodeRune(bytes, r)

if size == 2 {
return uint32(bytes[0]&0b11111)<<6 | uint32(bytes[1]&0b111111)
}

if size == 3 {
result := uint32(bytes[0]&0b1111)<<12 | uint32(bytes[1]&0b111111)<<6 | uint32(bytes[2]&0b111111)
if result >= UNICODE_INVALID_RANGE_END {
return result - UNICODE_INVALID_RANGE_DELTA
}

return result
}

if size == 4 {
result := uint32(bytes[0]&0b111)<<18 | uint32(bytes[1]&0b111111)<<12 | uint32(bytes[2]&0b111111)<<6 | uint32(bytes[3]&0b111111)
return result - UNICODE_INVALID_RANGE_DELTA - 3
}

panic(fmt.Sprintf("Unexpected state decoding rune=%v size=%d", r, size))
}
18 changes: 18 additions & 0 deletions diffmatchpatch/stringutil_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,21 @@ func TestLastIndexOf(t *testing.T) {
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc))
}
}

// Exhaustive check for all ints from 0 to 1112060 for correctness of implementation
// of `intToRune() -> runeToInt()`.
// This test is slow and runs longer than 5 seconds but it does provide a safety
// guarantee that these 2 functions are correct for the ranges we support.
func TestRuneToInt(t *testing.T) {

for i := uint32(0); i <= UNICODE_RANGE_MAX-UNICODE_INVALID_RANGE_DELTA-3; i += 1 {
r := intToRune(i)
ic := runeToInt(r)

assert.Equal(t, i, ic, fmt.Sprintf("intToRune(%d)=%d and runeToInt(%d)=%d", i, r, r, ic))
}

assert.Panics(t, func() {
intToRune(UNICODE_RANGE_MAX - UNICODE_INVALID_RANGE_DELTA - 2)
})
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ require (
gopkg.in/yaml.v2 v2.4.0 // indirect
)

go 1.12
go 1.13
Copy link
Contributor Author

@kdarkhan kdarkhan Jan 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Upgrading to go 1.13 in order to use binary and hex integer literals.