Skip to content

Commit

Permalink
encoding: fix auto detection of utf8 without BOM + add a test
Browse files Browse the repository at this point in the history
fixes #6
  • Loading branch information
martinlindhe committed Feb 26, 2021
1 parent c133f18 commit 8010a8a
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 0 deletions.
2 changes: 2 additions & 0 deletions encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ func ConvertToUTF8(b []byte) string {
s, _ = utf16ToUTF8(b[2:], false)
} else if hasUTF8Marker(b) {
s = string(b[3:])
} else if utf8.ValidString(string(b)) {
s = string(b)
} else if looksLikeLatin1(b) {
s = latin1toUTF8(b)
} else {
Expand Down
6 changes: 6 additions & 0 deletions encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ func TestLooksLikeLatin1(t *testing.T) {
assert.Equal(t, false, looksLikeLatin1([]byte("hallå")))
}

// TestTranscodeToUTF8 feeds ConvertToUTF8 a Latin-1 encoded input and two
// UTF-8 inputs that carry no byte order mark, and checks that each one comes
// back as the expected UTF-8 string.
func TestTranscodeToUTF8(t *testing.T) {
	cases := []struct {
		raw  string // input bytes, spelled as a string literal
		want string // expected result of ConvertToUTF8
	}{
		// from: latin1
		{raw: "hall\xe5", want: "hallå"},
		// from: utf8 (Swedish)
		{raw: "hallå", want: "hallå"},
		// from: utf8 (Chinese)
		{raw: "烟火里的尘埃", want: "烟火里的尘埃"},
	}

	for _, tc := range cases {
		assert.Equal(t, tc.want, ConvertToUTF8([]byte(tc.raw)))
	}
}

func TestReadFileAsUTF8(t *testing.T) {
f, err := os.Open("README.md")
assert.Equal(t, nil, err)
Expand Down

0 comments on commit 8010a8a

Please sign in to comment.