Skip to content

Commit

Permalink
add utf16x package
Browse files Browse the repository at this point in the history
  • Loading branch information
soypat committed Feb 4, 2024
1 parent fb42f8f commit a9bda00
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 107 deletions.
126 changes: 19 additions & 107 deletions internal/gpt/gpt.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ package gpt
import (
"encoding/binary"
"errors"
"unicode/utf16"
"unicode/utf8"

"github.com/soypat/fat/internal/utf16x"
)

const (
Expand Down Expand Up @@ -228,122 +228,34 @@ func (p *PartitionEntry) SetAttributes(attr PartitionAttributes) {
// encodes it as utf-8 into the provided slice. The number of bytes
// written to b is returned along with any error.
func (p *PartitionEntry) ReadName(b []byte) (int, error) {
n16 := 0
for ; n16 < pteNameLen; n16++ {
off := pteNameOff + n16*2
wc := rune(binary.LittleEndian.Uint16(p.data[off:]))
if wc == 0 {
// Find the length of the name.
nameLen := 0
for ; nameLen < pteNameLen; nameLen++ {
if p.data[pteNameOff+nameLen] == 0 {
break
}
}
return encodeUTF16to8(b, p.data[pteNameOff:pteNameOff+n16*2], binary.LittleEndian)

n, err := utf16x.ToUTF8(b, p.data[pteNameOff:pteNameOff+nameLen], binary.LittleEndian)
if err != nil {
return n, err
}
return n, nil
}

// ClearName marks the Partition Entry's name as empty by zeroing its first
// UTF-16 code unit. The remaining name bytes are left untouched.
func (p *PartitionEntry) ClearName() {
	// The name is read and written as little-endian UTF-16, so the terminator
	// is a full 16-bit zero. Clearing only the low byte could leave a non-zero
	// code unit behind (e.g. U+0141 would become U+0100 instead of NUL).
	p.data[pteNameOff] = 0
	p.data[pteNameOff+1] = 0
}

// WriteName writes a utf-8 encoded string as the Partition Entry's name.
func (p *PartitionEntry) WriteName(name string) error {
pteOff := 0
for len(name) > 0 {
r, size := utf8.DecodeRuneInString(name)
if r == utf8.RuneError {
return errors.New("invalid utf-8 string")
}
const surrogateSelf = 0x10000
switch {
case size == 1 || r < surrogateSelf:
// Does not need special surrogate encoding.
binary.LittleEndian.PutUint16(p.data[pteNameOff+pteOff:], uint16(r))
pteOff += 2

default:
// Needs surrogate encoding.
r1, r2 := utf16.EncodeRune(r)
if r1 == '\uFFFD' && r2 == '\uFFFD' {
return errors.New("gpt: utf8->utf16 conversion error unreachable")
}
binary.LittleEndian.PutUint16(p.data[pteNameOff+pteOff:], uint16(r1))
binary.LittleEndian.PutUint16(p.data[pteNameOff+pteOff+2:], uint16(r2))
pteOff += 4
}
name = name[size:]
}
return nil
}

func encodeUTF16to8(dstUTF8, srcUTF16 []byte, order16 binary.ByteOrder) (int, error) {
// UTF16 values.
const (
// 0xd800-0xdc00 encodes the high 10 bits of a pair.
// 0xdc00-0xe000 encodes the low 10 bits of a pair.
// the value is those 20 bits plus 0x10000.
surr1 = 0xd800
surr2 = 0xdc00
surr3 = 0xe000

surrSelf = 0x10000
)
n := 0
var r1, r2 rune
for {
slen := len(srcUTF16)
if slen == 0 {
break
}
r1 = rune(order16.Uint16(srcUTF16))
if slen >= 4 {
r2 = rune(order16.Uint16(srcUTF16[2:]))
}
var ar rune
switch {
case r1 < surr1, surr3 <= r1:
// normal rune
ar = r1
srcUTF16 = srcUTF16[2:]
case surr1 <= r1 && r1 < surr2 && slen >= 4 &&
surr2 <= r2 && r2 < surr3:
// valid surrogate sequence
ar = utf16.DecodeRune(r1, r2)
srcUTF16 = srcUTF16[4:]
default:
// invalid surrogate sequence
return n, errors.New("invalid utf16")
}
// Encode the rune into UTF-8.
if utf8.RuneLen(ar) > len(dstUTF8[n:]) {
return n, errors.New("insufficient utf8 buffer")
}
n += utf8.EncodeRune(dstUTF8[n:], ar)
func (p *PartitionEntry) WriteName(name []byte) error {
n, err := utf16x.FromUTF8(p.data[pteNameOff:pteNameOff+pteNameLen], name, binary.LittleEndian)
if err != nil {
return err
}
return n, nil
}

func encodeUTF8to16(dst16, src8 []byte, order16 binary.ByteOrder) (int, error) {
n := 0
for len(src8) > 0 {
r1, size := utf8.DecodeRune(src8)
src8 = src8[size:]
switch {
case utf16.IsSurrogate(r1):
// Surrogate pair case.
if len(dst16) < 4 {
return n, errors.New("insufficient utf16 buffer")
}
r1, r2 := utf16.EncodeRune(r1)
order16.PutUint16(dst16[n:], uint16(r1))
order16.PutUint16(dst16[n+2:], uint16(r2))
n += 4
default:
// General case.
if len(dst16) < 2 {
return n, errors.New("insufficient utf16 buffer")
}
// Simplest case for ASCII characters.
order16.PutUint16(dst16[n:], uint16(r1))
n += 2
}
for i := n; i < pteNameLen; i++ {
p.data[pteNameOff+i] = 0
}
return n, nil
return nil
}
120 changes: 120 additions & 0 deletions internal/utf16x/utf16x.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package utf16x

import (
"encoding/binary"
"errors"
"unicode/utf16"
"unicode/utf8"
)

// Boundaries of the UTF-16 surrogate ranges used by EncodeRune and DecodeRune.
const (
// 0xd800-0xdc00 encodes the high 10 bits of a pair.
// 0xdc00-0xe000 encodes the low 10 bits of a pair.
// the value is those 20 bits plus 0x10000.
surr1 = 0xd800 // first high surrogate (inclusive)
surr2 = 0xdc00 // first low surrogate; end of the high range (exclusive)
surr3 = 0xe000 // end of the low surrogate range (exclusive)

// surrSelf is the first code point that must be encoded as a surrogate pair;
// everything below it (outside the surrogate ranges) is a single code unit.
surrSelf = 0x10000
)

// The conditions replacementChar==unicode.ReplacementChar and
// maxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.
//
// replacementChar is what EncodeRune writes for a rune it cannot encode and
// what DecodeRune returns for an invalid surrogate sequence.

const (
replacementChar = '\uFFFD' // Unicode replacement character
maxRune = '\U0010FFFF' // Maximum valid Unicode code point.
)

// Sentinel errors returned by ToUTF8 and FromUTF8.
var (
// errMultiple2 is returned by ToUTF8 when the source has an odd byte length.
errMultiple2 = errors.New("UTF16 bytes length must be multiple of 2")
// errShortDst is returned when the destination cannot hold the next rune.
errShortDst = errors.New("short destination buffer")
// errInvalidUTF8 is returned by FromUTF8 when the source fails to decode.
errInvalidUTF8 = errors.New("invalid utf8 sequence")
// errInvalidUTF16 is returned by ToUTF8 when the source fails to decode.
errInvalidUTF16 = errors.New("invalid utf16 sequence")
)

// ToUTF8 transcodes the UTF-16 text in srcUTF16, whose 16-bit code units are
// stored with byte order order16, into UTF-8 written to dstUTF8.
//
// It returns the number of bytes written to dstUTF8. On error the count
// covers the runes that were fully written before the failure:
//   - errMultiple2 if len(srcUTF16) is odd (nothing is written),
//   - errInvalidUTF16 if the source contains an invalid surrogate sequence,
//   - errShortDst if the space left in dstUTF8 cannot hold the next rune.
func ToUTF8(dstUTF8, srcUTF16 []byte, order16 binary.ByteOrder) (int, error) {
	if len(srcUTF16)%2 != 0 {
		return 0, errMultiple2
	}
	n := 0
	for len(srcUTF16) > 1 {
		r, size := DecodeRune(srcUTF16, order16)
		// DecodeRune reports invalid input with U+FFFD, which is also a valid
		// code point on its own. Only treat it as an error when the source
		// code unit is not literally U+FFFD.
		if r == utf8.RuneError && order16.Uint16(srcUTF16) != utf8.RuneError {
			return n, errInvalidUTF16
		}
		// Compare against the space that is left, not the whole buffer;
		// otherwise EncodeRune below would panic on a short destination.
		if utf8.RuneLen(r) > len(dstUTF8)-n {
			return n, errShortDst
		}
		n += utf8.EncodeRune(dstUTF8[n:], r)
		srcUTF16 = srcUTF16[size:]
	}
	return n, nil
}

// FromUTF8 transcodes the UTF-8 text in src8 into UTF-16 written to dst16,
// storing each 16-bit code unit with byte order order16.
//
// It returns the number of bytes written to dst16. On error the count covers
// the runes that were fully written before the failure:
//   - errShortDst if the space left in dst16 cannot hold the next rune
//     (2 bytes, or 4 for a rune that needs a surrogate pair),
//   - errInvalidUTF8 if src8 contains an invalid UTF-8 sequence.
func FromUTF8(dst16, src8 []byte, order16 binary.ByteOrder) (int, error) {
	n := 0
	for len(src8) > 0 {
		// Measure the space that is left, not the whole buffer; otherwise
		// EncodeRune below would panic once dst16 is full.
		free := len(dst16) - n
		if free < 2 {
			return n, errShortDst
		}
		r, size := utf8.DecodeRune(src8)
		// utf8.DecodeRune signals invalid input with (RuneError, 1). A
		// correctly encoded U+FFFD decodes with size 3 and is valid text.
		if r == utf8.RuneError && size == 1 {
			return n, errInvalidUTF8
		}
		// Runes at or above surrSelf are written as a 4-byte surrogate pair.
		if r >= surrSelf && free < 4 {
			return n, errShortDst
		}
		n += EncodeRune(dst16[n:], r, order16)
		src8 = src8[size:]
	}
	return n, nil
}

func EncodeRune(dst16 []byte, v rune, order16 binary.ByteOrder) (sizeBytes int) {
switch {
case 0 <= v && v < surr1, surr3 <= v && v < surrSelf:
// normal rune
_ = dst16[1] // Eliminate bounds check.
order16.PutUint16(dst16, uint16(v))
return 2

case surrSelf <= v && v <= maxRune:
// needs surrogate sequence
_ = dst16[3] // Eliminate bounds check.
r1, r2 := utf16.EncodeRune(v)
order16.PutUint16(dst16, uint16(r1))
order16.PutUint16(dst16[2:], uint16(r2))
return 4

default:
_ = dst16[1] // Eliminate bounds check.
order16.PutUint16(dst16, uint16(replacementChar))
return 2
}
}

func DecodeRune(srcUTF16 []byte, order16 binary.ByteOrder) (r rune, size int) {
_ = srcUTF16[1] // Eliminate bounds check.
slen := len(srcUTF16)
if slen == 0 {
return '\uFFFD', 1
}
r = rune(order16.Uint16(srcUTF16))
switch {
case r < surr1, surr3 <= r:
// normal rune
return r, 2
case surr1 <= r && r < surr2:
_ = srcUTF16[3] // Eliminate bounds check.
r2 := rune(order16.Uint16(srcUTF16[2:]))
if !(surr2 <= r2 && r2 < surr3) {
// Invalid surrogate sequence.
return replacementChar, 2
}
// valid surrogate sequence
return utf16.DecodeRune(r, r2), 4
default:
// invalid surrogate sequence
return replacementChar, 2
}
}

0 comments on commit a9bda00

Please sign in to comment.