Skip to content

Commit

Permalink
Replace varint with groupvarint.
Browse files Browse the repository at this point in the history
  • Loading branch information
animesh2049 committed Jun 6, 2019
1 parent 78a0a2c commit 974a1b2
Show file tree
Hide file tree
Showing 4 changed files with 329 additions and 257 deletions.
86 changes: 58 additions & 28 deletions codec/codec.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@ package codec

import (
"bytes"
"encoding/binary"
"math"
"sort"

"github.com/dgraph-io/dgraph/protos/pb"
"github.com/dgraph-io/dgraph/x"
"github.com/dgryski/go-groupvarint"
)

type seekPos int
Expand All @@ -45,18 +44,36 @@ func (e *Encoder) packBlock() {
if len(e.uids) == 0 {
return
}
block := &pb.UidBlock{Base: e.uids[0]}
block := &pb.UidBlock{Base: e.uids[0], UidNum: uint32(len(e.uids))}
last := e.uids[0]
e.uids = e.uids[1:]

count := 1
var out bytes.Buffer
var buf [binary.MaxVarintLen64]byte
for _, uid := range e.uids[1:] {
n := binary.PutUvarint(buf[:], uid-last)
x.Check2(out.Write(buf[:n]))
last = uid
count++
buf := make([]byte, 17)
tmpUids := make([]uint32, 4)
for {
i := 0
for ; i < 4; i++ {
if i >= len(e.uids) {
// Padding with '0' because Encode4 encodes only in batch of 4.
tmpUids[i] = 0
} else {
tmpUids[i] = uint32(e.uids[i] - last)
last = e.uids[i]
}
}

data := groupvarint.Encode4(buf, tmpUids)
out.Write(data)

if len(e.uids) <= 4 {
break
}
e.uids = e.uids[4:]
}

// TODO(Animesh): put comment

block.Deltas = out.Bytes()
e.pack.Blocks = append(e.pack.Blocks, block)
}
Expand All @@ -65,6 +82,13 @@ func (e *Encoder) Add(uid uint64) {
if e.pack == nil {
e.pack = &pb.UidPack{BlockSize: uint32(e.BlockSize)}
}

size := len(e.pack.Blocks)
if size > 0 && !match32MSB(e.pack.Blocks[size-1].Base, uid) {
e.packBlock()
e.uids = e.uids[:0]
}

e.uids = append(e.uids, uid)
if len(e.uids) >= e.BlockSize {
e.packBlock()
Expand Down Expand Up @@ -98,17 +122,24 @@ func (d *Decoder) unpackBlock() []uint64 {
last := block.Base
d.uids = append(d.uids, last)

var tmpUids [4]uint32
deltas := block.Deltas
// TODO(Animesh): Explain this padding
deltas = append(deltas, 0, 0, 0)

// Read back the encoded varints.
var offset int
for offset < len(block.Deltas) {
delta, n := binary.Uvarint(block.Deltas[offset:])
x.AssertTrue(n > 0)
offset += n
uid := last + delta
d.uids = append(d.uids, uid)
last = uid
// Because 4 integers are encoded in at least 5 bytes.
// TODO(Animesh): explain more about condition
for len(deltas) > 5 {
groupvarint.Decode4(tmpUids[:], deltas)
deltas = deltas[groupvarint.BytesUsed[deltas[0]]:]
for i := 0; i < 4; i++ {
d.uids = append(d.uids, uint64(tmpUids[i])+last)
last += uint64(tmpUids[i])
}
}
return d.uids

return d.uids[:block.UidNum]
}

func (d *Decoder) ApproxLen() int {
Expand Down Expand Up @@ -257,16 +288,10 @@ func ExactLen(pack *pb.UidPack) int {
if sz == 0 {
return 0
}
block := pack.Blocks[sz-1]
num := 1 // Count the base.
for _, b := range block.Deltas {
// If the MSB in varint encoding is zero, then it is the final byte, not a continuation of
// the integer. Thus, we can count it as one delta.
if b&0x80 == 0 {
num++
}
num := 0
for _, b := range pack.Blocks {
num += int(b.UidNum) // UidNum includes the base UID.
}
num += (sz - 1) * int(pack.BlockSize)
return num
}

Expand All @@ -281,3 +306,8 @@ func Decode(pack *pb.UidPack, seek uint64) []uint64 {
}
return uids
}

// match32MSB reports whether num1 and num2 share the same most significant
// 32 bits. The lower 32 bits of each argument are ignored.
func match32MSB(num1, num2 uint64) bool {
	// Shifting out the low half leaves only the upper 32 bits to compare.
	return num1>>32 == num2>>32
}
5 changes: 1 addition & 4 deletions codec/codec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,6 @@ func TestUidPack(t *testing.T) {

expected := getUids(size)
pack := Encode(expected, 256)
for _, block := range pack.Blocks {
require.True(t, len(block.Deltas) <= 255)
}
require.Equal(t, len(expected), ExactLen(pack))
actual := Decode(pack, 0)
require.Equal(t, expected, actual)
Expand Down Expand Up @@ -111,7 +108,7 @@ func TestSeek(t *testing.T) {
}

func TestDecoder(t *testing.T) {
N := 10001
N := 13
var expected []uint64
enc := Encoder{BlockSize: 10}
for i := 3; i < N; i += 3 {
Expand Down
5 changes: 5 additions & 0 deletions protos/pb.proto
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,11 @@ message UidBlock {
// because when the PB is brought to memory, Go would always use 8-bytes per integer. Instead,
// storing it as a byte slice is a lot cheaper in memory.
bytes deltas = 2;
// uidNum is the number of UIDs in the block. We are including this because we want to
// switch encoding to groupvarint encoding. The currently available open source version implements
// encoding and decoding for uint32. We want to wrap it around our logic to use it here.
// Default Blocksize is 256 so uint32 would be sufficient.
uint32 uidNum = 3;
}

message UidPack {
Expand Down
Loading

0 comments on commit 974a1b2

Please sign in to comment.