Create a lot simpler sorted uint64 codec #2716

Merged · 4 commits · Oct 31, 2018
83 changes: 83 additions & 0 deletions binary/codec.go
@@ -0,0 +1,83 @@
/*
* Copyright 2018 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package binary

import (
"github.com/dgraph-io/dgraph/protos/pb"
)

func packBlock(uids []uint64) *pb.UidBlock {
if len(uids) == 0 {
return nil
}
block := &pb.UidBlock{Base: uids[0]}
last := uids[0]
for _, uid := range uids[1:] {
block.Deltas = append(block.Deltas, uid-last)
last = uid
}
return block
}

// Encode takes a list of uids and a block size. It packs the uids into blocks of the given size,
// with the last block possibly holding fewer uids. Within each block, the first uid is stored as
// the base; every subsequent uid is stored as a delta, uids[i] - uids[i-1]. Protobuf uses Varint
// encoding, as described here: https://developers.google.com/protocol-buffers/docs/encoding .
// Because the deltas are considerably smaller than the original uids, they pack into fewer bytes.
// Our benchmarks on artificial data show the compressed size to be about 13% of the original.
// This mechanism is a lot simpler to understand and, if needed, debug.
func Encode(uids []uint64, blockSize int) pb.UidPack {
pack := pb.UidPack{BlockSize: uint32(blockSize)}
for {
if len(uids) <= blockSize {
block := packBlock(uids)
pack.Blocks = append(pack.Blocks, block)
return pack
}
block := packBlock(uids[:blockSize])
pack.Blocks = append(pack.Blocks, block)
uids = uids[blockSize:] // Advance.
}
}

// NumUids returns the number of uids stored in a UidPack.
func NumUids(pack pb.UidPack) int {
sz := len(pack.Blocks)
if sz == 0 {
return 0
}
lastBlock := pack.Blocks[sz-1]
return (sz-1)*int(pack.BlockSize) + len(lastBlock.Deltas) + 1 // We don't store base in deltas.
}

// Decode decodes the UidPack back into the list of uids. This is a stop-gap function; eventually
// Decode would need to do more specific things than just return the full list.
func Decode(pack pb.UidPack) []uint64 {
uids := make([]uint64, NumUids(pack))
uids = uids[:0]

for _, block := range pack.Blocks {
last := block.Base
uids = append(uids, last)
for _, delta := range block.Deltas {
uid := last + delta
uids = append(uids, uid)
last = uid
}
}
return uids
}
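To show how these pieces fit together, here is a minimal round-trip sketch. It assumes the binary package added above is importable as github.com/dgraph-io/dgraph/binary (path inferred from the file location in this diff); the uid values are made up for illustration.

package main

import (
	"fmt"

	"github.com/dgraph-io/dgraph/binary"
)

func main() {
	// Six sorted uids, packed into blocks of up to 4 uids each.
	uids := []uint64{5, 6, 10, 100, 101, 1 << 40}
	pack := binary.Encode(uids, 4)

	// Block 0 stores base=5 and deltas [1, 4, 90]; block 1 stores base=101
	// and a single delta of 1<<40 - 101.
	fmt.Println(len(pack.Blocks)) // 2

	// NumUids = (blocks-1)*blockSize + len(lastBlock.Deltas) + 1 = 1*4 + 1 + 1 = 6.
	fmt.Println(binary.NumUids(pack)) // 6

	// Decode reverses the delta encoding and returns the original list.
	fmt.Println(binary.Decode(pack)) // [5 6 10 100 101 1099511627776]
}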
153 changes: 153 additions & 0 deletions binary/codec_test.go
@@ -0,0 +1,153 @@
/*
* Copyright 2018 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package binary

import (
"bytes"
"compress/gzip"
"encoding/binary"
"math/rand"
"testing"
"time"

"github.com/dgraph-io/dgraph/protos/pb"
"github.com/dgraph-io/dgraph/x"
humanize "github.com/dustin/go-humanize"
"github.com/stretchr/testify/require"
)

func getUids(size int) []uint64 {
var uids []uint64
last := uint64(rand.Intn(100))
uids = append(uids, last)
for i := 1; i < size; i++ {
last += uint64(rand.Intn(33))
uids = append(uids, last)
}
return uids
}

func TestUidPack(t *testing.T) {
rand.Seed(time.Now().UnixNano())

// Some edge case tests.
Encode([]uint64{}, 128)
require.Equal(t, 0, NumUids(pb.UidPack{}))
require.Equal(t, 0, len(Decode(pb.UidPack{})))

for i := 0; i < 13; i++ {
size := rand.Intn(10e6)
if size < 0 {
size = 1e6
}
t.Logf("Testing with size = %d", size)

expected := getUids(size)
pack := Encode(expected, 256)
for _, block := range pack.Blocks {
require.True(t, len(block.Deltas) <= 255)
}
require.Equal(t, len(expected), NumUids(pack))
actual := Decode(pack)
require.Equal(t, expected, actual)
}
}

func BenchmarkGzip(b *testing.B) {
rand.Seed(time.Now().UnixNano())

uids := getUids(1e6)
b.ResetTimer()
sz := uint64(len(uids)) * 8

b.Logf("Dataset Len=%d. Size: %s", len(uids), humanize.Bytes(sz))
var data []byte
for i := 0; i < b.N; i++ {
tmp := make([]byte, binary.MaxVarintLen64)
var buf bytes.Buffer
for _, uid := range uids {
n := binary.PutUvarint(tmp, uid)
_, err := buf.Write(tmp[:n])
x.Check(err)
}

var out bytes.Buffer
zw := gzip.NewWriter(&out)
_, err := zw.Write(buf.Bytes())
x.Check(err)

data = out.Bytes()
}
b.Logf("Output size: %s. Compression: %.2f",
humanize.Bytes(uint64(len(data))),
float64(len(data))/float64(sz))
}

func benchmarkUidPackEncode(b *testing.B, blockSize int) {
rand.Seed(time.Now().UnixNano())

uids := getUids(1e6)
sz := uint64(len(uids)) * 8
b.Logf("Dataset Len=%d. Size: %s", len(uids), humanize.Bytes(sz))
b.ResetTimer()

var data []byte
for i := 0; i < b.N; i++ {
pack := Encode(uids, blockSize)
out, err := pack.Marshal()
x.Check(err)
data = out
}
b.Logf("Output size: %s. Compression: %.2f",
humanize.Bytes(uint64(len(data))),
float64(len(data))/float64(sz))
}

func BenchmarkUidPack(b *testing.B) {
b.Run("encode/128", func(b *testing.B) {
benchmarkUidPackEncode(b, 128)
})
b.Run("encode/256", func(b *testing.B) {
benchmarkUidPackEncode(b, 256)
})
b.Run("decode/128", func(b *testing.B) {
benchmarkUidPackDecode(b, 128)
})
b.Run("decode/256", func(b *testing.B) {
benchmarkUidPackDecode(b, 256)
})
}

func benchmarkUidPackDecode(b *testing.B, blockSize int) {
rand.Seed(time.Now().UnixNano())

uids := getUids(1e6)
sz := uint64(len(uids)) * 8
b.Logf("Dataset Len=%d. Size: %s", len(uids), humanize.Bytes(sz))

pack := Encode(uids, blockSize)
data, err := pack.Marshal()
x.Check(err)
b.Logf("Output size: %s. Compression: %.2f",
humanize.Bytes(uint64(len(data))),
float64(len(data))/float64(sz))

b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = Decode(pack)
}
}
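The compression numbers in these benchmarks come down to varint width: a delta between consecutive sorted uids usually fits in one or two bytes, while a raw uid in the high range needs several more. A small, self-contained sketch of that intuition using the standard encoding/binary package (the example values are arbitrary):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	buf := make([]byte, binary.MaxVarintLen64)

	// A uid around 2^40 needs 6 varint bytes (ceil(41 bits / 7 bits per byte)).
	fmt.Println(binary.PutUvarint(buf, 1<<40)) // 6

	// A typical delta between consecutive sorted uids (< 128) needs only 1 byte.
	fmt.Println(binary.PutUvarint(buf, 33)) // 1
}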
10 changes: 10 additions & 0 deletions protos/pb.proto
@@ -267,6 +267,16 @@ message Posting {
uint64 commit_ts = 14; // Meant to use only inmemory
}

message UidBlock {
uint64 base = 1;
repeated uint64 deltas = 2;
}

message UidPack {
uint32 block_size = 1;
repeated UidBlock blocks = 2;
}

message PostingList {
repeated Posting postings = 1;
bytes checksum = 2;
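To make the new messages concrete, here is a hand-written sketch of the UidPack that Encode([]uint64{100, 105, 110, 120}, 2) would produce, using the generated Go types for this proto (struct and field names as used in codec.go above):

package main

import (
	"fmt"

	"github.com/dgraph-io/dgraph/protos/pb"
)

func main() {
	pack := pb.UidPack{
		BlockSize: 2,
		Blocks: []*pb.UidBlock{
			{Base: 100, Deltas: []uint64{5}},  // encodes uids 100, 105
			{Base: 110, Deltas: []uint64{10}}, // encodes uids 110, 120
		},
	}
	fmt.Println(pack.BlockSize, len(pack.Blocks)) // 2 2
}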