Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Parquet Support For Egress Pipeline Output #67

Merged
merged 11 commits into from
May 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion internal/encoding/block/block_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func TestBlock_Types(t *testing.T) {
assert.NoError(t, err)

schema := b[0].Schema()
assert.Equal(t, 8, len(schema))
assert.Equal(t, 9, len(schema))
assert.Contains(t, schema, "boolean1")
assert.Contains(t, schema, "double1")
assert.Contains(t, schema, "int1")
Expand Down
1 change: 1 addition & 0 deletions internal/encoding/block/from_orc.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,5 +147,6 @@ func convertToString(value interface{}) (string, bool) {
if ok {
return strconv.FormatInt(valueInt, 10), true
}

return "", false
}
40 changes: 36 additions & 4 deletions internal/encoding/block/from_parquet.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package block

import (
"fmt"

"github.com/kelindar/talaria/internal/column"
"github.com/kelindar/talaria/internal/encoding/parquet"
"github.com/kelindar/talaria/internal/encoding/typeof"
Expand Down Expand Up @@ -65,10 +67,9 @@ func FromParquetBy(payload []byte, partitionBy string, filter *typeof.Schema, ap
columnName := cols[i]
columnType := schema[columnName]

// Encode to JSON
if columnType == typeof.JSON {
if encoded, ok := convertToJSON(v); ok {
v = encoded
if handler := parquetHandlerFor(columnType.String()); handler != nil {
if v, err = handler(v); err != nil {
return true
}
}

Expand All @@ -92,3 +93,34 @@ func FromParquetBy(payload []byte, partitionBy string, filter *typeof.Schema, ap
blocks = append(blocks, last...)
return blocks, nil
}

type parquetFieldHandler func(interface{}) (interface{}, error)

func parquetHandlerFor(typ string) parquetFieldHandler {
switch typ {
case "string":
return parquetStringHandler

case "json":
return parquetJsonHandler
}

return nil
}

func parquetStringHandler(s interface{}) (interface{}, error) {
atris marked this conversation as resolved.
Show resolved Hide resolved
switch v := s.(type) {
case []byte:
return string(v), nil
}

return nil, nil
}

func parquetJsonHandler(s interface{}) (interface{}, error) {
if encoded, ok := convertToJSON(s); ok {
return encoded, nil
}

return nil, fmt.Errorf("Failed to convert to JSON")
}
3 changes: 2 additions & 1 deletion internal/encoding/block/from_parquet_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
package block

import (
"github.com/stretchr/testify/assert"
"io/ioutil"
"testing"

"github.com/stretchr/testify/assert"
)

const testFileForParquet = "../../../test/test2.parquet"
Expand Down
2 changes: 2 additions & 0 deletions internal/encoding/merge/merge.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ func New(mergeFunc string) (Func, error) {
switch strings.ToLower(mergeFunc) {
case "orc":
return ToOrc, nil
case "parquet":
return ToParquet, nil
case "": // Default to "orc" so we don't break existing configs
return ToOrc, nil
}
Expand Down
10 changes: 10 additions & 0 deletions internal/encoding/merge/merge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
// BenchmarkFlush runs a benchmark for a Merge function for flushing
// To run it, go in the directory and do 'go test -benchmem -bench=. -benchtime=1s'
// BenchmarkMerge/orc-8 1 7195029600 ns/op 2101578032 B/op 36859501 allocs/op
// BenchmarkMerge/parquet-12 1 18666411036 ns/op 5142058320 B/op 115850463 allocs/op
func BenchmarkMerge(b *testing.B) {

// Append some files
Expand All @@ -27,6 +28,15 @@ func BenchmarkMerge(b *testing.B) {
ToOrc(blocks, blocks[0].Schema())
}
})

// Run the actual benchmark
b.Run("parquet", func(b *testing.B) {
b.ResetTimer()
b.ReportAllocs()
for n := 0; n < b.N; n++ {
ToParquet(blocks, blocks[0].Schema())
}
})
}

func TestMergeNew(t *testing.T) {
Expand Down
Loading