Implement Parquet Support For Egress Pipeline Output #67

Merged
merged 11 commits on May 13, 2021
Changes from 3 commits
2 changes: 1 addition & 1 deletion internal/encoding/block/block_test.go
@@ -22,7 +22,7 @@ func TestBlock_Types(t *testing.T) {
assert.NoError(t, err)

schema := b[0].Schema()
assert.Equal(t, 8, len(schema))
assert.Equal(t, 9, len(schema))
assert.Contains(t, schema, "boolean1")
assert.Contains(t, schema, "double1")
assert.Contains(t, schema, "int1")
1 change: 1 addition & 0 deletions internal/encoding/block/from_orc.go
@@ -147,5 +147,6 @@ func convertToString(value interface{}) (string, bool) {
if ok {
return strconv.FormatInt(valueInt, 10), true
}

return "", false
}
53 changes: 53 additions & 0 deletions internal/encoding/block/from_parquet.go
@@ -1,6 +1,7 @@
package block

import (
"fmt"
"github.com/kelindar/talaria/internal/column"
"github.com/kelindar/talaria/internal/encoding/parquet"
"github.com/kelindar/talaria/internal/encoding/typeof"
@@ -65,13 +66,19 @@ func FromParquetBy(payload []byte, partitionBy string, filter *typeof.Schema, ap
columnName := cols[i]
columnType := schema[columnName]

fieldHandler, _ := getReverseHandler(columnType.String())

// Encode to JSON
if columnType == typeof.JSON {
if encoded, ok := convertToJSON(v); ok {
v = encoded
}
}

if fieldHandler != nil {
v, _ = fieldHandler(v)
}

row.Set(columnName, v)
}

@@ -92,3 +99,49 @@ func FromParquetBy(payload []byte, partitionBy string, filter *typeof.Schema, ap
blocks = append(blocks, last...)
return blocks, nil
}

// fieldHandler casts a raw value decoded from parquet into the
// representation expected by the block encoder.
type fieldHandler func(interface{}) (interface{}, error)

// getReverseHandler returns the handler (if any) that casts a value of the
// given type name into the representation expected by the block encoder.
func getReverseHandler(typ string) (fieldHandler, error) {
var handler fieldHandler

switch typ {
case "string":
handler = stringHandler
}

return handler, nil
}

// ApplyCastHandlers returns a copy of the row with each value passed through
// its corresponding cast handler, when one is present.
func ApplyCastHandlers(r Row, fieldHandlers []fieldHandler) Row {
// Create a new output row and copy the column values from the input
schema := make(typeof.Schema, len(r.Schema))
out := NewRow(schema, len(r.Values))

i := 0
for k, v := range r.Values {
handler := fieldHandlers[i]
i++

// Keep the original value unless a handler successfully casts it
out.Values[k] = v
if handler != nil {
if cast, err := handler(v); err == nil {
out.Values[k] = cast
}
}
out.Schema[k] = r.Schema[k]
}

return out
}

// stringHandler converts a parquet byte slice into a string; any other value
// is returned unchanged.
func stringHandler(s interface{}) (interface{}, error) {
switch s.(type) {
case []byte:
buf, ok := s.([]byte)
if !ok {
return nil, fmt.Errorf("failed to get bytes from the interface")
}

return string(buf), nil
}

return s, nil
}
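
A note on the cast handlers added above: the parquet reader can surface string columns as raw []byte values, so the handler converts them before the row is stored and written out. The snippet below is a minimal, self-contained sketch of that pattern; fieldHandler and stringHandler mirror the names in the diff, while handlerFor and main are illustrative stand-ins rather than the package's actual wiring.

package main

import "fmt"

// fieldHandler mirrors the type introduced in from_parquet.go: it casts a raw
// parquet value into the representation the block encoder expects.
type fieldHandler func(interface{}) (interface{}, error)

// stringHandler converts []byte values into strings and leaves everything
// else untouched.
func stringHandler(v interface{}) (interface{}, error) {
	if buf, ok := v.([]byte); ok {
		return string(buf), nil
	}
	return v, nil
}

// handlerFor picks a handler from the column's type name, in the same spirit
// as getReverseHandler in the diff.
func handlerFor(typ string) fieldHandler {
	switch typ {
	case "string":
		return stringHandler
	}
	return nil
}

func main() {
	raw := interface{}([]byte("event_name"))

	// Apply the handler only when one exists for the column type, much as
	// FromParquetBy does before calling row.Set.
	if h := handlerFor("string"); h != nil {
		if cast, err := h(raw); err == nil {
			raw = cast
		}
	}

	fmt.Printf("%T: %v\n", raw, raw) // string: event_name
}
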
2 changes: 2 additions & 0 deletions internal/encoding/merge/merge.go
@@ -18,6 +18,8 @@ func New(mergeFunc string) (Func, error) {
switch strings.ToLower(mergeFunc) {
case "orc":
return ToOrc, nil
case "parquet":
return ToParquet, nil
case "": // Default to "orc" so we don't break existing configs
return ToOrc, nil
}
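
With the merge.New change above, selecting the new output format is a matter of configuration: the writer asks merge.New for an encoder by name, and "parquet" now resolves to ToParquet while ORC remains the default for empty values. The sketch below is a self-contained illustration of that lookup; Func, toOrc, toParquet and newMerge are simplified stand-ins, not the real merge package signatures (the real merge functions take blocks and a schema, as the benchmark below shows).

package main

import (
	"fmt"
	"strings"
)

// Func is a simplified stand-in for merge.Func.
type Func func() string

func toOrc() string     { return "orc-encoded bytes" }
func toParquet() string { return "parquet-encoded bytes" }

// newMerge mirrors the dispatch in merge.New: resolve a format name to the
// function that encodes blocks into that format.
func newMerge(mergeFunc string) (Func, error) {
	switch strings.ToLower(mergeFunc) {
	case "orc":
		return toOrc, nil
	case "parquet":
		return toParquet, nil
	case "": // default to "orc" so existing configs keep working
		return toOrc, nil
	}
	return nil, fmt.Errorf("unsupported merge function %q", mergeFunc)
}

func main() {
	merge, err := newMerge("parquet")
	if err != nil {
		panic(err)
	}
	fmt.Println(merge()) // parquet-encoded bytes
}
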
10 changes: 10 additions & 0 deletions internal/encoding/merge/merge_test.go
@@ -12,6 +12,7 @@ import (
// BenchmarkFlush runs a benchmark for a Merge function for flushing
// To run it, go in the directory and do 'go test -benchmem -bench=. -benchtime=1s'
// BenchmarkMerge/orc-8 1 7195029600 ns/op 2101578032 B/op 36859501 allocs/op
// BenchmarkMerge/parquet-12 1 18666411036 ns/op 5142058320 B/op 115850463 allocs/op
func BenchmarkMerge(b *testing.B) {

// Append some files
@@ -27,6 +28,15 @@ func BenchmarkMerge(b *testing.B) {
ToOrc(blocks, blocks[0].Schema())
}
})

// Run the parquet benchmark
b.Run("parquet", func(b *testing.B) {
b.ResetTimer()
b.ReportAllocs()
for n := 0; n < b.N; n++ {
ToParquet(blocks, blocks[0].Schema())
}
})
}

func TestMergeNew(t *testing.T) {