-
Notifications
You must be signed in to change notification settings - Fork 3
/
partition.go
74 lines (67 loc) · 5.72 KB
/
partition.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
package sif
// A Partition is a portion of a columnar dataset, consisting of multiple Rows.
// Partitions are not generally interacted with directly, instead being
// manipulated in parallel by DataFrame Tasks.
type Partition interface {
ID() string // ID retrieves the ID of this Partition
GetMaxRows() int // GetMaxRows retrieves the maximum number of rows in this Partition
GetNumRows() int // GetNumRows retrieves the number of rows in this Partition
GetRow(rowNum int) Row // GetRow retrieves a specific row from this Partition
GetSchema() Schema // GetSchema retrieves the schema for this Partition
}
// SerializablePartition is a Partition which supports compression and
// serialization.
type SerializablePartition interface {
	Partition

	// SetCompressor defines the Compressor for this Partition. It must be
	// called before the Partition is serialized.
	SetCompressor(Compressor)

	// ToBytes serializes the Partition into a byte array suitable for
	// persistence to disk.
	ToBytes() ([]byte, error)
}
// BuildablePartition is a Partition which can be built up row by row. It is
// used in the implementation of DataSources and Parsers.
type BuildablePartition interface {
	Partition

	// CreateTempRow returns a Row.
	// NOTE(review): presumably a scratch Row intended for use with
	// AppendEmptyRowData — confirm against the implementation.
	CreateTempRow() Row

	// CanInsertRowData checks whether row can be inserted into this Partition.
	CanInsertRowData(row Row) error

	// AppendRowData adds row to the end of this Partition, provided the
	// Partition isn't full and row matches this Partition's schema.
	AppendRowData(row Row) error

	// AppendEmptyRowData is a convenient way to add an empty Row to the end of
	// this Partition. The Row is returned so that Row methods can be used to
	// populate it.
	AppendEmptyRowData(tempRow Row) (Row, error)

	// InsertRowData inserts row at position pos within this Partition,
	// provided the Partition isn't full and row matches this Partition's
	// Schema. Other Rows are shifted as necessary.
	InsertRowData(row Row, pos int) error

	// TruncateRowData zeroes out rows, working from the current last row
	// towards the beginning of the Partition.
	TruncateRowData(numRows int)

	// ForEachRow iterates over the Rows in this Partition.
	ForEachRow(fn MapOperation) error
}
// KeyablePartition is a Partition which can be keyed. It is used in the
// implementation of Partition shuffling and reduction.
type KeyablePartition interface {
	// IsKeyed reports true iff this Partition has been keyed with KeyRows.
	IsKeyed() bool

	// KeyRows generates hash keys for a row from a key column. It attempts to
	// manipulate the partition in-place, and falls back to creating a fresh
	// partition if there are row errors.
	KeyRows(kfn KeyingOperation) (OperablePartition, error)

	// GetKey returns the key for a particular row number. An error is returned
	// if the Partition is not keyed.
	GetKey(rowNum int) (uint64, error)

	// GetRowKey returns the key for a particular row. An error is returned if
	// the Partition is not keyed.
	GetRowKey(row Row) (uint64, error)

	// GetKeyRange returns the keys for the given range of rows. An error is
	// returned if the Partition is not keyed.
	GetKeyRange(start, end int) ([]uint64, error)
}
// OperablePartition is a Partition which can be operated on: its columns can
// be altered and its rows mapped, flat-mapped, filtered and keyed.
type OperablePartition interface {
	Partition
	KeyablePartition

	// UpdateSchema sets the public schema of a Partition.
	UpdateSchema(currentSchema Schema)

	// AddColumn adds space for a new column to this Partition.
	AddColumn(accessor ColumnAccessor) (OperablePartition, error)

	// RenameColumn modifies the name for a column in Partition data.
	RenameColumn(colName string, newAccessor ColumnAccessor) (OperablePartition, error)

	// RemoveColumn removes a column's data and metadata from this Partition.
	RemoveColumn(colName string) (OperablePartition, error)

	// MapRows runs a MapOperation on each row in this Partition, manipulating
	// the rows in-place. It falls back to creating a fresh partition if
	// PartitionRowErrors occur.
	MapRows(fn MapOperation) (OperablePartition, error)

	// FlatMapRows runs a FlatMapOperation on each row in this Partition,
	// creating new Partitions.
	FlatMapRows(fn FlatMapOperation) ([]OperablePartition, error)

	// FilterRows filters the Rows in the current Partition, creating a new
	// one. A Row is retained if the FilterOperation returns true.
	FilterRows(fn FilterOperation) (OperablePartition, error)
}
// CollectedPartition is a Partition which has been collected.
type CollectedPartition interface {
	Partition

	// ForEachRow iterates over the Rows in this Partition.
	ForEachRow(fn MapOperation) error
}
// ReduceablePartition is a Partition which can be stored in a PartitionIndex.
// It is used in the implementation of Partition reduction.
type ReduceablePartition interface {
	BuildablePartition
	SerializablePartition
	KeyablePartition

	// PopulateTempRow takes a temporary Row and a row index.
	// NOTE(review): presumably points tempRow at the data of row idx — confirm
	// against the implementation.
	PopulateTempRow(tempRow Row, idx int)

	// FindFirstKey takes a key and returns an int position.
	// NOTE(review): presumably the position of the first row with the given
	// key — confirm.
	// PRECONDITION: Partition must already be sorted by key.
	FindFirstKey(key uint64) (int, error)

	// FindLastKey takes a key and returns an int position.
	// NOTE(review): presumably the position of the last row with the given
	// key — confirm.
	// PRECONDITION: Partition must already be sorted by key.
	FindLastKey(key uint64) (int, error)

	// FindFirstRowKey takes a key buffer, a key and a KeyingOperation, and
	// returns an int position.
	// NOTE(review): how keyBuf and keyfn refine the lookup is not visible
	// here — confirm against the implementation.
	// PRECONDITION: Partition must already be sorted by key.
	FindFirstRowKey(keyBuf []byte, key uint64, keyfn KeyingOperation) (int, error)

	// FindLastRowKey takes a key buffer, a key and a KeyingOperation, and
	// returns an int position.
	// NOTE(review): how keyBuf and keyfn refine the lookup is not visible
	// here — confirm against the implementation.
	// PRECONDITION: Partition must already be sorted by key.
	FindLastRowKey(keyBuf []byte, key uint64, keyfn KeyingOperation) (int, error)

	// AverageKeyValue is the average value of key within this sorted, keyed
	// Partition.
	AverageKeyValue() (uint64, error)

	// Split splits a Partition into two Partitions. The split position ends up
	// in the right Partition.
	Split(pos int) (ReduceablePartition, ReduceablePartition, error)

	// BalancedSplit splits a Partition into two Partitions; the split position
	// ends up in the right Partition.
	// NOTE(review): the meaning of the uint64 result and how the split
	// position is chosen are not visible here — confirm.
	BalancedSplit() (uint64, ReduceablePartition, ReduceablePartition, error)
}