Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ private void decodeDictionaryIds(
int rowId,
int num,
WritableColumnVector column,
ColumnVector dictionaryIds) {
WritableColumnVector dictionaryIds) {
switch (descriptor.getType()) {
case INT32:
if (column.dataType() == DataTypes.IntegerType ||
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,6 @@ public int[] getInts(int rowId, int count) {
return array;
}

@Override
public int getDictId(int rowId) {
throw new UnsupportedOperationException();
}

//
// APIs dealing with Longs
//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,32 +22,29 @@
import org.apache.spark.unsafe.types.UTF8String;

/**
* This class represents a column of values and provides the main APIs to access the data
* values. It supports all the types and contains get APIs as well as their batched versions.
* The batched versions are preferable whenever possible.
* This class represents in-memory values of a column and provides the main APIs to access the data.
* It supports all the types and contains get APIs as well as their batched versions. The batched
* versions are considered to be faster and preferable whenever possible.
*
* To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these
* columns have child columns. All of the data is stored in the child columns and the parent column
* contains nullability, and in the case of Arrays, the lengths and offsets into the child column.
* Lengths and offsets are encoded identically to INTs.
* columns have child columns. All of the data are stored in the child columns and the parent column
* only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child
* column and are encoded identically to INTs.
*
* Maps are just a special case of a two field struct.
*
* Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values
* in the current RowBatch.
*
* A ColumnVector should be considered immutable once originally created.
*
* ColumnVectors are intended to be reused.
* in the current batch.
*/
public abstract class ColumnVector implements AutoCloseable {

/**
* Returns the data type of this column.
*/
public final DataType dataType() { return type; }

/**
* Cleans up memory for this column. The column is not usable after this.
* TODO: this should probably have ref-counted semantics.
*/
public abstract void close();

Expand Down Expand Up @@ -107,13 +104,6 @@ public abstract class ColumnVector implements AutoCloseable {
*/
public abstract int[] getInts(int rowId, int count);

/**
* Returns the dictionary Id for rowId.
* This should only be called when the ColumnVector is dictionaryIds.
* We have this separate method for dictionaryIds as per SPARK-16928.
*/
public abstract int getDictId(int rowId);

/**
* Returns the value for rowId.
*/
Expand Down Expand Up @@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable {
public abstract double[] getDoubles(int rowId, int count);

/**
* Returns the length of the array at rowid.
* Returns the length of the array for rowId.
*/
public abstract int getArrayLength(int rowId);

/**
* Returns the offset of the array at rowid.
* Returns the offset of the array for rowId.
*/
public abstract int getArrayOffset(int rowId);

/**
* Returns a utility object to get structs.
* Returns the struct for rowId.
*/
public final ColumnarRow getStruct(int rowId) {
return new ColumnarRow(this, rowId);
}

/**
* Returns a utility object to get structs.
* provided to keep API compatibility with InternalRow for code generation
* A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark
* codegen framework, the second parameter is totally ignored.
*/
public final ColumnarRow getStruct(int rowId, int size) {
return getStruct(rowId);
}

/**
* Returns the array at rowid.
* Returns the array for rowId.
*/
public final ColumnarArray getArray(int rowId) {
return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId));
}

/**
* Returns the value for rowId.
* Returns the map for rowId.
*/
public MapData getMap(int ordinal) {
throw new UnsupportedOperationException();
Expand Down Expand Up @@ -214,30 +204,6 @@ public MapData getMap(int ordinal) {
*/
protected DataType type;

/**
* The Dictionary for this column.
*
* If it's not null, will be used to decode the value in getXXX().
*/
protected Dictionary dictionary;

/**
* Reusable column for ids of dictionary.
*/
protected ColumnVector dictionaryIds;

/**
* Returns true if this column has a dictionary.
*/
public boolean hasDictionary() { return this.dictionary != null; }

/**
* Returns the underlying integer column for ids of dictionary.
*/
public ColumnVector getDictionaryIds() {
return dictionaryIds;
}

/**
* Sets up the common state and also handles creating the child columns if this is a nested
* type.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@
* elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas),
* the lengths are known up front.
*
* A ColumnVector should be considered immutable once originally created. In other words, it is not
* valid to call put APIs after reads until reset() is called.
* A WritableColumnVector should be considered immutable once originally created. In other words,
* it is not valid to call put APIs after reads until reset() is called.
*
* WritableColumnVector are intended to be reused.
*/
public abstract class WritableColumnVector extends ColumnVector {

Expand Down Expand Up @@ -105,6 +107,58 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
@Override
public boolean anyNullsSet() { return anyNullsSet; }

/**
* Returns the dictionary Id for rowId.
*
* This should only be called when this `WritableColumnVector` represents dictionaryIds.
* We have this separate method for dictionaryIds as per SPARK-16928.
*/
public abstract int getDictId(int rowId);

/**
* The Dictionary for this column.
*
* If it's not null, will be used to decode the value in getXXX().
*/
protected Dictionary dictionary;

/**
* Reusable column for ids of dictionary.
*/
protected WritableColumnVector dictionaryIds;

/**
* Returns true if this column has a dictionary.
*/
public boolean hasDictionary() { return this.dictionary != null; }

/**
* Returns the underlying integer column for ids of dictionary.
*/
public WritableColumnVector getDictionaryIds() {
return dictionaryIds;
}

/**
* Update the dictionary.
*/
public void setDictionary(Dictionary dictionary) {
this.dictionary = dictionary;
}

/**
* Reserve a integer column for ids of dictionary.
*/
public WritableColumnVector reserveDictionaryIds(int capacity) {
if (dictionaryIds == null) {
dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
} else {
dictionaryIds.reset();
dictionaryIds.reserve(capacity);
}
return dictionaryIds;
}

/**
* Ensures that there is enough storage to store capacity elements. That is, the put() APIs
* must work for all rowIds < capacity.
Expand Down Expand Up @@ -613,36 +667,6 @@ public final int appendStruct(boolean isNull) {
*/
protected WritableColumnVector[] childColumns;

/**
* Update the dictionary.
*/
public void setDictionary(Dictionary dictionary) {
this.dictionary = dictionary;
}

/**
* Reserve a integer column for ids of dictionary.
*/
public WritableColumnVector reserveDictionaryIds(int capacity) {
WritableColumnVector dictionaryIds = (WritableColumnVector) this.dictionaryIds;
if (dictionaryIds == null) {
dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
this.dictionaryIds = dictionaryIds;
} else {
dictionaryIds.reset();
dictionaryIds.reserve(capacity);
}
return dictionaryIds;
}

/**
* Returns the underlying integer column for ids of dictionary.
*/
@Override
public WritableColumnVector getDictionaryIds() {
return (WritableColumnVector) dictionaryIds;
}

/**
* Reserve a new column.
*/
Expand Down