Skip to content

Commit 103dca3

Browse files
committed
move dictionary related APIs from ColumnVector to WritableColumnVector
1 parent 7453ab0 commit 103dca3

File tree

4 files changed

+52
-66
lines changed

4 files changed

+52
-66
lines changed

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ private void decodeDictionaryIds(
239239
int rowId,
240240
int num,
241241
WritableColumnVector column,
242-
ColumnVector dictionaryIds) {
242+
WritableColumnVector dictionaryIds) {
243243
switch (descriptor.getType()) {
244244
case INT32:
245245
if (column.dataType() == DataTypes.IntegerType ||

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,6 @@ public int[] getInts(int rowId, int count) {
159159
return array;
160160
}
161161

162-
@Override
163-
public int getDictId(int rowId) {
164-
throw new UnsupportedOperationException();
165-
}
166-
167162
//
168163
// APIs dealing with Longs
169164
//

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java

Lines changed: 16 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -22,32 +22,29 @@
2222
import org.apache.spark.unsafe.types.UTF8String;
2323

2424
/**
25-
* This class represents a column of values and provides the main APIs to access the data
26-
* values. It supports all the types and contains get APIs as well as their batched versions.
27-
* The batched versions are preferable whenever possible.
25+
* This class represents in-memory values of a column and provides the main APIs to access the data.
26+
* It supports all the types and contains get APIs as well as their batched versions. The batched
27+
* versions are considered to be faster and preferable whenever possible.
2828
*
2929
* To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these
30-
* columns have child columns. All of the data is stored in the child columns and the parent column
31-
* contains nullability, and in the case of Arrays, the lengths and offsets into the child column.
32-
* Lengths and offsets are encoded identically to INTs.
30+
* columns have child columns. All of the data are stored in the child columns and the parent column
31+
* only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child
32+
* column and are encoded identically to INTs.
33+
*
3334
* Maps are just a special case of a two field struct.
3435
*
3536
* Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values
36-
* in the current RowBatch.
37-
*
38-
* A ColumnVector should be considered immutable once originally created.
39-
*
40-
* ColumnVectors are intended to be reused.
37+
* in the current batch.
4138
*/
4239
public abstract class ColumnVector implements AutoCloseable {
40+
4341
/**
4442
* Returns the data type of this column.
4543
*/
4644
public final DataType dataType() { return type; }
4745

4846
/**
4947
* Cleans up memory for this column. The column is not usable after this.
50-
* TODO: this should probably have ref-counted semantics.
5148
*/
5249
public abstract void close();
5350

@@ -107,13 +104,6 @@ public abstract class ColumnVector implements AutoCloseable {
107104
*/
108105
public abstract int[] getInts(int rowId, int count);
109106

110-
/**
111-
* Returns the dictionary Id for rowId.
112-
* This should only be called when the ColumnVector is dictionaryIds.
113-
* We have this separate method for dictionaryIds as per SPARK-16928.
114-
*/
115-
public abstract int getDictId(int rowId);
116-
117107
/**
118108
* Returns the value for rowId.
119109
*/
@@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable {
145135
public abstract double[] getDoubles(int rowId, int count);
146136

147137
/**
148-
* Returns the length of the array at rowid.
138+
* Returns the length of the array for rowId.
149139
*/
150140
public abstract int getArrayLength(int rowId);
151141

152142
/**
153-
* Returns the offset of the array at rowid.
143+
* Returns the offset of the array for rowId.
154144
*/
155145
public abstract int getArrayOffset(int rowId);
156146

157147
/**
158-
* Returns a utility object to get structs.
148+
* Returns the struct for rowId.
159149
*/
160150
public final ColumnarRow getStruct(int rowId) {
161151
return new ColumnarRow(this, rowId);
162152
}
163153

164154
/**
165-
* Returns a utility object to get structs.
166-
* provided to keep API compatibility with InternalRow for code generation
155+
* A special version of {@link #getShort(int)}, which is only used as an adapter for Spark codegen
156+
* framework, the second parameter is totally ignored.
167157
*/
168158
public final ColumnarRow getStruct(int rowId, int size) {
169159
return getStruct(rowId);
170160
}
171161

172162
/**
173-
* Returns the array at rowid.
163+
* Returns the array for rowId.
174164
*/
175165
public final ColumnarArray getArray(int rowId) {
176166
return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId));
177167
}
178168

179169
/**
180-
* Returns the value for rowId.
170+
* Returns the map for rowId.
181171
*/
182172
public MapData getMap(int ordinal) {
183173
throw new UnsupportedOperationException();
@@ -214,30 +204,6 @@ public MapData getMap(int ordinal) {
214204
*/
215205
protected DataType type;
216206

217-
/**
218-
* The Dictionary for this column.
219-
*
220-
* If it's not null, will be used to decode the value in getXXX().
221-
*/
222-
protected Dictionary dictionary;
223-
224-
/**
225-
* Reusable column for ids of dictionary.
226-
*/
227-
protected ColumnVector dictionaryIds;
228-
229-
/**
230-
* Returns true if this column has a dictionary.
231-
*/
232-
public boolean hasDictionary() { return this.dictionary != null; }
233-
234-
/**
235-
* Returns the underlying integer column for ids of dictionary.
236-
*/
237-
public ColumnVector getDictionaryIds() {
238-
return dictionaryIds;
239-
}
240-
241207
/**
242208
* Sets up the common state and also handles creating the child columns if this is a nested
243209
* type.

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@
3636
* elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas),
3737
* the lengths are known up front.
3838
*
39-
* A ColumnVector should be considered immutable once originally created. In other words, it is not
40-
* valid to call put APIs after reads until reset() is called.
39+
* A WritableColumnVector should be considered immutable once originally created. In other words,
40+
* it is not valid to call put APIs after reads until reset() is called.
41+
*
42+
* WritableColumnVector are intended to be reused.
4143
*/
4244
public abstract class WritableColumnVector extends ColumnVector {
4345

@@ -105,6 +107,37 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
105107
@Override
106108
public boolean anyNullsSet() { return anyNullsSet; }
107109

110+
/**
111+
* Returns the dictionary Id for rowId.
112+
* This should only be called when the ColumnVector is dictionaryIds.
113+
* We have this separate method for dictionaryIds as per SPARK-16928.
114+
*/
115+
public abstract int getDictId(int rowId);
116+
117+
/**
118+
* The Dictionary for this column.
119+
*
120+
* If it's not null, will be used to decode the value in getXXX().
121+
*/
122+
protected Dictionary dictionary;
123+
124+
/**
125+
* Reusable column for ids of dictionary.
126+
*/
127+
protected WritableColumnVector dictionaryIds;
128+
129+
/**
130+
* Returns true if this column has a dictionary.
131+
*/
132+
public boolean hasDictionary() { return this.dictionary != null; }
133+
134+
/**
135+
* Returns the underlying integer column for ids of dictionary.
136+
*/
137+
public WritableColumnVector getDictionaryIds() {
138+
return dictionaryIds;
139+
}
140+
108141
/**
109142
* Ensures that there is enough storage to store capacity elements. That is, the put() APIs
110143
* must work for all rowIds < capacity.
@@ -635,14 +668,6 @@ public WritableColumnVector reserveDictionaryIds(int capacity) {
635668
return dictionaryIds;
636669
}
637670

638-
/**
639-
* Returns the underlying integer column for ids of dictionary.
640-
*/
641-
@Override
642-
public WritableColumnVector getDictionaryIds() {
643-
return (WritableColumnVector) dictionaryIds;
644-
}
645-
646671
/**
647672
* Reserve a new column.
648673
*/

0 commit comments

Comments
 (0)