Skip to content

Commit 7d8e2ca

Browse files
committed
[SPARK-22775][SQL] move dictionary related APIs from ColumnVector to WritableColumnVector
## What changes were proposed in this pull request? These dictionary related APIs are special to `WritableColumnVector` and should not be in `ColumnVector`, which will be public soon. ## How was this patch tested? existing tests Author: Wenchen Fan <[email protected]> Closes #19970 from cloud-fan/final.
1 parent c3dd2a2 commit 7d8e2ca

File tree

4 files changed

+73
-88
lines changed

4 files changed

+73
-88
lines changed

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ private void decodeDictionaryIds(
239239
int rowId,
240240
int num,
241241
WritableColumnVector column,
242-
ColumnVector dictionaryIds) {
242+
WritableColumnVector dictionaryIds) {
243243
switch (descriptor.getType()) {
244244
case INT32:
245245
if (column.dataType() == DataTypes.IntegerType ||

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,6 @@ public int[] getInts(int rowId, int count) {
159159
return array;
160160
}
161161

162-
@Override
163-
public int getDictId(int rowId) {
164-
throw new UnsupportedOperationException();
165-
}
166-
167162
//
168163
// APIs dealing with Longs
169164
//

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java

Lines changed: 16 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -22,32 +22,29 @@
2222
import org.apache.spark.unsafe.types.UTF8String;
2323

2424
/**
25-
* This class represents a column of values and provides the main APIs to access the data
26-
* values. It supports all the types and contains get APIs as well as their batched versions.
27-
* The batched versions are preferable whenever possible.
25+
* This class represents in-memory values of a column and provides the main APIs to access the data.
26+
* It supports all the types and contains get APIs as well as their batched versions. The batched
27+
* versions are considered to be faster and preferable whenever possible.
2828
*
2929
* To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these
30-
* columns have child columns. All of the data is stored in the child columns and the parent column
31-
* contains nullability, and in the case of Arrays, the lengths and offsets into the child column.
32-
* Lengths and offsets are encoded identically to INTs.
30+
* columns have child columns. All of the data are stored in the child columns and the parent column
31+
* only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child
32+
* column and are encoded identically to INTs.
33+
*
3334
* Maps are just a special case of a two field struct.
3435
*
3536
* Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values
36-
* in the current RowBatch.
37-
*
38-
* A ColumnVector should be considered immutable once originally created.
39-
*
40-
* ColumnVectors are intended to be reused.
37+
* in the current batch.
4138
*/
4239
public abstract class ColumnVector implements AutoCloseable {
40+
4341
/**
4442
* Returns the data type of this column.
4543
*/
4644
public final DataType dataType() { return type; }
4745

4846
/**
4947
* Cleans up memory for this column. The column is not usable after this.
50-
* TODO: this should probably have ref-counted semantics.
5148
*/
5249
public abstract void close();
5350

@@ -107,13 +104,6 @@ public abstract class ColumnVector implements AutoCloseable {
107104
*/
108105
public abstract int[] getInts(int rowId, int count);
109106

110-
/**
111-
* Returns the dictionary Id for rowId.
112-
* This should only be called when the ColumnVector is dictionaryIds.
113-
* We have this separate method for dictionaryIds as per SPARK-16928.
114-
*/
115-
public abstract int getDictId(int rowId);
116-
117107
/**
118108
* Returns the value for rowId.
119109
*/
@@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable {
145135
public abstract double[] getDoubles(int rowId, int count);
146136

147137
/**
148-
* Returns the length of the array at rowid.
138+
* Returns the length of the array for rowId.
149139
*/
150140
public abstract int getArrayLength(int rowId);
151141

152142
/**
153-
* Returns the offset of the array at rowid.
143+
* Returns the offset of the array for rowId.
154144
*/
155145
public abstract int getArrayOffset(int rowId);
156146

157147
/**
158-
* Returns a utility object to get structs.
148+
* Returns the struct for rowId.
159149
*/
160150
public final ColumnarRow getStruct(int rowId) {
161151
return new ColumnarRow(this, rowId);
162152
}
163153

164154
/**
165-
* Returns a utility object to get structs.
166-
* provided to keep API compatibility with InternalRow for code generation
155+
* A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark
156+
* codegen framework, the second parameter is totally ignored.
167157
*/
168158
public final ColumnarRow getStruct(int rowId, int size) {
169159
return getStruct(rowId);
170160
}
171161

172162
/**
173-
* Returns the array at rowid.
163+
* Returns the array for rowId.
174164
*/
175165
public final ColumnarArray getArray(int rowId) {
176166
return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId));
177167
}
178168

179169
/**
180-
* Returns the value for rowId.
170+
* Returns the map for rowId.
181171
*/
182172
public MapData getMap(int ordinal) {
183173
throw new UnsupportedOperationException();
@@ -214,30 +204,6 @@ public MapData getMap(int ordinal) {
214204
*/
215205
protected DataType type;
216206

217-
/**
218-
* The Dictionary for this column.
219-
*
220-
* If it's not null, will be used to decode the value in getXXX().
221-
*/
222-
protected Dictionary dictionary;
223-
224-
/**
225-
* Reusable column for ids of dictionary.
226-
*/
227-
protected ColumnVector dictionaryIds;
228-
229-
/**
230-
* Returns true if this column has a dictionary.
231-
*/
232-
public boolean hasDictionary() { return this.dictionary != null; }
233-
234-
/**
235-
* Returns the underlying integer column for ids of dictionary.
236-
*/
237-
public ColumnVector getDictionaryIds() {
238-
return dictionaryIds;
239-
}
240-
241207
/**
242208
* Sets up the common state and also handles creating the child columns if this is a nested
243209
* type.

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java

Lines changed: 56 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@
3636
* elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas),
3737
* the lengths are known up front.
3838
*
39-
* A ColumnVector should be considered immutable once originally created. In other words, it is not
40-
* valid to call put APIs after reads until reset() is called.
39+
* A WritableColumnVector should be considered immutable once originally created. In other words,
40+
* it is not valid to call put APIs after reads until reset() is called.
41+
*
42+
* WritableColumnVector are intended to be reused.
4143
*/
4244
public abstract class WritableColumnVector extends ColumnVector {
4345

@@ -105,6 +107,58 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
105107
@Override
106108
public boolean anyNullsSet() { return anyNullsSet; }
107109

110+
/**
111+
* Returns the dictionary Id for rowId.
112+
*
113+
* This should only be called when this `WritableColumnVector` represents dictionaryIds.
114+
* We have this separate method for dictionaryIds as per SPARK-16928.
115+
*/
116+
public abstract int getDictId(int rowId);
117+
118+
/**
119+
* The Dictionary for this column.
120+
*
121+
* If it's not null, will be used to decode the value in getXXX().
122+
*/
123+
protected Dictionary dictionary;
124+
125+
/**
126+
* Reusable column for ids of dictionary.
127+
*/
128+
protected WritableColumnVector dictionaryIds;
129+
130+
/**
131+
* Returns true if this column has a dictionary.
132+
*/
133+
public boolean hasDictionary() { return this.dictionary != null; }
134+
135+
/**
136+
* Returns the underlying integer column for ids of dictionary.
137+
*/
138+
public WritableColumnVector getDictionaryIds() {
139+
return dictionaryIds;
140+
}
141+
142+
/**
143+
* Update the dictionary.
144+
*/
145+
public void setDictionary(Dictionary dictionary) {
146+
this.dictionary = dictionary;
147+
}
148+
149+
/**
150+
* Reserve a integer column for ids of dictionary.
151+
*/
152+
public WritableColumnVector reserveDictionaryIds(int capacity) {
153+
if (dictionaryIds == null) {
154+
dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
155+
} else {
156+
dictionaryIds.reset();
157+
dictionaryIds.reserve(capacity);
158+
}
159+
return dictionaryIds;
160+
}
161+
108162
/**
109163
* Ensures that there is enough storage to store capacity elements. That is, the put() APIs
110164
* must work for all rowIds < capacity.
@@ -613,36 +667,6 @@ public final int appendStruct(boolean isNull) {
613667
*/
614668
protected WritableColumnVector[] childColumns;
615669

616-
/**
617-
* Update the dictionary.
618-
*/
619-
public void setDictionary(Dictionary dictionary) {
620-
this.dictionary = dictionary;
621-
}
622-
623-
/**
624-
* Reserve a integer column for ids of dictionary.
625-
*/
626-
public WritableColumnVector reserveDictionaryIds(int capacity) {
627-
WritableColumnVector dictionaryIds = (WritableColumnVector) this.dictionaryIds;
628-
if (dictionaryIds == null) {
629-
dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
630-
this.dictionaryIds = dictionaryIds;
631-
} else {
632-
dictionaryIds.reset();
633-
dictionaryIds.reserve(capacity);
634-
}
635-
return dictionaryIds;
636-
}
637-
638-
/**
639-
* Returns the underlying integer column for ids of dictionary.
640-
*/
641-
@Override
642-
public WritableColumnVector getDictionaryIds() {
643-
return (WritableColumnVector) dictionaryIds;
644-
}
645-
646670
/**
647671
* Reserve a new column.
648672
*/

0 commit comments

Comments
 (0)