[SPARK-22775][SQL] move dictionary related APIs from ColumnVector to WritableColumnVector

cloud-fan · cloud-fan · commit 7d8e2ca7f866 · 2017-12-14T19:33:54.000+08:00
## What changes were proposed in this pull request? These dictionary related APIs are special to `WritableColumnVector` and should not be in `ColumnVector`, which will be public soon. ## How was this patch tested? existing tests Author: Wenchen Fan <wenchen@databricks.com> Closes #19970 from cloud-fan/final.
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
@@ -239,7 +239,7 @@ private void decodeDictionaryIds(
       int rowId,
       int num,
       WritableColumnVector column,
-      ColumnVector dictionaryIds) {
+      WritableColumnVector dictionaryIds) {
     switch (descriptor.getType()) {
       case INT32:
         if (column.dataType() == DataTypes.IntegerType ||
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java
@@ -159,11 +159,6 @@ public int[] getInts(int rowId, int count) {
     return array;
   }
 
-  @Override
-  public int getDictId(int rowId) {
-    throw new UnsupportedOperationException();
-  }
-
   //
   // APIs dealing with Longs
   //
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
@@ -22,32 +22,29 @@
 import org.apache.spark.unsafe.types.UTF8String;
 
 /**
- * This class represents a column of values and provides the main APIs to access the data
- * values. It supports all the types and contains get APIs as well as their batched versions.
- * The batched versions are preferable whenever possible.
+ * This class represents in-memory values of a column and provides the main APIs to access the data.
+ * It supports all the types and contains get APIs as well as their batched versions. The batched
+ * versions are considered to be faster and preferable whenever possible.
  *
  * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these
- * columns have child columns. All of the data is stored in the child columns and the parent column
- * contains nullability, and in the case of Arrays, the lengths and offsets into the child column.
- * Lengths and offsets are encoded identically to INTs.
+ * columns have child columns. All of the data are stored in the child columns and the parent column
+ * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child
+ * column and are encoded identically to INTs.
+ *
  * Maps are just a special case of a two field struct.
  *
  * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values
- * in the current RowBatch.
- *
- * A ColumnVector should be considered immutable once originally created.
- *
- * ColumnVectors are intended to be reused.
+ * in the current batch.
  */
 public abstract class ColumnVector implements AutoCloseable {
+
   /**
    * Returns the data type of this column.
    */
   public final DataType dataType() { return type; }
 
   /**
    * Cleans up memory for this column. The column is not usable after this.
-   * TODO: this should probably have ref-counted semantics.
    */
   public abstract void close();
 
@@ -107,13 +104,6 @@ public abstract class ColumnVector implements AutoCloseable {
    */
   public abstract int[] getInts(int rowId, int count);
 
-  /**
-   * Returns the dictionary Id for rowId.
-   * This should only be called when the ColumnVector is dictionaryIds.
-   * We have this separate method for dictionaryIds as per SPARK-16928.
-   */
-  public abstract int getDictId(int rowId);
-
   /**
    * Returns the value for rowId.
    */
@@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable {
   public abstract double[] getDoubles(int rowId, int count);
 
   /**
-   * Returns the length of the array at rowid.
+   * Returns the length of the array for rowId.
    */
   public abstract int getArrayLength(int rowId);
 
   /**
-   * Returns the offset of the array at rowid.
+   * Returns the offset of the array for rowId.
    */
   public abstract int getArrayOffset(int rowId);
 
   /**
-   * Returns a utility object to get structs.
+   * Returns the struct for rowId.
    */
   public final ColumnarRow getStruct(int rowId) {
     return new ColumnarRow(this, rowId);
   }
 
   /**
-   * Returns a utility object to get structs.
-   * provided to keep API compatibility with InternalRow for code generation
+   * A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark
+   * codegen framework, the second parameter is totally ignored.
    */
   public final ColumnarRow getStruct(int rowId, int size) {
     return getStruct(rowId);
   }
 
   /**
-   * Returns the array at rowid.
+   * Returns the array for rowId.
    */
   public final ColumnarArray getArray(int rowId) {
     return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId));
   }
 
   /**
-   * Returns the value for rowId.
+   * Returns the map for rowId.
    */
   public MapData getMap(int ordinal) {
     throw new UnsupportedOperationException();
@@ -214,30 +204,6 @@ public MapData getMap(int ordinal) {
    */
   protected DataType type;
 
-  /**
-   * The Dictionary for this column.
-   *
-   * If it's not null, will be used to decode the value in getXXX().
-   */
-  protected Dictionary dictionary;
-
-  /**
-   * Reusable column for ids of dictionary.
-   */
-  protected ColumnVector dictionaryIds;
-
-  /**
-   * Returns true if this column has a dictionary.
-   */
-  public boolean hasDictionary() { return this.dictionary != null; }
-
-  /**
-   * Returns the underlying integer column for ids of dictionary.
-   */
-  public ColumnVector getDictionaryIds() {
-    return dictionaryIds;
-  }
-
   /**
    * Sets up the common state and also handles creating the child columns if this is a nested
    * type.
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
@@ -36,8 +36,10 @@
  * elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas),
  * the lengths are known up front.
  *
- * A ColumnVector should be considered immutable once originally created. In other words, it is not
- * valid to call put APIs after reads until reset() is called.
+ * A WritableColumnVector should be considered immutable once originally created. In other words,
+ * it is not valid to call put APIs after reads until reset() is called.
+ *
+ * WritableColumnVector are intended to be reused.
  */
 public abstract class WritableColumnVector extends ColumnVector {
 
@@ -105,6 +107,58 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
   @Override
   public boolean anyNullsSet() { return anyNullsSet; }
 
+  /**
+   * Returns the dictionary Id for rowId.
+   *
+   * This should only be called when this `WritableColumnVector` represents dictionaryIds.
+   * We have this separate method for dictionaryIds as per SPARK-16928.
+   */
+  public abstract int getDictId(int rowId);
+
+  /**
+   * The Dictionary for this column.
+   *
+   * If it's not null, will be used to decode the value in getXXX().
+   */
+  protected Dictionary dictionary;
+
+  /**
+   * Reusable column for ids of dictionary.
+   */
+  protected WritableColumnVector dictionaryIds;
+
+  /**
+   * Returns true if this column has a dictionary.
+   */
+  public boolean hasDictionary() { return this.dictionary != null; }
+
+  /**
+   * Returns the underlying integer column for ids of dictionary.
+   */
+  public WritableColumnVector getDictionaryIds() {
+    return dictionaryIds;
+  }
+
+  /**
+   * Update the dictionary.
+   */
+  public void setDictionary(Dictionary dictionary) {
+    this.dictionary = dictionary;
+  }
+
+  /**
+   * Reserve a integer column for ids of dictionary.
+   */
+  public WritableColumnVector reserveDictionaryIds(int capacity) {
+    if (dictionaryIds == null) {
+      dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
+    } else {
+      dictionaryIds.reset();
+      dictionaryIds.reserve(capacity);
+    }
+    return dictionaryIds;
+  }
+
   /**
    * Ensures that there is enough storage to store capacity elements. That is, the put() APIs
    * must work for all rowIds < capacity.
@@ -613,36 +667,6 @@ public final int appendStruct(boolean isNull) {
    */
   protected WritableColumnVector[] childColumns;
 
-  /**
-   * Update the dictionary.
-   */
-  public void setDictionary(Dictionary dictionary) {
-    this.dictionary = dictionary;
-  }
-
-  /**
-   * Reserve a integer column for ids of dictionary.
-   */
-  public WritableColumnVector reserveDictionaryIds(int capacity) {
-    WritableColumnVector dictionaryIds = (WritableColumnVector) this.dictionaryIds;
-    if (dictionaryIds == null) {
-      dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
-      this.dictionaryIds = dictionaryIds;
-    } else {
-      dictionaryIds.reset();
-      dictionaryIds.reserve(capacity);
-    }
-    return dictionaryIds;
-  }
-
-  /**
-   * Returns the underlying integer column for ids of dictionary.
-   */
-  @Override
-  public WritableColumnVector getDictionaryIds() {
-    return (WritableColumnVector) dictionaryIds;
-  }
-
   /**
    * Reserve a new column.
    */