
Commit 6d1c642

Using ColumnProjectionUtils to optimise RCFile and ORC column pruning

1 parent eb62fd3 commit 6d1c642
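For context, here is a minimal sketch (not part of this commit) of how ColumnProjectionUtils-driven pruning works: the scan records the IDs and names of the columns it actually needs in the Hive configuration, and the RCFile/ORC readers then skip deserializing everything else. The object name, method, and parameters below are illustrative only, and the sketch assumes the Hive 0.12-era ColumnProjectionUtils API that this file imports.

import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils
import scala.collection.JavaConverters._

object ColumnPruningSketch {
  // `requestedColumns` and `allColumns` are hypothetical stand-ins for the scan's
  // attributes and the table's full schema, respectively.
  def configure(conf: HiveConf, requestedColumns: Seq[String], allColumns: Seq[String]): Unit = {
    if (requestedColumns.size == allColumns.size) {
      // Every column is needed, so tell the readers not to prune anything.
      ColumnProjectionUtils.setFullyReadColumns(conf)
    } else {
      // Record exactly which column ordinals the readers must materialize.
      val ids: Seq[Integer] = requestedColumns.map(c => Int.box(allColumns.indexOf(c)))
      ColumnProjectionUtils.appendReadColumnIDs(conf, ids.asJava)
    }
    // Column names are appended in either case, mirroring the change below.
    ColumnProjectionUtils.appendReadColumnNames(conf, requestedColumns.asJava)
  }
}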

File tree

1 file changed: +42 -6 lines

sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveOperators.scala

Lines changed: 42 additions & 6 deletions
@@ -18,15 +18,18 @@
 package org.apache.spark.sql.hive.execution

 import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar}
+import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.metastore.MetaStoreUtils
 import org.apache.hadoop.hive.ql.Context
 import org.apache.hadoop.hive.ql.metadata.{Partition => HivePartition, Hive}
 import org.apache.hadoop.hive.ql.plan.{TableDesc, FileSinkDesc}
-import org.apache.hadoop.hive.serde2.Serializer
+import org.apache.hadoop.hive.serde.serdeConstants
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption
 import org.apache.hadoop.hive.serde2.objectinspector._
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveVarcharObjectInspector
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils
+import org.apache.hadoop.hive.serde2.{ColumnProjectionUtils, Serializer}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapred._

@@ -119,6 +122,38 @@ case class HiveTableScan(
     Cast(Literal(value), dataType).eval(null)
   }

+  private def addColumnMetadataToConf(hiveConf: HiveConf) {
+    // Specifies IDs and internal names of columns to be scanned.
+    val neededColumnIDs = attributes.map(a => relation.output.indexWhere(_.name == a.name): Integer)
+    val columnInternalNames = neededColumnIDs.map(HiveConf.getColumnInternalName(_)).mkString(",")
+
+    if (attributes.size == relation.output.size) {
+      ColumnProjectionUtils.setFullyReadColumns(hiveConf)
+    } else {
+      ColumnProjectionUtils.appendReadColumnIDs(hiveConf, neededColumnIDs)
+    }
+
+    ColumnProjectionUtils.appendReadColumnNames(hiveConf, attributes.map(_.name))
+
+    // Specifies types and object inspectors of columns to be scanned.
+    val structOI = ObjectInspectorUtils
+      .getStandardObjectInspector(
+        relation.tableDesc.getDeserializer.getObjectInspector,
+        ObjectInspectorCopyOption.JAVA)
+      .asInstanceOf[StructObjectInspector]
+
+    val columnTypeNames = structOI
+      .getAllStructFieldRefs
+      .map(_.getFieldObjectInspector)
+      .map(TypeInfoUtils.getTypeInfoFromObjectInspector(_).getTypeName)
+      .mkString(",")
+
+    hiveConf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypeNames)
+    hiveConf.set(serdeConstants.LIST_COLUMNS, columnInternalNames)
+  }
+
+  addColumnMetadataToConf(sc.hiveconf)
+
   @transient
   def inputRdd = if (!relation.hiveQlTable.isPartitioned) {
     hadoopReader.makeRDDForTable(relation.hiveQlTable)
@@ -156,18 +191,19 @@ case class HiveTableScan(
     } else {
       val mutableRow = new GenericMutableRow(attributes.length)
       val buffered = iterator.buffered
-
-      (buffered.head match {
+      val rowsAndPartitionKeys = buffered.head match {
         case Array(_, _) =>
           buffered.map { case Array(deserializedRow, partitionKeys: Array[String]) =>
            (deserializedRow, partitionKeys)
          }

        case _ =>
-          buffered.map { deserializedRow =>
-            (deserializedRow, Array.empty[String])
+          buffered.map {
+            (_, Array.empty[String])
          }
-      }).map { case (deserializedRow, partitionKeys: Array[String]) =>
+      }
+
+      rowsAndPartitionKeys.map { case (deserializedRow, partitionKeys) =>
        var i = 0

        while (i < attributes.length) {
