Commit 529d2c0

xinlifoobar and alamb authored

Extract Date32 parquet statistics as Date32Array rather than Int32Array (#10593)

* Fix bug: Date32 statistics were extracted as Int32Array rather than Date32Array
* Add round-trip unit tests
* Update arrow_statistics.rs
* Remove unreachable code

Co-authored-by: Andrew Lamb <[email protected]>

1 parent 656da83 · commit 529d2c0
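
As an illustrative aside (not part of the commit), the sketch below shows what the fix means for callers, using only arrow-rs and the day count 18262 that appears in the tests further down: Date32 stores days since the Unix epoch in an i32, so the same value can be surfaced either as a bare Int32Array (the old behavior) or as a Date32Array (the new behavior), and only the latter carries date semantics.

use std::sync::Arc;

use arrow_array::{Array, ArrayRef, Date32Array, Int32Array};
use arrow_schema::DataType;

fn main() {
    // 18262 days after 1970-01-01 is 2020-01-01, the min value used in the tests below.
    let days_since_epoch = 18262_i32;

    // Old behavior: the statistic surfaced as the raw physical value.
    let before: ArrayRef = Arc::new(Int32Array::from(vec![days_since_epoch]));
    assert_eq!(before.data_type(), &DataType::Int32);

    // New behavior: the same value, typed as a date, so it can be compared
    // directly against Date32 columns and date literals.
    let after: ArrayRef = Arc::new(Date32Array::from(vec![days_since_epoch]));
    assert_eq!(after.data_type(), &DataType::Date32);
}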

File tree (2 files changed, +116 -18 lines changed):

* datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
* datafusion/core/tests/parquet/arrow_statistics.rs

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

Lines changed: 93 additions & 3 deletions
@@ -75,6 +75,12 @@ macro_rules! get_statistic {
                     *scale,
                 ))
             }
+            Some(DataType::Date32) => {
+                Some(ScalarValue::Date32(Some(*s.$func())))
+            }
+            Some(DataType::Date64) => {
+                Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)))
+            }
             _ => Some(ScalarValue::Int32(Some(*s.$func()))),
         }
     }
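
A hedged note on the Date64 arm just above (mine, not the commit's): as the match shows, the parquet statistic here is still an i32 day count, while Arrow's Date64 expects milliseconds since the epoch, hence the `* 24 * 60 * 60 * 1000` scaling. A quick check of that arithmetic against a known date:

fn main() {
    // Same scaling as the Date64 arm above: days since the epoch -> milliseconds.
    let days: i32 = 18262; // 2020-01-01
    let millis = i64::from(days) * 24 * 60 * 60 * 1000;
    // 2020-01-01T00:00:00Z is 1_577_836_800 seconds after the epoch, i.e. this many ms.
    assert_eq!(millis, 1_577_836_800_000);
}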
@@ -363,10 +369,12 @@ impl<'a> StatisticsConverter<'a> {
 #[cfg(test)]
 mod test {
     use super::*;
+    use arrow::compute::kernels::cast_utils::Parser;
+    use arrow::datatypes::{Date32Type, Date64Type};
     use arrow_array::{
-        new_null_array, Array, BinaryArray, BooleanArray, Decimal128Array, Float32Array,
-        Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, StructArray,
-        TimestampNanosecondArray,
+        new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array,
+        Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch,
+        StringArray, StructArray, TimestampNanosecondArray,
     };
     use arrow_schema::{Field, SchemaRef};
     use bytes::Bytes;
@@ -664,6 +672,68 @@ mod test {
         .run()
     }

+    #[test]
+    fn roundtrip_date32() {
+        Test {
+            input: date32_array(vec![
+                // row group 1
+                Some("2021-01-01"),
+                None,
+                Some("2021-01-03"),
+                // row group 2
+                Some("2021-01-01"),
+                Some("2021-01-05"),
+                None,
+                // row group 3
+                None,
+                None,
+                None,
+            ]),
+            expected_min: date32_array(vec![
+                Some("2021-01-01"),
+                Some("2021-01-01"),
+                None,
+            ]),
+            expected_max: date32_array(vec![
+                Some("2021-01-03"),
+                Some("2021-01-05"),
+                None,
+            ]),
+        }
+        .run()
+    }
+
+    #[test]
+    fn roundtrip_date64() {
+        Test {
+            input: date64_array(vec![
+                // row group 1
+                Some("2021-01-01"),
+                None,
+                Some("2021-01-03"),
+                // row group 2
+                Some("2021-01-01"),
+                Some("2021-01-05"),
+                None,
+                // row group 3
+                None,
+                None,
+                None,
+            ]),
+            expected_min: date64_array(vec![
+                Some("2021-01-01"),
+                Some("2021-01-01"),
+                None,
+            ]),
+            expected_max: date64_array(vec![
+                Some("2021-01-03"),
+                Some("2021-01-05"),
+                None,
+            ]),
+        }
+        .run()
+    }
+
     #[test]
     fn struct_and_non_struct() {
         // Ensures that statistics for an array that appears *after* a struct
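
One observation on the round-trip tests above (an assumption about the `Test` harness, whose internals are not shown in this diff): the `// row group N` comments suggest each group of three inputs is written as its own row group, and the expected min/max then skip nulls, with an all-null group producing a null statistic. The per-group reduction can be spelled out by hand, as a sketch only:

use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::Date32Type;

fn main() {
    // Row group 1 from roundtrip_date32: ["2021-01-01", NULL, "2021-01-03"].
    let row_group: Vec<Option<i32>> = vec![
        Date32Type::parse("2021-01-01"),
        None,
        Date32Type::parse("2021-01-03"),
    ];

    // Min/max over the non-null day counts; an all-null group would give None here.
    let min = row_group.iter().flatten().min().copied();
    let max = row_group.iter().flatten().max().copied();

    assert_eq!(min, Date32Type::parse("2021-01-01"));
    assert_eq!(max, Date32Type::parse("2021-01-03"));
}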
@@ -1069,4 +1139,24 @@ mod test {
         ]);
         Arc::new(struct_array)
     }
+
+    fn date32_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
+        let array = Date32Array::from(
+            input
+                .into_iter()
+                .map(|s| Date32Type::parse(s.unwrap_or_default()))
+                .collect::<Vec<_>>(),
+        );
+        Arc::new(array)
+    }
+
+    fn date64_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
+        let array = Date64Array::from(
+            input
+                .into_iter()
+                .map(|s| Date64Type::parse(s.unwrap_or_default()))
+                .collect::<Vec<_>>(),
+        );
+        Arc::new(array)
+    }
 }
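
A short note on these helpers (illustrative, not part of the commit): `Date32Type::parse` and `Date64Type::parse` come from the `Parser` trait imported above and turn an ISO-8601 date string into the type's native value, days since 1970-01-01 for Date32 and milliseconds for Date64. Since an unparseable string yields `None`, the `s.unwrap_or_default()` call maps a `None` input through the empty string back to `None`, which is how nulls survive the conversion. Concretely (the numeric values below are my own arithmetic):

use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::{Date32Type, Date64Type};

fn main() {
    // Date32 native values are whole days since the Unix epoch.
    assert_eq!(Date32Type::parse("2021-01-01"), Some(18628));
    // Date64 native values are milliseconds since the Unix epoch.
    assert_eq!(Date64Type::parse("2021-01-01"), Some(18628 * 24 * 60 * 60 * 1000));
    // An empty string does not parse, which is how the helpers keep None inputs null.
    assert_eq!(Date32Type::parse(""), None);
}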

datafusion/core/tests/parquet/arrow_statistics.rs

Lines changed: 23 additions & 15 deletions
@@ -21,10 +21,12 @@
 use std::fs::File;
 use std::sync::Arc;

+use arrow::compute::kernels::cast_utils::Parser;
+use arrow::datatypes::{Date32Type, Date64Type};
 use arrow_array::{
-    make_array, Array, ArrayRef, BooleanArray, Decimal128Array, FixedSizeBinaryArray,
-    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch,
-    StringArray, UInt64Array,
+    make_array, Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array,
+    FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
+    RecordBatch, StringArray, UInt64Array,
 };
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::{
@@ -638,8 +640,6 @@ async fn test_timestamp_diff_rg_sizes() {
 }

 // date with different row group sizes
-// Bug expect `Date32Array` but returns Int32Array
-// https://github.com/apache/datafusion/issues/10587
 #[tokio::test]
 async fn test_dates_32_diff_rg_sizes() {
     // This creates a parquet files of 3 columns named "date32", "date64", "names"
@@ -654,10 +654,16 @@ async fn test_dates_32_diff_rg_sizes() {
     };
     Test {
         reader: reader.build().await,
-        // mins are [18262, 18565,]
-        expected_min: Arc::new(Int32Array::from(vec![18262, 18565])),
-        // maxes are [18564, 21865,]
-        expected_max: Arc::new(Int32Array::from(vec![18564, 21865])),
+        // mins are [2020-01-01, 2020-10-30]
+        expected_min: Arc::new(Date32Array::from(vec![
+            Date32Type::parse("2020-01-01"),
+            Date32Type::parse("2020-10-30"),
+        ])),
+        // maxes are [2020-10-29, 2029-11-12]
+        expected_max: Arc::new(Date32Array::from(vec![
+            Date32Type::parse("2020-10-29"),
+            Date32Type::parse("2029-11-12"),
+        ])),
         // nulls are [2, 2]
         expected_null_counts: UInt64Array::from(vec![2, 2]),
         // row counts are [13, 7]
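
For reference (my own cross-check, not part of the commit): the new expectations encode exactly the values the old raw integers did, just with Date32 semantics attached, since 18262 days after 1970-01-01 is 2020-01-01 and 18564 is 2020-10-29:

use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::Date32Type;

fn main() {
    // The old expected_min/expected_max used these raw day counts directly.
    assert_eq!(Date32Type::parse("2020-01-01"), Some(18262));
    assert_eq!(Date32Type::parse("2020-10-30"), Some(18565));
    assert_eq!(Date32Type::parse("2020-10-29"), Some(18564));
    assert_eq!(Date32Type::parse("2029-11-12"), Some(21865));
}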
@@ -667,10 +673,6 @@ async fn test_dates_32_diff_rg_sizes() {
     .run();
 }

-// BUG: same as above. Expect to return Date64Array but returns Int32Array
-// test date with different row group sizes
-// https://github.com/apache/datafusion/issues/10587
-#[ignore]
 #[tokio::test]
 async fn test_dates_64_diff_rg_sizes() {
     // The file is created by 4 record batches (each has a null row), each has 5 rows but then will be split into 2 row groups with size 13, 7
@@ -680,8 +682,14 @@ async fn test_dates_64_diff_rg_sizes() {
     };
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![18262, 18565])), // panic here because the actual data is Int32Array
-        expected_max: Arc::new(Int64Array::from(vec![18564, 21865])),
+        expected_min: Arc::new(Date64Array::from(vec![
+            Date64Type::parse("2020-01-01"),
+            Date64Type::parse("2020-10-30"),
+        ])),
+        expected_max: Arc::new(Date64Array::from(vec![
+            Date64Type::parse("2020-10-29"),
+            Date64Type::parse("2029-11-12"),
+        ])),
         expected_null_counts: UInt64Array::from(vec![2, 2]),
         expected_row_counts: UInt64Array::from(vec![13, 7]),
         column_name: "date64",
