-
Notifications
You must be signed in to change notification settings - Fork 1k
Description
Is your feature request related to a problem or challenge? Please describe what you are trying to do.
When adding additional support to the statistics converter as @Kev1n8 did in #6181 the presence of two sets of tests is quite confusing about where to add tests for the new test
There are tests in both
- https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs
- https://github.com/apache/arrow-rs/blob/master/parquet/tests/arrow_reader/statistics.rs
Describe the solution you'd like
I would like to remove the rundant tests
Describe alternatives you've considered
I think the clearest alternateive is to move all the tests from https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs to https://github.com/apache/arrow-rs/blob/master/parquet/tests/arrow_reader/statistics.rs
We should only move the non duplicated tests.
So that means something like:
- Leave a comment with a note about where the tests are in https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs
- Keep edge condition tests
arrow-rs/parquet/src/arrow/arrow_reader/statistics.rs
Lines 1513 to 1522 in f2de2cd
#[test] fn roundtrip_empty() { let empty_bool_array = new_empty_array(&DataType::Boolean); Test { input: empty_bool_array.clone(), expected_min: empty_bool_array.clone(), expected_max: empty_bool_array.clone(), } .run() } - Remove tests in that have corresponding coverage
arrow-rs/parquet/src/arrow/arrow_reader/statistics.rs
Lines 1548 to 2113 in f2de2cd
fn roundtrip_int32() { Test { input: i32_array([ // row group 1 Some(1), None, Some(3), // row group 2 Some(0), Some(5), None, // row group 3 None, None, None, ]), expected_min: i32_array([Some(1), Some(0), None]), expected_max: i32_array([Some(3), Some(5), None]), } .run() } #[test] fn roundtrip_int64() { Test { input: i64_array([ // row group 1 Some(1), None, Some(3), // row group 2 Some(0), Some(5), None, // row group 3 None, None, None, ]), expected_min: i64_array([Some(1), Some(0), None]), expected_max: i64_array(vec![Some(3), Some(5), None]), } .run() } #[test] fn roundtrip_f32() { Test { input: f32_array([ // row group 1 Some(1.0), None, Some(3.0), // row group 2 Some(-1.0), Some(5.0), None, // row group 3 None, None, None, ]), expected_min: f32_array([Some(1.0), Some(-1.0), None]), expected_max: f32_array([Some(3.0), Some(5.0), None]), } .run() } #[test] fn roundtrip_f64() { Test { input: f64_array([ // row group 1 Some(1.0), None, Some(3.0), // row group 2 Some(-1.0), Some(5.0), None, // row group 3 None, None, None, ]), expected_min: f64_array([Some(1.0), Some(-1.0), None]), expected_max: f64_array([Some(3.0), Some(5.0), None]), } .run() } #[test] fn roundtrip_timestamp() { Test { input: timestamp_seconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], None, ), expected_min: timestamp_seconds_array([Some(1), Some(5), None], None), expected_max: timestamp_seconds_array([Some(3), Some(9), None], None), } .run(); Test { input: timestamp_milliseconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], None, ), expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], None), expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], None), } .run(); Test { input: timestamp_microseconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], None, ), expected_min: timestamp_microseconds_array([Some(1), Some(5), None], None), expected_max: timestamp_microseconds_array([Some(3), Some(9), None], None), } .run(); Test { input: timestamp_nanoseconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], None, ), expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], None), expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], None), } .run() } #[test] fn roundtrip_timestamp_timezoned() { Test { input: timestamp_seconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], Some("UTC"), ), expected_min: timestamp_seconds_array([Some(1), Some(5), None], Some("UTC")), expected_max: timestamp_seconds_array([Some(3), Some(9), None], Some("UTC")), } .run(); Test { input: timestamp_milliseconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], Some("UTC"), ), expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], Some("UTC")), expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], Some("UTC")), } .run(); Test { input: timestamp_microseconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], Some("UTC"), ), expected_min: timestamp_microseconds_array([Some(1), Some(5), None], Some("UTC")), expected_max: timestamp_microseconds_array([Some(3), Some(9), None], Some("UTC")), } .run(); Test { input: timestamp_nanoseconds_array( [ // row group 1 Some(1), None, Some(3), // row group 2 Some(9), Some(5), None, // row group 3 None, None, None, ], Some("UTC"), ), expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], Some("UTC")), expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], Some("UTC")), } .run() } #[test] fn roundtrip_decimal() { Test { input: Arc::new( Decimal128Array::from(vec![ // row group 1 Some(100), None, Some(22000), // row group 2 Some(500000), Some(330000), None, // row group 3 None, None, None, ]) .with_precision_and_scale(9, 2) .unwrap(), ), expected_min: Arc::new( Decimal128Array::from(vec![Some(100), Some(330000), None]) .with_precision_and_scale(9, 2) .unwrap(), ), expected_max: Arc::new( Decimal128Array::from(vec![Some(22000), Some(500000), None]) .with_precision_and_scale(9, 2) .unwrap(), ), } .run(); Test { input: Arc::new( Decimal256Array::from(vec![ // row group 1 Some(i256::from(100)), None, Some(i256::from(22000)), // row group 2 Some(i256::MAX), Some(i256::MIN), None, // row group 3 None, None, None, ]) .with_precision_and_scale(76, 76) .unwrap(), ), expected_min: Arc::new( Decimal256Array::from(vec![Some(i256::from(100)), Some(i256::MIN), None]) .with_precision_and_scale(76, 76) .unwrap(), ), expected_max: Arc::new( Decimal256Array::from(vec![Some(i256::from(22000)), Some(i256::MAX), None]) .with_precision_and_scale(76, 76) .unwrap(), ), } .run() } #[test] fn roundtrip_utf8() { Test { input: utf8_array([ // row group 1 Some("A"), None, Some("Q"), // row group 2 Some("ZZ"), Some("AA"), None, // row group 3 None, None, None, ]), expected_min: utf8_array([Some("A"), Some("AA"), None]), expected_max: utf8_array([Some("Q"), Some("ZZ"), None]), } .run() } #[test] fn roundtrip_struct() { let mut test = Test { input: struct_array(vec![ // row group 1 (Some(true), Some(1)), (None, None), (Some(true), Some(3)), // row group 2 (Some(true), Some(0)), (Some(false), Some(5)), (None, None), // row group 3 (None, None), (None, None), (None, None), ]), expected_min: struct_array(vec![ (Some(true), Some(1)), (Some(true), Some(0)), (None, None), ]), expected_max: struct_array(vec![ (Some(true), Some(3)), (Some(true), Some(0)), (None, None), ]), }; // Due to https://github.com/apache/datafusion/issues/8334, // statistics for struct arrays are not supported test.expected_min = new_null_array(test.input.data_type(), test.expected_min.len()); test.expected_max = new_null_array(test.input.data_type(), test.expected_min.len()); test.run() } #[test] fn roundtrip_binary() { Test { input: Arc::new(BinaryArray::from_opt_vec(vec![ // row group 1 Some(b"A"), None, Some(b"Q"), // row group 2 Some(b"ZZ"), Some(b"AA"), None, // row group 3 None, None, None, ])), expected_min: Arc::new(BinaryArray::from_opt_vec(vec![ Some(b"A"), Some(b"AA"), None, ])), expected_max: Arc::new(BinaryArray::from_opt_vec(vec![ Some(b"Q"), Some(b"ZZ"), None, ])), } .run() } #[test] fn roundtrip_date32() { Test { input: date32_array(vec![ // row group 1 Some("2021-01-01"), None, Some("2021-01-03"), // row group 2 Some("2021-01-01"), Some("2021-01-05"), None, // row group 3 None, None, None, ]), expected_min: date32_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]), expected_max: date32_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]), } .run() } #[test] fn roundtrip_date64() { Test { input: date64_array(vec![ // row group 1 Some("2021-01-01"), None, Some("2021-01-03"), // row group 2 Some("2021-01-01"), Some("2021-01-05"), None, // row group 3 None, None, None, ]), expected_min: date64_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]), expected_max: date64_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]), } .run() } #[test] fn roundtrip_large_binary_array() { let input: Vec<Option<&[u8]>> = vec![ // row group 1 Some(b"A"), None, Some(b"Q"), // row group 2 Some(b"ZZ"), Some(b"AA"), None, // row group 3 None, None, None, ]; let expected_min: Vec<Option<&[u8]>> = vec![Some(b"A"), Some(b"AA"), None]; let expected_max: Vec<Option<&[u8]>> = vec![Some(b"Q"), Some(b"ZZ"), None]; Test { input: large_binary_array(input), expected_min: large_binary_array(expected_min), expected_max: large_binary_array(expected_max), } .run(); } #[test] fn struct_and_non_struct() { // Ensures that statistics for an array that appears *after* a struct // array are not wrong let struct_col = struct_array(vec![ // row group 1 (Some(true), Some(1)), (None, None), (Some(true), Some(3)), ]); let int_col = i32_array([Some(100), Some(200), Some(300)]); let expected_min = i32_array([Some(100)]); let expected_max = i32_array(vec![Some(300)]); // use a name that shadows a name in the struct column match struct_col.data_type() { DataType::Struct(fields) => { assert_eq!(fields.get(1).unwrap().name(), "int_col") } _ => panic!("unexpected data type for struct column"), }; let input_batch = RecordBatch::try_from_iter([("struct_col", struct_col), ("int_col", int_col)]).unwrap(); let schema = input_batch.schema(); let metadata = parquet_metadata(schema.clone(), input_batch); let parquet_schema = metadata.file_metadata().schema_descr(); // read the int_col statistics let (idx, _) = parquet_column(parquet_schema, &schema, "int_col").unwrap(); assert_eq!(idx, 2); let row_groups = metadata.row_groups(); let converter = StatisticsConverter::try_new("int_col", &schema, parquet_schema).unwrap(); let min = converter.row_group_mins(row_groups.iter()).unwrap(); assert_eq!( &min, &expected_min, "Min. Statistics\n\n{}\n\n", DisplayStats(row_groups) ); let max = converter.row_group_maxes(row_groups.iter()).unwrap(); assert_eq!( &max, &expected_max, "Max. Statistics\n\n{}\n\n", DisplayStats(row_groups) ); } - Keep data file validation:
arrow-rs/parquet/src/arrow/arrow_reader/statistics.rs
Lines 2115 to 2235 in f2de2cd
fn nan_in_stats() { // /parquet-testing/data/nan_in_stats.parquet // row_groups: 1 // "x": Double({min: Some(1.0), max: Some(NaN), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) TestFile::new("nan_in_stats.parquet") .with_column(ExpectedColumn { name: "x", expected_min: Arc::new(Float64Array::from(vec![Some(1.0)])), expected_max: Arc::new(Float64Array::from(vec![Some(f64::NAN)])), }) .run(); } #[test] fn alltypes_plain() { // /parquet-testing/data/datapage_v1-snappy-compressed-checksum.parquet // row_groups: 1 // (has no statistics) TestFile::new("alltypes_plain.parquet") // No column statistics should be read as NULL, but with the right type .with_column(ExpectedColumn { name: "id", expected_min: i32_array([None]), expected_max: i32_array([None]), }) .with_column(ExpectedColumn { name: "bool_col", expected_min: bool_array([None]), expected_max: bool_array([None]), }) .run(); } #[test] fn alltypes_tiny_pages() { // /parquet-testing/data/alltypes_tiny_pages.parquet // row_groups: 1 // "id": Int32({min: Some(0), max: Some(7299), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "bool_col": Boolean({min: Some(false), max: Some(true), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "tinyint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "smallint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "int_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "bigint_col": Int64({min: Some(0), max: Some(90), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "float_col": Float({min: Some(0.0), max: Some(9.9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "double_col": Double({min: Some(0.0), max: Some(90.89999999999999), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "date_string_col": ByteArray({min: Some(ByteArray { data: "01/01/09" }), max: Some(ByteArray { data: "12/31/10" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "string_col": ByteArray({min: Some(ByteArray { data: "0" }), max: Some(ByteArray { data: "9" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "timestamp_col": Int96({min: None, max: None, distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true}) // "year": Int32({min: Some(2009), max: Some(2010), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) // "month": Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false}) TestFile::new("alltypes_tiny_pages.parquet") .with_column(ExpectedColumn { name: "id", expected_min: i32_array([Some(0)]), expected_max: i32_array([Some(7299)]), }) .with_column(ExpectedColumn { name: "bool_col", expected_min: bool_array([Some(false)]), expected_max: bool_array([Some(true)]), }) .with_column(ExpectedColumn { name: "tinyint_col", expected_min: i8_array([Some(0)]), expected_max: i8_array([Some(9)]), }) .with_column(ExpectedColumn { name: "smallint_col", expected_min: i16_array([Some(0)]), expected_max: i16_array([Some(9)]), }) .with_column(ExpectedColumn { name: "int_col", expected_min: i32_array([Some(0)]), expected_max: i32_array([Some(9)]), }) .with_column(ExpectedColumn { name: "bigint_col", expected_min: i64_array([Some(0)]), expected_max: i64_array([Some(90)]), }) .with_column(ExpectedColumn { name: "float_col", expected_min: f32_array([Some(0.0)]), expected_max: f32_array([Some(9.9)]), }) .with_column(ExpectedColumn { name: "double_col", expected_min: f64_array([Some(0.0)]), expected_max: f64_array([Some(90.89999999999999)]), }) .with_column(ExpectedColumn { name: "date_string_col", expected_min: utf8_array([Some("01/01/09")]), expected_max: utf8_array([Some("12/31/10")]), }) .with_column(ExpectedColumn { name: "string_col", expected_min: utf8_array([Some("0")]), expected_max: utf8_array([Some("9")]), }) // File has no min/max for timestamp_col .with_column(ExpectedColumn { name: "timestamp_col", expected_min: timestamp_nanoseconds_array([None], None), expected_max: timestamp_nanoseconds_array([None], None), }) .with_column(ExpectedColumn { name: "year", expected_min: i32_array([Some(2009)]), expected_max: i32_array([Some(2010)]), }) .with_column(ExpectedColumn { name: "month", expected_min: i32_array([Some(1)]), expected_max: i32_array([Some(12)]), }) .run(); } - Port fixed length decimal legacy:
arrow-rs/parquet/src/arrow/arrow_reader/statistics.rs
Lines 2237 to 2258 in f2de2cd
fn fixed_length_decimal_legacy() { // /parquet-testing/data/fixed_length_decimal_legacy.parquet // row_groups: 1 // "value": FixedLenByteArray({min: Some(FixedLenByteArray(ByteArray { data: Some(ByteBufferPtr { data: b"\0\0\0\0\0\xc8" }) })), max: Some(FixedLenByteArray(ByteArray { data: "\0\0\0\0\t`" })), distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true}) TestFile::new("fixed_length_decimal_legacy.parquet") .with_column(ExpectedColumn { name: "value", expected_min: Arc::new( Decimal128Array::from(vec![Some(200)]) .with_precision_and_scale(13, 2) .unwrap(), ), expected_max: Arc::new( Decimal128Array::from(vec![Some(2400)]) .with_precision_and_scale(13, 2) .unwrap(), ), }) .run(); }
Additional context
This duplication is a historical artifact of how this code was developed in DataFusion and then it got brought over when @efredine ported the work in #6046