
Remove test duplication in parquet statistics tests #6185

@alamb

Description


Is your feature request related to a problem or challenge? Please describe what you are trying to do.

When adding additional support to the statistics converter, as @Kev1n8 did in #6181, the presence of two sets of tests makes it quite confusing where to add tests for the new functionality.

There are tests in both parquet/src/arrow/arrow_reader/statistics.rs (the inline unit tests) and parquet/tests/arrow_reader/statistics.rs (the integration tests).

Describe the solution you'd like
I would like to remove the redundant tests.

Describe alternatives you've considered
I think the clearest alternative is to move all the tests from https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs to https://github.com/apache/arrow-rs/blob/master/parquet/tests/arrow_reader/statistics.rs

We should only move the non-duplicated tests.

So that means something like:

  • Leave a comment in https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs noting where the tests now live (see the sketch after this list)
  • Keep edge condition tests such as the following (the Test harness they rely on is sketched after this list):
    #[test]
    fn roundtrip_empty() {
        let empty_bool_array = new_empty_array(&DataType::Boolean);
        Test {
            input: empty_bool_array.clone(),
            expected_min: empty_bool_array.clone(),
            expected_max: empty_bool_array.clone(),
        }
        .run()
    }
  • Remove the tests below, which have corresponding coverage:
    #[test]
    fn roundtrip_int32() {
        Test {
            input: i32_array([
                Some(1), None, Some(3),   // row group 1
                Some(0), Some(5), None,   // row group 2
                None, None, None,         // row group 3
            ]),
            expected_min: i32_array([Some(1), Some(0), None]),
            expected_max: i32_array([Some(3), Some(5), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_int64() {
        Test {
            input: i64_array([
                Some(1), None, Some(3),   // row group 1
                Some(0), Some(5), None,   // row group 2
                None, None, None,         // row group 3
            ]),
            expected_min: i64_array([Some(1), Some(0), None]),
            expected_max: i64_array(vec![Some(3), Some(5), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_f32() {
        Test {
            input: f32_array([
                Some(1.0), None, Some(3.0),    // row group 1
                Some(-1.0), Some(5.0), None,   // row group 2
                None, None, None,              // row group 3
            ]),
            expected_min: f32_array([Some(1.0), Some(-1.0), None]),
            expected_max: f32_array([Some(3.0), Some(5.0), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_f64() {
        Test {
            input: f64_array([
                Some(1.0), None, Some(3.0),    // row group 1
                Some(-1.0), Some(5.0), None,   // row group 2
                None, None, None,              // row group 3
            ]),
            expected_min: f64_array([Some(1.0), Some(-1.0), None]),
            expected_max: f64_array([Some(3.0), Some(5.0), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_timestamp() {
        Test {
            input: timestamp_seconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                None,
            ),
            expected_min: timestamp_seconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_seconds_array([Some(3), Some(9), None], None),
        }
        .run();
        Test {
            input: timestamp_milliseconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                None,
            ),
            expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], None),
        }
        .run();
        Test {
            input: timestamp_microseconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                None,
            ),
            expected_min: timestamp_microseconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_microseconds_array([Some(3), Some(9), None], None),
        }
        .run();
        Test {
            input: timestamp_nanoseconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                None,
            ),
            expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], None),
        }
        .run()
    }

    #[test]
    fn roundtrip_timestamp_timezoned() {
        Test {
            input: timestamp_seconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_seconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_seconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run();
        Test {
            input: timestamp_milliseconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run();
        Test {
            input: timestamp_microseconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_microseconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_microseconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run();
        Test {
            input: timestamp_nanoseconds_array(
                [
                    Some(1), None, Some(3),   // row group 1
                    Some(9), Some(5), None,   // row group 2
                    None, None, None,         // row group 3
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run()
    }

    #[test]
    fn roundtrip_decimal() {
        Test {
            input: Arc::new(
                Decimal128Array::from(vec![
                    Some(100), None, Some(22000),       // row group 1
                    Some(500000), Some(330000), None,   // row group 2
                    None, None, None,                   // row group 3
                ])
                .with_precision_and_scale(9, 2)
                .unwrap(),
            ),
            expected_min: Arc::new(
                Decimal128Array::from(vec![Some(100), Some(330000), None])
                    .with_precision_and_scale(9, 2)
                    .unwrap(),
            ),
            expected_max: Arc::new(
                Decimal128Array::from(vec![Some(22000), Some(500000), None])
                    .with_precision_and_scale(9, 2)
                    .unwrap(),
            ),
        }
        .run();
        Test {
            input: Arc::new(
                Decimal256Array::from(vec![
                    Some(i256::from(100)), None, Some(i256::from(22000)),   // row group 1
                    Some(i256::MAX), Some(i256::MIN), None,                 // row group 2
                    None, None, None,                                       // row group 3
                ])
                .with_precision_and_scale(76, 76)
                .unwrap(),
            ),
            expected_min: Arc::new(
                Decimal256Array::from(vec![Some(i256::from(100)), Some(i256::MIN), None])
                    .with_precision_and_scale(76, 76)
                    .unwrap(),
            ),
            expected_max: Arc::new(
                Decimal256Array::from(vec![Some(i256::from(22000)), Some(i256::MAX), None])
                    .with_precision_and_scale(76, 76)
                    .unwrap(),
            ),
        }
        .run()
    }

    #[test]
    fn roundtrip_utf8() {
        Test {
            input: utf8_array([
                Some("A"), None, Some("Q"),     // row group 1
                Some("ZZ"), Some("AA"), None,   // row group 2
                None, None, None,               // row group 3
            ]),
            expected_min: utf8_array([Some("A"), Some("AA"), None]),
            expected_max: utf8_array([Some("Q"), Some("ZZ"), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_struct() {
        let mut test = Test {
            input: struct_array(vec![
                (Some(true), Some(1)), (None, None), (Some(true), Some(3)),    // row group 1
                (Some(true), Some(0)), (Some(false), Some(5)), (None, None),   // row group 2
                (None, None), (None, None), (None, None),                      // row group 3
            ]),
            expected_min: struct_array(vec![
                (Some(true), Some(1)),
                (Some(true), Some(0)),
                (None, None),
            ]),
            expected_max: struct_array(vec![
                (Some(true), Some(3)),
                (Some(true), Some(0)),
                (None, None),
            ]),
        };
        // Due to https://github.com/apache/datafusion/issues/8334,
        // statistics for struct arrays are not supported
        test.expected_min = new_null_array(test.input.data_type(), test.expected_min.len());
        test.expected_max = new_null_array(test.input.data_type(), test.expected_min.len());
        test.run()
    }

    #[test]
    fn roundtrip_binary() {
        Test {
            input: Arc::new(BinaryArray::from_opt_vec(vec![
                Some(b"A"), None, Some(b"Q"),     // row group 1
                Some(b"ZZ"), Some(b"AA"), None,   // row group 2
                None, None, None,                 // row group 3
            ])),
            expected_min: Arc::new(BinaryArray::from_opt_vec(vec![Some(b"A"), Some(b"AA"), None])),
            expected_max: Arc::new(BinaryArray::from_opt_vec(vec![Some(b"Q"), Some(b"ZZ"), None])),
        }
        .run()
    }

    #[test]
    fn roundtrip_date32() {
        Test {
            input: date32_array(vec![
                Some("2021-01-01"), None, Some("2021-01-03"),   // row group 1
                Some("2021-01-01"), Some("2021-01-05"), None,   // row group 2
                None, None, None,                               // row group 3
            ]),
            expected_min: date32_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]),
            expected_max: date32_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_date64() {
        Test {
            input: date64_array(vec![
                Some("2021-01-01"), None, Some("2021-01-03"),   // row group 1
                Some("2021-01-01"), Some("2021-01-05"), None,   // row group 2
                None, None, None,                               // row group 3
            ]),
            expected_min: date64_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]),
            expected_max: date64_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_large_binary_array() {
        let input: Vec<Option<&[u8]>> = vec![
            Some(b"A"), None, Some(b"Q"),     // row group 1
            Some(b"ZZ"), Some(b"AA"), None,   // row group 2
            None, None, None,                 // row group 3
        ];
        let expected_min: Vec<Option<&[u8]>> = vec![Some(b"A"), Some(b"AA"), None];
        let expected_max: Vec<Option<&[u8]>> = vec![Some(b"Q"), Some(b"ZZ"), None];
        Test {
            input: large_binary_array(input),
            expected_min: large_binary_array(expected_min),
            expected_max: large_binary_array(expected_max),
        }
        .run();
    }

    #[test]
    fn struct_and_non_struct() {
        // Ensures that statistics for an array that appears *after* a struct
        // array are not wrong
        let struct_col = struct_array(vec![
            // row group 1
            (Some(true), Some(1)),
            (None, None),
            (Some(true), Some(3)),
        ]);
        let int_col = i32_array([Some(100), Some(200), Some(300)]);
        let expected_min = i32_array([Some(100)]);
        let expected_max = i32_array(vec![Some(300)]);

        // use a name that shadows a name in the struct column
        match struct_col.data_type() {
            DataType::Struct(fields) => {
                assert_eq!(fields.get(1).unwrap().name(), "int_col")
            }
            _ => panic!("unexpected data type for struct column"),
        };

        let input_batch =
            RecordBatch::try_from_iter([("struct_col", struct_col), ("int_col", int_col)]).unwrap();
        let schema = input_batch.schema();
        let metadata = parquet_metadata(schema.clone(), input_batch);
        let parquet_schema = metadata.file_metadata().schema_descr();

        // read the int_col statistics
        let (idx, _) = parquet_column(parquet_schema, &schema, "int_col").unwrap();
        assert_eq!(idx, 2);

        let row_groups = metadata.row_groups();
        let converter = StatisticsConverter::try_new("int_col", &schema, parquet_schema).unwrap();

        let min = converter.row_group_mins(row_groups.iter()).unwrap();
        assert_eq!(
            &min, &expected_min,
            "Min. Statistics\n\n{}\n\n",
            DisplayStats(row_groups)
        );

        let max = converter.row_group_maxes(row_groups.iter()).unwrap();
        assert_eq!(
            &max, &expected_max,
            "Max. Statistics\n\n{}\n\n",
            DisplayStats(row_groups)
        );
    }
  • Keep data file validation:
    #[test]
    fn nan_in_stats() {
        // /parquet-testing/data/nan_in_stats.parquet
        // row_groups: 1
        // "x": Double({min: Some(1.0), max: Some(NaN), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        TestFile::new("nan_in_stats.parquet")
            .with_column(ExpectedColumn {
                name: "x",
                expected_min: Arc::new(Float64Array::from(vec![Some(1.0)])),
                expected_max: Arc::new(Float64Array::from(vec![Some(f64::NAN)])),
            })
            .run();
    }

    #[test]
    fn alltypes_plain() {
        // /parquet-testing/data/datapage_v1-snappy-compressed-checksum.parquet
        // row_groups: 1
        // (has no statistics)
        TestFile::new("alltypes_plain.parquet")
            // No column statistics should be read as NULL, but with the right type
            .with_column(ExpectedColumn {
                name: "id",
                expected_min: i32_array([None]),
                expected_max: i32_array([None]),
            })
            .with_column(ExpectedColumn {
                name: "bool_col",
                expected_min: bool_array([None]),
                expected_max: bool_array([None]),
            })
            .run();
    }

    #[test]
    fn alltypes_tiny_pages() {
        // /parquet-testing/data/alltypes_tiny_pages.parquet
        // row_groups: 1
        // "id": Int32({min: Some(0), max: Some(7299), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "bool_col": Boolean({min: Some(false), max: Some(true), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "tinyint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "smallint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "int_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "bigint_col": Int64({min: Some(0), max: Some(90), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "float_col": Float({min: Some(0.0), max: Some(9.9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "double_col": Double({min: Some(0.0), max: Some(90.89999999999999), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "date_string_col": ByteArray({min: Some(ByteArray { data: "01/01/09" }), max: Some(ByteArray { data: "12/31/10" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "string_col": ByteArray({min: Some(ByteArray { data: "0" }), max: Some(ByteArray { data: "9" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "timestamp_col": Int96({min: None, max: None, distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true})
        // "year": Int32({min: Some(2009), max: Some(2010), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "month": Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        TestFile::new("alltypes_tiny_pages.parquet")
            .with_column(ExpectedColumn {
                name: "id",
                expected_min: i32_array([Some(0)]),
                expected_max: i32_array([Some(7299)]),
            })
            .with_column(ExpectedColumn {
                name: "bool_col",
                expected_min: bool_array([Some(false)]),
                expected_max: bool_array([Some(true)]),
            })
            .with_column(ExpectedColumn {
                name: "tinyint_col",
                expected_min: i8_array([Some(0)]),
                expected_max: i8_array([Some(9)]),
            })
            .with_column(ExpectedColumn {
                name: "smallint_col",
                expected_min: i16_array([Some(0)]),
                expected_max: i16_array([Some(9)]),
            })
            .with_column(ExpectedColumn {
                name: "int_col",
                expected_min: i32_array([Some(0)]),
                expected_max: i32_array([Some(9)]),
            })
            .with_column(ExpectedColumn {
                name: "bigint_col",
                expected_min: i64_array([Some(0)]),
                expected_max: i64_array([Some(90)]),
            })
            .with_column(ExpectedColumn {
                name: "float_col",
                expected_min: f32_array([Some(0.0)]),
                expected_max: f32_array([Some(9.9)]),
            })
            .with_column(ExpectedColumn {
                name: "double_col",
                expected_min: f64_array([Some(0.0)]),
                expected_max: f64_array([Some(90.89999999999999)]),
            })
            .with_column(ExpectedColumn {
                name: "date_string_col",
                expected_min: utf8_array([Some("01/01/09")]),
                expected_max: utf8_array([Some("12/31/10")]),
            })
            .with_column(ExpectedColumn {
                name: "string_col",
                expected_min: utf8_array([Some("0")]),
                expected_max: utf8_array([Some("9")]),
            })
            // File has no min/max for timestamp_col
            .with_column(ExpectedColumn {
                name: "timestamp_col",
                expected_min: timestamp_nanoseconds_array([None], None),
                expected_max: timestamp_nanoseconds_array([None], None),
            })
            .with_column(ExpectedColumn {
                name: "year",
                expected_min: i32_array([Some(2009)]),
                expected_max: i32_array([Some(2010)]),
            })
            .with_column(ExpectedColumn {
                name: "month",
                expected_min: i32_array([Some(1)]),
                expected_max: i32_array([Some(12)]),
            })
            .run();
    }
  • Port fixed length decimal legacy:
    #[test]
    fn fixed_length_decimal_legacy() {
        // /parquet-testing/data/fixed_length_decimal_legacy.parquet
        // row_groups: 1
        // "value": FixedLenByteArray({min: Some(FixedLenByteArray(ByteArray { data: Some(ByteBufferPtr { data: b"\0\0\0\0\0\xc8" }) })), max: Some(FixedLenByteArray(ByteArray { data: "\0\0\0\0\t`" })), distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true})
        TestFile::new("fixed_length_decimal_legacy.parquet")
            .with_column(ExpectedColumn {
                name: "value",
                expected_min: Arc::new(
                    Decimal128Array::from(vec![Some(200)])
                        .with_precision_and_scale(13, 2)
                        .unwrap(),
                ),
                expected_max: Arc::new(
                    Decimal128Array::from(vec![Some(2400)])
                        .with_precision_and_scale(13, 2)
                        .unwrap(),
                ),
            })
            .run();
    }
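
For context, every roundtrip test above is driven by a small Test harness with input, expected_min, and expected_max fields. Below is a minimal sketch of what such a harness can look like; it is not the actual implementation, and the single "col" column name, the in-memory buffer, and the row-group size of 3 are assumptions inferred from the quoted tests:

    use std::sync::Arc;

    use arrow_array::{ArrayRef, RecordBatch};
    use arrow_schema::{Field, Schema};
    use bytes::Bytes;
    use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
    use parquet::arrow::arrow_reader::ArrowReaderMetadata;
    use parquet::arrow::ArrowWriter;
    use parquet::file::properties::WriterProperties;

    /// Hypothetical harness: writes `input` to an in-memory Parquet file with
    /// three rows per row group, then compares the per-row-group min/max
    /// statistics extracted by `StatisticsConverter` against the expected arrays.
    struct Test {
        input: ArrayRef,
        expected_min: ArrayRef,
        expected_max: ArrayRef,
    }

    impl Test {
        fn run(self) {
            // Single-column schema named "col" (an assumption for this sketch)
            let schema = Arc::new(Schema::new(vec![Field::new(
                "col",
                self.input.data_type().clone(),
                true,
            )]));
            let batch = RecordBatch::try_new(schema.clone(), vec![self.input]).unwrap();

            // Three rows per row group, so each expected array has one entry per group
            let props = WriterProperties::builder()
                .set_max_row_group_size(3)
                .build();
            let mut buf = Vec::new();
            let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap();
            writer.write(&batch).unwrap();
            writer.close().unwrap();

            // Load the file metadata back and extract per-row-group statistics
            let data = Bytes::from(buf);
            let reader_metadata = ArrowReaderMetadata::load(&data, Default::default()).unwrap();
            let metadata = reader_metadata.metadata();
            let parquet_schema = metadata.file_metadata().schema_descr();

            let converter = StatisticsConverter::try_new("col", &schema, parquet_schema).unwrap();
            let mins = converter.row_group_mins(metadata.row_groups().iter()).unwrap();
            let maxes = converter.row_group_maxes(metadata.row_groups().iter()).unwrap();

            assert_eq!(&mins, &self.expected_min, "unexpected mins");
            assert_eq!(&maxes, &self.expected_max, "unexpected maxes");
        }
    }

With three rows per row group, each expected_min/expected_max array carries exactly one value per row group of the input.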
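
For the first bullet above, the note left behind in https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs could be as simple as the following (the wording is only a suggestion):

    // NOTE: Most tests for `StatisticsConverter` live in
    // `parquet/tests/arrow_reader/statistics.rs`, where they can exercise the
    // full write/read round trip against real Parquet files. Please add new
    // coverage there.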

Additional context
This duplication is a historical artifact of how this code was developed in DataFusion and was then brought over when @efredine ported the work in #6046.

    Labels

    enhancement (Any new improvement worthy of an entry in the changelog), parquet (Changes to the parquet crate)
