From 1bc52e9b0dea65616009d335bcc8a31329be9cfc Mon Sep 17 00:00:00 2001 From: Gabriel Musat Mestre Date: Mon, 12 May 2025 07:49:53 +0200 Subject: [PATCH 1/4] Fix comparisons between lists that contain nulls --- datafusion/common/src/scalar/mod.rs | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 5c020d1f6398..48fb5513b6db 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -605,6 +605,20 @@ fn partial_cmp_list(arr1: &dyn Array, arr2: &dyn Array) -> Option { let eq_res = arrow::compute::kernels::cmp::eq(&arr1_trimmed, &arr2_trimmed).ok()?; for j in 0..lt_res.len() { + // In Postgres, NULL values in lists are always considered to be greater than non-NULL values: + // + // $ SELECT ARRAY[NULL]::integer[] > ARRAY[1] + // true + // + // These next two if statements are introduced for replicating Postgres behavior, as + // arrow::compute does not account for this. + if arr1_trimmed.is_null(j) && !arr2_trimmed.is_null(j) { + return Some(Ordering::Greater); + } + if !arr1_trimmed.is_null(j) && arr2_trimmed.is_null(j) { + return Some(Ordering::Less); + } + if lt_res.is_valid(j) && lt_res.value(j) { return Some(Ordering::Less); } @@ -4878,6 +4892,24 @@ mod tests { ])]), )); assert_eq!(a.partial_cmp(&b), Some(Ordering::Greater)); + + let a = + ScalarValue::List(Arc::new( + ListArray::from_iter_primitive::(vec![Some(vec![ + None, + Some(2), + Some(3), + ])]), + )); + let b = + ScalarValue::List(Arc::new( + ListArray::from_iter_primitive::(vec![Some(vec![ + Some(1), + Some(2), + Some(3), + ])]), + )); + assert_eq!(a.partial_cmp(&b), Some(Ordering::Greater)); } #[test] From 41cba6fefafb389566ee6334565d6def66b8ba30 Mon Sep 17 00:00:00 2001 From: Gabriel Musat Mestre Date: Mon, 12 May 2025 07:50:42 +0200 Subject: [PATCH 2/4] Add support for lists in min/max agg functions --- datafusion/functions-aggregate/src/min_max.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index af178ed67528..4bd842dc0363 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -616,7 +616,8 @@ fn min_batch(values: &ArrayRef) -> Result { min_binary_view ) } - DataType::Struct(_) => min_max_batch_struct(values, Ordering::Greater)?, + DataType::Struct(_) => min_max_batch_generic(values, Ordering::Greater)?, + DataType::List(_) => min_max_batch_generic(values, Ordering::Greater)?, DataType::Dictionary(_, _) => { let values = values.as_any_dictionary().values(); min_batch(values)? @@ -625,7 +626,7 @@ fn min_batch(values: &ArrayRef) -> Result { }) } -fn min_max_batch_struct(array: &ArrayRef, ordering: Ordering) -> Result { +fn min_max_batch_generic(array: &ArrayRef, ordering: Ordering) -> Result { if array.len() == array.null_count() { return ScalarValue::try_from(array.data_type()); } @@ -649,7 +650,7 @@ fn min_max_batch_struct(array: &ArrayRef, ordering: Ordering) -> Result {{ if $VALUE.is_null() { $DELTA.clone() @@ -703,7 +704,8 @@ pub fn max_batch(values: &ArrayRef) -> Result { max_binary ) } - DataType::Struct(_) => min_max_batch_struct(values, Ordering::Less)?, + DataType::Struct(_) => min_max_batch_generic(values, Ordering::Less)?, + DataType::List(_) => min_max_batch_generic(values, Ordering::Less)?, DataType::Dictionary(_, _) => { let values = values.as_any_dictionary().values(); max_batch(values)? @@ -983,7 +985,14 @@ macro_rules! min_max { lhs @ ScalarValue::Struct(_), rhs @ ScalarValue::Struct(_), ) => { - min_max_struct!(lhs, rhs, $OP) + min_max_generic!(lhs, rhs, $OP) + } + + ( + lhs @ ScalarValue::List(_), + rhs @ ScalarValue::List(_), + ) => { + min_max_generic!(lhs, rhs, $OP) } e => { return internal_err!( From b9e43f3ef85cdda9a96630afb0faeb7ef3e4e62d Mon Sep 17 00:00:00 2001 From: Gabriel Musat Mestre Date: Mon, 12 May 2025 07:58:50 +0200 Subject: [PATCH 3/4] Add sqllogictests --- .../sqllogictest/test_files/aggregate.slt | 147 ++++++++++++++++++ .../sqllogictest/test_files/array_query.slt | 36 ++++- 2 files changed, 181 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 552ec5e3b86a..c016aaddfb1b 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -6997,4 +6997,151 @@ VALUES ---- {a: 1, b: 2, c: 3} {a: 1, b: 2, c: 4} +# Min/Max with list over integers +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +([1, 2, 3]), +([1, 2]); +---- +[1, 2] [1, 2, 3] + +# Min/Max with lists over strings +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +(['a', 'b', 'c']), +(['a', 'b']); +---- +[a, b] [a, b, c] + +# Min/Max with list over booleans +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +([true, true, false]), +([false, true]); +---- +[false, true] [true, true, false] + +# Min/Max with list over nullable integers +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +([NULL, 1, 2]), +([1, 2]); +---- +[1, 2] [NULL, 1, 2] + +# Min/Max list with different lengths and nulls +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +([1, NULL, 3]), +([1, 2, 3, 4]), +([1, 2]); +---- +[1, 2] [1, NULL, 3] + +# Min/Max list with only NULLs +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +([NULL, NULL]), +([NULL]); +---- +[NULL] [NULL, NULL] + +# Min/Max list with empty lists +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +([]), +([1]), +([]); +---- +[] [1] + +# Min/Max list of varying types (integers and NULLs) +query ?? +SELECT MIN(column1), MAX(column1) FROM VALUES +([1, 2, 3]), +([NULL, 2, 3]), +([1, 2, NULL]); +---- +[1, 2, 3] [NULL, 2, 3] + +# Min/Max list grouped by key with NULLs and differing lengths +query I?? rowsort +SELECT column1, MIN(column2), MAX(column2) FROM VALUES +(0, [1, NULL, 3]), +(0, [1, 2, 3, 4]), +(1, [1, 2]), +(1, [NULL, 5]), +(1, []) +GROUP BY column1; +---- +0 [1, 2, 3, 4] [1, NULL, 3] +1 [] [NULL, 5] + +# Min/Max list grouped by key with NULLs and differing lengths +query I?? rowsort +SELECT column1, MIN(column2), MAX(column2) FROM VALUES +(0, [NULL]), +(0, [NULL, NULL]), +(1, [NULL]) +GROUP BY column1; +---- +0 [NULL] [NULL, NULL] +1 [NULL] [NULL] + +# Min/Max grouped list with empty and non-empty +query I?? rowsort +SELECT column1, MIN(column2), MAX(column2) FROM VALUES +(0, []), +(0, [1]), +(0, []), +(1, [5, 6]), +(1, []) +GROUP BY column1; +---- +0 [] [1] +1 [] [5, 6] + +# Min/Max over lists with a window function +query ? +SELECT min(column1) OVER (ORDER BY column1) FROM VALUES +([1, 2, 3]), +([1, 2, 3]), +([2, 3]) +---- +[1, 2, 3] +[1, 2, 3] +[1, 2, 3] + +# Min/Max over lists with a window function and nulls +query ? +SELECT min(column1) OVER (ORDER BY column1) FROM VALUES +(NULL), +([4, 5]), +([2, 3]) +---- +[2, 3] +[2, 3] +[2, 3] + +# Min/Max over lists with a window function, nulls and ROWS BETWEEN statement +query ? +SELECT min(column1) OVER (ORDER BY column1 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) FROM VALUES +(NULL), +([4, 5]), +([2, 3]) +---- +[2, 3] +[2, 3] +[4, 5] + +# Min/Max over lists with a window function using a different column +query ? +SELECT max(column2) OVER (ORDER BY column1 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) FROM VALUES +([1, 2, 3], [4, 5]), +([2, 3], [2, 3]), +([1, 2, 3], NULL) +---- +[4, 5] +[4, 5] +[2, 3] diff --git a/datafusion/sqllogictest/test_files/array_query.slt b/datafusion/sqllogictest/test_files/array_query.slt index 8fde295e6051..67567d96fc77 100644 --- a/datafusion/sqllogictest/test_files/array_query.slt +++ b/datafusion/sqllogictest/test_files/array_query.slt @@ -108,11 +108,43 @@ SELECT * FROM data WHERE column2 is not distinct from null; # Aggregates ########### -query error Internal error: Min/Max accumulator not implemented for type List +query ? SELECT min(column1) FROM data; +---- +[1, 2, 3] -query error Internal error: Min/Max accumulator not implemented for type List +query ? SELECT max(column1) FROM data; +---- +[2, 3] + +query ? +SELECT min(column1) OVER (ORDER BY column1) FROM data; +---- +[1, 2, 3] +[1, 2, 3] +[1, 2, 3] + +query ? +SELECT min(column2) OVER (ORDER BY column2) FROM data; +---- +[2, 3] +[2, 3] +[2, 3] + +query ? +SELECT min(column2) OVER (ORDER BY column2 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) FROM data; +---- +[2, 3] +[2, 3] +[4, 5] + +query ? +SELECT max(column2) OVER (ORDER BY column1 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) FROM data; +---- +[4, 5] +[4, 5] +[2, 3] query I SELECT count(column1) FROM data; From b0a9bf59519d6fecfce8165ba495ab59f25dcf9d Mon Sep 17 00:00:00 2001 From: Gabriel Musat Mestre Date: Mon, 12 May 2025 08:01:40 +0200 Subject: [PATCH 4/4] Support lists in window frame target type --- .../optimizer/src/analyzer/type_coercion.rs | 1 + .../sqllogictest/test_files/array_query.slt | 28 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 860e041bb423..5fab680a2867 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -718,6 +718,7 @@ fn coerce_frame_bound( fn extract_window_frame_target_type(col_type: &DataType) -> Result { if col_type.is_numeric() || is_utf8_or_utf8view_or_large_utf8(col_type) + || matches!(col_type, DataType::List(_)) || matches!(col_type, DataType::Null) || matches!(col_type, DataType::Boolean) { diff --git a/datafusion/sqllogictest/test_files/array_query.slt b/datafusion/sqllogictest/test_files/array_query.slt index 67567d96fc77..65d4fa495e3b 100644 --- a/datafusion/sqllogictest/test_files/array_query.slt +++ b/datafusion/sqllogictest/test_files/array_query.slt @@ -118,34 +118,6 @@ SELECT max(column1) FROM data; ---- [2, 3] -query ? -SELECT min(column1) OVER (ORDER BY column1) FROM data; ----- -[1, 2, 3] -[1, 2, 3] -[1, 2, 3] - -query ? -SELECT min(column2) OVER (ORDER BY column2) FROM data; ----- -[2, 3] -[2, 3] -[2, 3] - -query ? -SELECT min(column2) OVER (ORDER BY column2 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) FROM data; ----- -[2, 3] -[2, 3] -[4, 5] - -query ? -SELECT max(column2) OVER (ORDER BY column1 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) FROM data; ----- -[4, 5] -[4, 5] -[2, 3] - query I SELECT count(column1) FROM data; ----