From f47ff6c386aaaefe99ef81c55a15dc7328cfd895 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 27 Apr 2024 09:35:11 +0800 Subject: [PATCH 1/6] fix: Correct null_count in describe() --- datafusion/core/src/dataframe/mod.rs | 6 +++--- datafusion/expr/src/expr_fn.rs | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index f877b7d698b4..38e7ccdb26bf 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -49,8 +49,8 @@ use datafusion_common::{ plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions, }; use datafusion_expr::{ - avg, count, is_null, max, median, min, stddev, utils::COUNT_STAR_EXPANSION, - TableProviderFilterPushDown, UNNAMED_TABLE, + avg, count, count_null, max, median, min, stddev, + utils::COUNT_STAR_EXPANSION, TableProviderFilterPushDown, UNNAMED_TABLE, }; use async_trait::async_trait; @@ -534,7 +534,7 @@ impl DataFrame { vec![], original_schema_fields .clone() - .map(|f| count(is_null(col(f.name()))).alias(f.name())) + .map(|f| count_null(col(f.name())).alias(f.name())) .collect::>(), ), // mean aggregation diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 1d976a12cc4f..c88134cc405b 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -215,6 +215,18 @@ pub fn count(expr: Expr) -> Expr { )) } +/// Create an expression to represent the count_null() aggregate function +pub fn count_null(expr: Expr) -> Expr { + Expr::AggregateFunction(AggregateFunction::new( + aggregate_function::AggregateFunction::Count, + vec![expr.clone()], + false, + Some(Box::new(expr.is_null())), + None, + None, + )) +} + /// Return a new expression with bitwise AND pub fn bitwise_and(left: Expr, right: Expr) -> Expr { Expr::BinaryExpr(BinaryExpr::new( @@ -448,6 +460,11 @@ pub fn is_null(expr: Expr) -> Expr { Expr::IsNull(Box::new(expr)) } +/// Create is not null expression +pub fn is_not_null(expr: Expr) -> Expr { + Expr::IsNotNull(Box::new(expr)) +} + /// Create is true expression pub fn is_true(expr: Expr) -> Expr { Expr::IsTrue(Box::new(expr)) From eeebda4afbf0e8f8881b11b16763c651e9819dc4 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 27 Apr 2024 09:35:43 +0800 Subject: [PATCH 2/6] chore: fix fmt --- datafusion/core/src/dataframe/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 38e7ccdb26bf..ff9b2fdc1df1 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -49,8 +49,8 @@ use datafusion_common::{ plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions, }; use datafusion_expr::{ - avg, count, count_null, max, median, min, stddev, - utils::COUNT_STAR_EXPANSION, TableProviderFilterPushDown, UNNAMED_TABLE, + avg, count, count_null, max, median, min, stddev, utils::COUNT_STAR_EXPANSION, + TableProviderFilterPushDown, UNNAMED_TABLE, }; use async_trait::async_trait; From 98784c50fed24c9bc6e0740bc2236c3b619b350a Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 27 Apr 2024 10:08:43 +0800 Subject: [PATCH 3/6] chore: Fix ci --- datafusion/core/tests/dataframe/describe.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/dataframe/describe.rs b/datafusion/core/tests/dataframe/describe.rs index e82c06efd644..c0077401898f 100644 --- a/datafusion/core/tests/dataframe/describe.rs +++ b/datafusion/core/tests/dataframe/describe.rs @@ -39,7 +39,7 @@ async fn describe() -> Result<()> { "| describe | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | year | month |", "+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+", "| count | 7300.0 | 7300 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300 | 7300 | 7300 | 7300.0 | 7300.0 |", - "| null_count | 7300.0 | 7300 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300 | 7300 | 7300 | 7300.0 | 7300.0 |", + "| null_count | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 |", "| mean | 3649.5 | null | 4.5 | 4.5 | 4.5 | 45.0 | 4.949999964237213 | 45.45 | null | null | null | 2009.5 | 6.526027397260274 |", "| std | 2107.472815166704 | null | 2.8724780750809518 | 2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 | 3.1597258182544645 | 29.012028558317645 | null | null | null | 0.5000342500942125 | 3.44808750051728 |", "| min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 01/01/09 | 0 | 2008-12-31T23:00:00 | 2009.0 | 1.0 |", @@ -69,7 +69,7 @@ async fn describe_boolean_binary() -> Result<()> { "| describe | a | b |", "+------------+------+------+", "| count | 1 | 1 |", - "| null_count | 1 | 1 |", + "| null_count | 0 | 0 |", "| mean | null | null |", "| std | null | null |", "| min | a | null |", From 3a2220776bf9dc0473edddca14b08455086b20a8 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 27 Apr 2024 21:08:20 +0800 Subject: [PATCH 4/6] fix: Update comment --- datafusion/expr/src/expr_fn.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index c88134cc405b..132f4bd9ef0c 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -215,7 +215,7 @@ pub fn count(expr: Expr) -> Expr { )) } -/// Create an expression to represent the count_null() aggregate function +/// Create an expression to represent the COUNT(IS NULL x) aggregate function pub fn count_null(expr: Expr) -> Expr { Expr::AggregateFunction(AggregateFunction::new( aggregate_function::AggregateFunction::Count, From de67422e1f404c904d1afe0c7953522e034fbe32 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Tue, 30 Apr 2024 16:25:11 +0800 Subject: [PATCH 5/6] fix: refactor null_count calculation in describe() and add test --- datafusion/core/src/dataframe/mod.rs | 12 +++++++-- datafusion/core/tests/dataframe/describe.rs | 30 +++++++++++++++++++++ datafusion/expr/src/expr_fn.rs | 12 --------- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index ff9b2fdc1df1..4644e15febef 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -48,10 +48,12 @@ use datafusion_common::config::{CsvOptions, FormatOptions, JsonOptions}; use datafusion_common::{ plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions, }; +use datafusion_expr::lit; use datafusion_expr::{ - avg, count, count_null, max, median, min, stddev, utils::COUNT_STAR_EXPANSION, + avg, count, max, median, min, stddev, utils::COUNT_STAR_EXPANSION, TableProviderFilterPushDown, UNNAMED_TABLE, }; +use datafusion_expr::{case, is_null, sum}; use async_trait::async_trait; @@ -534,7 +536,13 @@ impl DataFrame { vec![], original_schema_fields .clone() - .map(|f| count_null(col(f.name())).alias(f.name())) + .map(|f| { + sum(case(is_null(col(f.name()))) + .when(lit(true), lit(1)) + .otherwise(lit(0)) + .unwrap()) + .alias(f.name()) + }) .collect::>(), ), // mean aggregation diff --git a/datafusion/core/tests/dataframe/describe.rs b/datafusion/core/tests/dataframe/describe.rs index c0077401898f..e446d71473be 100644 --- a/datafusion/core/tests/dataframe/describe.rs +++ b/datafusion/core/tests/dataframe/describe.rs @@ -81,6 +81,36 @@ async fn describe_boolean_binary() -> Result<()> { Ok(()) } +#[tokio::test] +async fn describe_null() -> Result<()> { + let ctx = parquet_context().await; + + //add test case for only boolean boolean/binary column + let result = ctx + .sql("select 'a' as a, null as b") + .await? + .describe() + .await? + .collect() + .await?; + #[rustfmt::skip] + let expected = [ + "+------------+------+------+", + "| describe | a | b |", + "+------------+------+------+", + "| count | 1 | 0 |", + "| null_count | 0 | 1 |", + "| mean | null | null |", + "| std | null | null |", + "| min | null | null |", + "| max | null | null |", + "| median | null | null |", + "+------------+------+------+" + ]; + assert_batches_eq!(expected, &result); + Ok(()) +} + /// Return a SessionContext with parquet file registered async fn parquet_context() -> SessionContext { let ctx = SessionContext::new(); diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 132f4bd9ef0c..d26fcca7b033 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -215,18 +215,6 @@ pub fn count(expr: Expr) -> Expr { )) } -/// Create an expression to represent the COUNT(IS NULL x) aggregate function -pub fn count_null(expr: Expr) -> Expr { - Expr::AggregateFunction(AggregateFunction::new( - aggregate_function::AggregateFunction::Count, - vec![expr.clone()], - false, - Some(Box::new(expr.is_null())), - None, - None, - )) -} - /// Return a new expression with bitwise AND pub fn bitwise_and(left: Expr, right: Expr) -> Expr { Expr::BinaryExpr(BinaryExpr::new( From ec5fe1817f4b97e811c27c08a8dce4c26a802daa Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Tue, 30 Apr 2024 16:29:13 +0800 Subject: [PATCH 6/6] chore --- datafusion/expr/src/expr_fn.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index d26fcca7b033..1d976a12cc4f 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -448,11 +448,6 @@ pub fn is_null(expr: Expr) -> Expr { Expr::IsNull(Box::new(expr)) } -/// Create is not null expression -pub fn is_not_null(expr: Expr) -> Expr { - Expr::IsNotNull(Box::new(expr)) -} - /// Create is true expression pub fn is_true(expr: Expr) -> Expr { Expr::IsTrue(Box::new(expr))