From 7bf0ab623fa96b9b85ae494b4f9ddd56b4b92133 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 10 Nov 2023 09:25:03 -0700 Subject: [PATCH 01/11] Assume filters are highly selective if we cannot truly estimate cardinality --- datafusion/physical-plan/src/filter.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 0c44b367e514..7fa85eac653c 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -196,7 +196,9 @@ impl ExecutionPlan for FilterExec { let schema = self.schema(); if !check_support(predicate, &schema) { - return Ok(Statistics::new_unknown(&schema)); + // assume worst case, that the filter is highly selective and + // returns all the rows from its input + return self.input.statistics(); } let input_stats = self.input.statistics()?; From 21af5aa1a8024921fc524bbeb639106f25e9db84 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 10 Nov 2023 11:18:28 -0700 Subject: [PATCH 02/11] fix regression --- datafusion/physical-plan/src/filter.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 7fa85eac653c..7e69900f42b4 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -194,13 +194,19 @@ impl ExecutionPlan for FilterExec { fn statistics(&self) -> Result { let predicate = self.predicate(); + let input_stats = self.input.statistics()?; let schema = self.schema(); if !check_support(predicate, &schema) { // assume worst case, that the filter is highly selective and // returns all the rows from its input - return self.input.statistics(); + let mut stats = input_stats.clone(); + stats.num_rows = match stats.num_rows { + Precision::Exact(n) => Precision::Inexact(n), + Precision::Inexact(n) => Precision::Inexact(n), + Precision::Absent => Precision::Absent, + }; + 
return Ok(stats) } - let input_stats = self.input.statistics()?; let num_rows = input_stats.num_rows; let total_byte_size = input_stats.total_byte_size; From ead9ea1ca770334b571ecb04563788d8206ea558 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 10 Nov 2023 11:24:38 -0700 Subject: [PATCH 03/11] cargo fmt --- datafusion/physical-plan/src/filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 7e69900f42b4..c3d3082b1550 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -205,7 +205,7 @@ impl ExecutionPlan for FilterExec { Precision::Inexact(n) => Precision::Inexact(n), Precision::Absent => Precision::Absent, }; - return Ok(stats) + return Ok(stats); } let num_rows = input_stats.num_rows; From dcd71f3e5ba25cacd90719da4c6b274c9e59b14e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 10 Nov 2023 11:48:36 -0700 Subject: [PATCH 04/11] simplify code --- datafusion/physical-plan/src/filter.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index c3d3082b1550..4e9bd5f2aa42 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -199,13 +199,7 @@ impl ExecutionPlan for FilterExec { if !check_support(predicate, &schema) { // assume worst case, that the filter is highly selective and // returns all the rows from its input - let mut stats = input_stats.clone(); - stats.num_rows = match stats.num_rows { - Precision::Exact(n) => Precision::Inexact(n), - Precision::Inexact(n) => Precision::Inexact(n), - Precision::Absent => Precision::Absent, - }; - return Ok(stats); + return Ok(input_stats.clone().into_inexact()); } let num_rows = input_stats.num_rows; From 6ca5964c884e90fd2f5a2ffe74e60954a3d73ea2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 10 Nov 2023 14:06:46 
-0700 Subject: [PATCH 05/11] Update datafusion/physical-plan/src/filter.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Daniël Heres --- datafusion/physical-plan/src/filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 4e9bd5f2aa42..dbb2b9d9714b 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -197,7 +197,7 @@ impl ExecutionPlan for FilterExec { let input_stats = self.input.statistics()?; let schema = self.schema(); if !check_support(predicate, &schema) { - // assume worst case, that the filter is highly selective and + // assume worst case, that the filter is highly unselective and // returns all the rows from its input return Ok(input_stats.clone().into_inexact()); } From 8669ba66ed3a488d7d7acfb782da57aa240e7ce1 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 10 Nov 2023 16:03:34 -0700 Subject: [PATCH 06/11] add comment with link to follow on issue --- datafusion/physical-plan/src/filter.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index dbb2b9d9714b..638bfecdb7d5 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -199,6 +199,8 @@ impl ExecutionPlan for FilterExec { if !check_support(predicate, &schema) { // assume worst case, that the filter is highly unselective and // returns all the rows from its input + // tracking issue for making this configurable: + // https://github.com/apache/arrow-datafusion/issues/8133 return Ok(input_stats.clone().into_inexact()); } From a3725b4936af8a4d2a6eaf8ca2dab7270c4b48b8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 11 Nov 2023 09:42:30 -0700 Subject: [PATCH 07/11] Use default of 20% selectivity --- datafusion/core/output1.parquet | Bin 0 -> 846 bytes
datafusion/core/output2.parquet.snappy | Bin 0 -> 846 bytes datafusion/core/output3.parquet.snappy.parquet | Bin 0 -> 846 bytes datafusion/physical-plan/src/filter.rs | 10 +++++++--- 4 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 datafusion/core/output1.parquet create mode 100644 datafusion/core/output2.parquet.snappy create mode 100644 datafusion/core/output3.parquet.snappy.parquet diff --git a/datafusion/core/output1.parquet b/datafusion/core/output1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2e2eed2b00d7b89b30e16dcde821f6a554fba478 GIT binary patch literal 846 zcmaJ=F>BjU5I#xIif~MDg75GH5j>=i6jGBy10D>umk^2ulPoO+f^Y@#5ZjUMgsz=J z2z2RC8_%9Pbqe_jow{_(kfA@KW9~_bB-gltlaBAbyZi3@P^)?P8U=3Sj)!@K4S%7iD?1KMLEY{vbH%d_ey3jmdbudAs?$vB*Q8l!d;V;o&^-$o0cc zJ6K?l*3?9rM+_>s<)ITPQLO9jvlGQ+d=XMub7VNC7r`#I(Bm?YE?Q1HXBcX={sp9W_>+6=ycIoEhoMq>Tq(_|f>ymVze_>;zM0%8^=jTXx?4e_pHZV(E@hQa&Ny%{)KqYBjU5I#xIif~MDg75GH5j>=i6jGBy10D>umk^2ulPoO+f^Y@#5ZjUMgsz=J z2z2RC8_%9Pbqe_jow{_(kfA@KW9~_bB-gltlaBAbyZi3@P^)?P8U=3Sj)!@K4S%7iD?1KMLEY{vbH%d_ey3jmdbudAs?$vB*Q8l!d;V;o&^-$o0cc zJ6K?l*3?9rM+_>s<)ITPQLO9jvlGQ+d=XMub7VNC7r`#I(Bm?YE?Q1HXBcX={sp9W_>+6=ycIoEhoMq>Tq(_|f>ymVze_>;zM0%8^=jTXx?4e_pHZV(E@hQa&Ny%{)KqYBjU5I#xIif~MDg75GH5j>=i6jGBy10D>umk^2ulPoO+f^Y@#5ZjUMgsz=J z2z2RC8_%9Pbqe_jow{_(kfA@KW9~_bB-gltlaBAbyZi3@P^)?P8U=3Sj)!@K4S%7iD?1KMLEY{vbH%d_ey3jmdbudAs?$vB*Q8l!d;V;o&^-$o0cc zJ6K?l*3?9rM+_>s<)ITPQLO9jvlGQ+d=XMub7VNC7r`#I(Bm?YE?Q1HXBcX={sp9W_>+6=ycIoEhoMq>Tq(_|f>ymVze_>;zM0%8^=jTXx?4e_pHZV(E@hQa&Ny%{)KqY Date: Sat, 11 Nov 2023 09:53:13 -0700 Subject: [PATCH 08/11] trigger CI From 61b03406c0e687c12c5d30c7a7ed069e3e74824f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 11 Nov 2023 09:53:52 -0700 Subject: [PATCH 09/11] remove files --- datafusion/core/output1.parquet | Bin 846 -> 0 bytes datafusion/core/output2.parquet.snappy | Bin 846 -> 0 bytes datafusion/core/output3.parquet.snappy.parquet | Bin 846 -> 0 bytes 3 files changed, 0 insertions(+), 
0 deletions(-) delete mode 100644 datafusion/core/output1.parquet delete mode 100644 datafusion/core/output2.parquet.snappy delete mode 100644 datafusion/core/output3.parquet.snappy.parquet diff --git a/datafusion/core/output1.parquet b/datafusion/core/output1.parquet deleted file mode 100644 index 2e2eed2b00d7b89b30e16dcde821f6a554fba478..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 846 zcmaJ=F>BjU5I#xIif~MDg75GH5j>=i6jGBy10D>umk^2ulPoO+f^Y@#5ZjUMgsz=J z2z2RC8_%9Pbqe_jow{_(kfA@KW9~_bB-gltlaBAbyZi3@P^)?P8U=3Sj)!@K4S%7iD?1KMLEY{vbH%d_ey3jmdbudAs?$vB*Q8l!d;V;o&^-$o0cc zJ6K?l*3?9rM+_>s<)ITPQLO9jvlGQ+d=XMub7VNC7r`#I(Bm?YE?Q1HXBcX={sp9W_>+6=ycIoEhoMq>Tq(_|f>ymVze_>;zM0%8^=jTXx?4e_pHZV(E@hQa&Ny%{)KqYBjU5I#xIif~MDg75GH5j>=i6jGBy10D>umk^2ulPoO+f^Y@#5ZjUMgsz=J z2z2RC8_%9Pbqe_jow{_(kfA@KW9~_bB-gltlaBAbyZi3@P^)?P8U=3Sj)!@K4S%7iD?1KMLEY{vbH%d_ey3jmdbudAs?$vB*Q8l!d;V;o&^-$o0cc zJ6K?l*3?9rM+_>s<)ITPQLO9jvlGQ+d=XMub7VNC7r`#I(Bm?YE?Q1HXBcX={sp9W_>+6=ycIoEhoMq>Tq(_|f>ymVze_>;zM0%8^=jTXx?4e_pHZV(E@hQa&Ny%{)KqYBjU5I#xIif~MDg75GH5j>=i6jGBy10D>umk^2ulPoO+f^Y@#5ZjUMgsz=J z2z2RC8_%9Pbqe_jow{_(kfA@KW9~_bB-gltlaBAbyZi3@P^)?P8U=3Sj)!@K4S%7iD?1KMLEY{vbH%d_ey3jmdbudAs?$vB*Q8l!d;V;o&^-$o0cc zJ6K?l*3?9rM+_>s<)ITPQLO9jvlGQ+d=XMub7VNC7r`#I(Bm?YE?Q1HXBcX={sp9W_>+6=ycIoEhoMq>Tq(_|f>ymVze_>;zM0%8^=jTXx?4e_pHZV(E@hQa&Ny%{)KqY Date: Sat, 11 Nov 2023 11:35:17 -0700 Subject: [PATCH 10/11] trigger CI From 51503288b8b2dfe981252fe56e3dcfa7e3d237d4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 11 Nov 2023 12:58:02 -0700 Subject: [PATCH 11/11] address feedback --- datafusion/physical-plan/src/filter.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index cb79d9430089..c7db9f892f42 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -205,6 +205,10 @@ impl ExecutionPlan for FilterExec { if let Precision::Inexact(n) = 
stats.num_rows { stats.num_rows = Precision::Inexact((selectivity * n as f32) as usize); } + if let Precision::Inexact(n) = stats.total_byte_size { + stats.total_byte_size = + Precision::Inexact((selectivity * n as f32) as usize); + } return Ok(stats); }