Skip to content

Commit d43a70d

Browse files
authored
Minor: Add new bloom filter predicate tests (#8433)
* Minor: Add new bloom filter tests * fmt
1 parent d771f26 commit d43a70d

File tree

1 file changed

+113
-4
lines changed
  • datafusion/core/src/datasource/physical_plan/parquet

1 file changed

+113
-4
lines changed

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

Lines changed: 113 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ mod tests {
350350
use arrow::datatypes::Schema;
351351
use arrow::datatypes::{DataType, Field};
352352
use datafusion_common::{config::ConfigOptions, TableReference, ToDFSchema};
353+
use datafusion_common::{DataFusionError, Result};
353354
use datafusion_expr::{
354355
builder::LogicalTableSource, cast, col, lit, AggregateUDF, Expr, ScalarUDF,
355356
TableSource, WindowUDF,
@@ -994,6 +995,26 @@ mod tests {
994995
create_physical_expr(expr, &df_schema, schema, &execution_props).unwrap()
995996
}
996997

998+
// Note the values in the `String` column are:
999+
// ❯ select * from './parquet-testing/data/data_index_bloom_encoding_stats.parquet';
1000+
// +-----------+
1001+
// | String |
1002+
// +-----------+
1003+
// | Hello |
1004+
// | This is |
1005+
// | a |
1006+
// | test |
1007+
// | How |
1008+
// | are you |
1009+
// | doing |
1010+
// | today |
1011+
// | the quick |
1012+
// | brown fox |
1013+
// | jumps |
1014+
// | over |
1015+
// | the lazy |
1016+
// | dog |
1017+
// +-----------+
9971018
#[tokio::test]
9981019
async fn test_row_group_bloom_filter_pruning_predicate_simple_expr() {
9991020
// load parquet file
@@ -1002,7 +1023,7 @@ mod tests {
10021023
let path = format!("{testdata}/{file_name}");
10031024
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
10041025

1005-
// generate pruning predicate
1026+
// generate pruning predicate `(String = "Hello_Not_Exists")`
10061027
let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]);
10071028
let expr = col(r#""String""#).eq(lit("Hello_Not_Exists"));
10081029
let expr = logical2physical(&expr, &schema);
@@ -1029,7 +1050,7 @@ mod tests {
10291050
let path = format!("{testdata}/{file_name}");
10301051
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
10311052

1032-
// generate pruning predicate
1053+
// generate pruning predicate `(String = "Hello_Not_Exists" OR String = "Hello_Not_Exists2")`
10331054
let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]);
10341055
let expr = lit("1").eq(lit("1")).and(
10351056
col(r#""String""#)
@@ -1091,7 +1112,7 @@ mod tests {
10911112
let path = format!("{testdata}/{file_name}");
10921113
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
10931114

1094-
// generate pruning predicate
1115+
// generate pruning predicate `(String = "Hello")`
10951116
let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]);
10961117
let expr = col(r#""String""#).eq(lit("Hello"));
10971118
let expr = logical2physical(&expr, &schema);
@@ -1110,6 +1131,94 @@ mod tests {
11101131
assert_eq!(pruned_row_groups, row_groups);
11111132
}
11121133

1134+
#[tokio::test]
1135+
async fn test_row_group_bloom_filter_pruning_predicate_with_exists_2_values() {
1136+
// load parquet file
1137+
let testdata = datafusion_common::test_util::parquet_test_data();
1138+
let file_name = "data_index_bloom_encoding_stats.parquet";
1139+
let path = format!("{testdata}/{file_name}");
1140+
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
1141+
1142+
// generate pruning predicate `(String = "Hello") OR (String = "the quick")`
1143+
let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]);
1144+
let expr = col(r#""String""#)
1145+
.eq(lit("Hello"))
1146+
.or(col(r#""String""#).eq(lit("the quick")));
1147+
let expr = logical2physical(&expr, &schema);
1148+
let pruning_predicate =
1149+
PruningPredicate::try_new(expr, Arc::new(schema)).unwrap();
1150+
1151+
let row_groups = vec![0];
1152+
let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate(
1153+
file_name,
1154+
data,
1155+
&pruning_predicate,
1156+
&row_groups,
1157+
)
1158+
.await
1159+
.unwrap();
1160+
assert_eq!(pruned_row_groups, row_groups);
1161+
}
1162+
1163+
#[tokio::test]
1164+
async fn test_row_group_bloom_filter_pruning_predicate_with_exists_3_values() {
1165+
// load parquet file
1166+
let testdata = datafusion_common::test_util::parquet_test_data();
1167+
let file_name = "data_index_bloom_encoding_stats.parquet";
1168+
let path = format!("{testdata}/{file_name}");
1169+
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
1170+
1171+
// generate pruning predicate `(String = "Hello") OR (String = "the quick") OR (String = "are you")`
1172+
let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]);
1173+
let expr = col(r#""String""#)
1174+
.eq(lit("Hello"))
1175+
.or(col(r#""String""#).eq(lit("the quick")))
1176+
.or(col(r#""String""#).eq(lit("are you")));
1177+
let expr = logical2physical(&expr, &schema);
1178+
let pruning_predicate =
1179+
PruningPredicate::try_new(expr, Arc::new(schema)).unwrap();
1180+
1181+
let row_groups = vec![0];
1182+
let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate(
1183+
file_name,
1184+
data,
1185+
&pruning_predicate,
1186+
&row_groups,
1187+
)
1188+
.await
1189+
.unwrap();
1190+
assert_eq!(pruned_row_groups, row_groups);
1191+
}
1192+
1193+
#[tokio::test]
1194+
async fn test_row_group_bloom_filter_pruning_predicate_with_or_not_eq() {
1195+
// load parquet file
1196+
let testdata = datafusion_common::test_util::parquet_test_data();
1197+
let file_name = "data_index_bloom_encoding_stats.parquet";
1198+
let path = format!("{testdata}/{file_name}");
1199+
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
1200+
1201+
// generate pruning predicate `(String != "foo") OR (String != "bar")`
1202+
let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]);
1203+
let expr = col(r#""String""#)
1204+
.not_eq(lit("foo"))
1205+
.or(col(r#""String""#).not_eq(lit("bar")));
1206+
let expr = logical2physical(&expr, &schema);
1207+
let pruning_predicate =
1208+
PruningPredicate::try_new(expr, Arc::new(schema)).unwrap();
1209+
1210+
let row_groups = vec![0];
1211+
let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate(
1212+
file_name,
1213+
data,
1214+
&pruning_predicate,
1215+
&row_groups,
1216+
)
1217+
.await
1218+
.unwrap();
1219+
assert_eq!(pruned_row_groups, row_groups);
1220+
}
1221+
11131222
#[tokio::test]
11141223
async fn test_row_group_bloom_filter_pruning_predicate_without_bloom_filter() {
11151224
// load parquet file
@@ -1118,7 +1227,7 @@ mod tests {
11181227
let path = format!("{testdata}/{file_name}");
11191228
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
11201229

1121-
// generate pruning predicate
1230+
// generate pruning predicate on a column without a bloom filter
11221231
let schema = Schema::new(vec![Field::new("string_col", DataType::Utf8, false)]);
11231232
let expr = col(r#""string_col""#).eq(lit("0"));
11241233
let expr = logical2physical(&expr, &schema);

0 commit comments

Comments
 (0)