diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index fbe7d5c04b9b..74ae278794ff 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1345,6 +1345,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", + "indexmap", "paste", "serde_json", "sqlparser", diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 0dec14e9178a..69cdf866cf98 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -226,7 +226,12 @@ impl DFSchema { for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { if let Some(qualifier) = qualifier { - qualified_names.insert((qualifier, field.name())); + if !qualified_names.insert((qualifier, field.name())) { + return _schema_err!(SchemaError::DuplicateQualifiedField { + qualifier: Box::new(qualifier.clone()), + name: field.name().to_string(), + }); + } } else if !unqualified_names.insert(field.name()) { return _schema_err!(SchemaError::DuplicateUnqualifiedField { name: field.name().to_string() @@ -1165,7 +1170,10 @@ mod tests { let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let right = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let join = left.join(&right); - assert!(join.err().is_none()); + assert_eq!( + join.unwrap_err().strip_backtrace(), + "Schema error: Schema contains duplicate qualified field name t1.c0", + ); Ok(()) } diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 663d8bf022f0..4d1569f586f6 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -3337,52 +3337,6 @@ mod tests { Ok(()) } - // Table 't1' self join - // Supplementary test of issue: https://github.com/apache/datafusion/issues/7790 - #[tokio::test] - async fn with_column_self_join() -> Result<()> { - let df = test_table().await?.select_columns(&["c1"])?; - let ctx = SessionContext::new(); - - ctx.register_table("t1", df.into_view())?; - - let df = ctx - .table("t1") - .await? - .join( - ctx.table("t1").await?, - JoinType::Inner, - &["c1"], - &["c1"], - None, - )? - .sort(vec![ - // make the test deterministic - col("t1.c1").sort(true, true), - ])? - .limit(0, Some(1))?; - - let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+", - "| c1 | c1 |", - "+----+----+", - "| a | a |", - "+----+----+", - ], - &df_results - ); - - let actual_err = df.clone().with_column("new_column", lit(true)).unwrap_err(); - let expected_err = "Error during planning: Projections require unique expression names \ - but the expression \"t1.c1\" at position 0 and \"t1.c1\" at position 1 have the same name. \ - Consider aliasing (\"AS\") one of them."; - assert_eq!(actual_err.strip_backtrace(), expected_err); - - Ok(()) - } - #[tokio::test] async fn with_column_renamed() -> Result<()> { let df = test_table() diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 55387fea22ee..d7dc1afe4d50 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -48,6 +48,7 @@ datafusion-expr-common = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } +indexmap = { workspace = true } paste = "^1.0" serde_json = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 6a88382061cd..3353926ffb38 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -51,6 +51,7 @@ use datafusion_common::{ DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, FunctionalDependencies, ParamValues, Result, TableReference, UnnestOptions, }; +use indexmap::IndexSet; // backwards compatibility use crate::display::PgJsonVisitor; @@ -3069,6 +3070,8 @@ fn calc_func_dependencies_for_aggregate( let group_by_expr_names = group_expr .iter() .map(|item| item.schema_name().to_string()) + .collect::>() + .into_iter() .collect::>(); let aggregate_func_dependencies = aggregate_functional_dependencies( input.schema(), diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 1d8eb9445eda..9bb53a1d04a0 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -38,6 +38,7 @@ use datafusion_common::{ DataFusionError, Result, TableReference, }; +use indexmap::IndexSet; use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem}; pub use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity; @@ -59,16 +60,7 @@ pub fn exprlist_to_columns(expr: &[Expr], accum: &mut HashSet) -> Result /// Count the number of distinct exprs in a list of group by expressions. If the /// first element is a `GroupingSet` expression then it must be the only expr. pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result { - if let Some(Expr::GroupingSet(grouping_set)) = group_expr.first() { - if group_expr.len() > 1 { - return plan_err!( - "Invalid group by expressions, GroupingSet must be the only expression" - ); - } - Ok(grouping_set.distinct_expr().len()) - } else { - Ok(group_expr.len()) - } + grouping_set_to_exprlist(group_expr).map(|exprs| exprs.len()) } /// The [power set] (or powerset) of a set S is the set of all subsets of S, \ @@ -260,7 +252,11 @@ pub fn grouping_set_to_exprlist(group_expr: &[Expr]) -> Result> { } Ok(grouping_set.distinct_expr()) } else { - Ok(group_expr.iter().collect()) + Ok(group_expr + .iter() + .collect::>() + .into_iter() + .collect()) } } diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index 470bd947c7fb..236167985790 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -345,7 +345,7 @@ fn select_wildcard_with_repeated_column() { let sql = "SELECT *, col_int32 FROM test"; let err = test_sql(sql).expect_err("query should have failed"); assert_eq!( - "expand_wildcard_rule\ncaused by\nError during planning: Projections require unique expression names but the expression \"test.col_int32\" at position 0 and \"test.col_int32\" at position 7 have the same name. Consider aliasing (\"AS\") one of them.", + "Schema error: Schema contains duplicate qualified field name test.col_int32", err.strip_backtrace() ); } @@ -396,7 +396,7 @@ fn test_sql(sql: &str) -> Result { .with_udaf(count_udaf()) .with_udaf(avg_udaf()); let sql_to_rel = SqlToRel::new(&context_provider); - let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap(); + let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?; let config = OptimizerContext::new().with_skip_failing_rules(false); let analyzer = Analyzer::new(); diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 5cbe1d7c014a..e8defedddf2c 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -197,9 +197,9 @@ impl PlannerContext { /// extends the FROM schema, returning the existing one, if any pub fn extend_outer_from_schema(&mut self, schema: &DFSchemaRef) -> Result<()> { - self.outer_from_schema = match self.outer_from_schema.as_ref() { - Some(from_schema) => Some(Arc::new(from_schema.join(schema)?)), - None => Some(Arc::clone(schema)), + match self.outer_from_schema.as_mut() { + Some(from_schema) => Arc::make_mut(from_schema).merge(schema), + None => self.outer_from_schema = Some(Arc::clone(schema)), }; Ok(()) } diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 8d801b92c393..519fbb887c7e 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -1215,14 +1215,14 @@ statement ok create table t1(v1 int) as values(100); ## Query with Ambiguous column reference -query error DataFusion error: Schema error: Ambiguous reference to unqualified field v1 +query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1 select count(*) from t1 right outer join t1 on t1.v1 > 0; -query error DataFusion error: Schema error: Ambiguous reference to unqualified field v1 +query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1 select t1.v1 from t1 join t1 using(v1) cross join (select struct('foo' as v1) as t1); statement ok -drop table t1; \ No newline at end of file +drop table t1;