Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions datafusion/common/src/dfschema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,12 @@ impl DFSchema {

for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) {
if let Some(qualifier) = qualifier {
qualified_names.insert((qualifier, field.name()));
if !qualified_names.insert((qualifier, field.name())) {
return _schema_err!(SchemaError::DuplicateQualifiedField {
qualifier: Box::new(qualifier.clone()),
name: field.name().to_string(),
});
}
} else if !unqualified_names.insert(field.name()) {
return _schema_err!(SchemaError::DuplicateUnqualifiedField {
name: field.name().to_string()
Expand Down Expand Up @@ -1165,7 +1170,10 @@ mod tests {
let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
let right = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
let join = left.join(&right);
assert!(join.err().is_none());
assert_eq!(
join.unwrap_err().strip_backtrace(),
"Schema error: Schema contains duplicate qualified field name t1.c0",
);
Ok(())
}

Expand Down
46 changes: 0 additions & 46 deletions datafusion/core/src/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3337,52 +3337,6 @@ mod tests {
Ok(())
}

// Table 't1' self join
// Supplementary test of issue: https://github.com/apache/datafusion/issues/7790
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this test removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should have explained that more in the description. Before this change, the test verified that calling with_column after a self join without an alias would fail in a certain way. After this change, the query fails already at the self join, so there is no way to make it test what it tested before, and I believe the self join already has test coverage elsewhere.

// NOTE(review): exercises an unaliased self join of table `t1` followed by
// `with_column`; the join itself succeeds here, while the later projection
// is expected to fail on the duplicate `t1.c1` expression name.
#[tokio::test]
async fn with_column_self_join() -> Result<()> {
// Build a single-column view and register it as `t1`.
let df = test_table().await?.select_columns(&["c1"])?;
let ctx = SessionContext::new();

ctx.register_table("t1", df.into_view())?;

// Inner self join `t1 JOIN t1 ON c1 = c1` — both sides share the
// qualifier `t1`, so the result schema carries two `t1.c1` fields.
let df = ctx
.table("t1")
.await?
.join(
ctx.table("t1").await?,
JoinType::Inner,
&["c1"],
&["c1"],
None,
)?
.sort(vec![
// make the test deterministic
col("t1.c1").sort(true, true),
])?
.limit(0, Some(1))?;

// The join result itself is collectable: one row, two `c1` columns.
let df_results = df.clone().collect().await?;
assert_batches_sorted_eq!(
[
"+----+----+",
"| c1 | c1 |",
"+----+----+",
"| a | a |",
"+----+----+",
],
&df_results
);

// Adding a column forces a projection over the ambiguous schema, which
// must fail with a duplicate-expression-name planning error.
let actual_err = df.clone().with_column("new_column", lit(true)).unwrap_err();
let expected_err = "Error during planning: Projections require unique expression names \
but the expression \"t1.c1\" at position 0 and \"t1.c1\" at position 1 have the same name. \
Consider aliasing (\"AS\") one of them.";
assert_eq!(actual_err.strip_backtrace(), expected_err);

Ok(())
}

#[tokio::test]
async fn with_column_renamed() -> Result<()> {
let df = test_table()
Expand Down
1 change: 1 addition & 0 deletions datafusion/expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ datafusion-expr-common = { workspace = true }
datafusion-functions-aggregate-common = { workspace = true }
datafusion-functions-window-common = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
indexmap = { workspace = true }
paste = "^1.0"
serde_json = { workspace = true }
sqlparser = { workspace = true }
Expand Down
3 changes: 3 additions & 0 deletions datafusion/expr/src/logical_plan/plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ use datafusion_common::{
DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence,
FunctionalDependencies, ParamValues, Result, TableReference, UnnestOptions,
};
use indexmap::IndexSet;

// backwards compatibility
use crate::display::PgJsonVisitor;
Expand Down Expand Up @@ -3069,6 +3070,8 @@ fn calc_func_dependencies_for_aggregate(
let group_by_expr_names = group_expr
.iter()
.map(|item| item.schema_name().to_string())
.collect::<IndexSet<_>>()
.into_iter()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we directly remove duplicates from group_expr so that we don't need to perform duplicate removal here again?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I played around with trying to make this work and I found it was not easy -- perhaps we can do it as a follow on PR?

.collect::<Vec<_>>();
let aggregate_func_dependencies = aggregate_functional_dependencies(
input.schema(),
Expand Down
18 changes: 7 additions & 11 deletions datafusion/expr/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use datafusion_common::{
DataFusionError, Result, TableReference,
};

use indexmap::IndexSet;
use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem};

pub use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity;
Expand All @@ -59,16 +60,7 @@ pub fn exprlist_to_columns(expr: &[Expr], accum: &mut HashSet<Column>) -> Result
/// Count the number of distinct exprs in a list of group by expressions. If the
/// first element is a `GroupingSet` expression then it must be the only expr.
pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
if let Some(Expr::GroupingSet(grouping_set)) = group_expr.first() {
if group_expr.len() > 1 {
return plan_err!(
"Invalid group by expressions, GroupingSet must be the only expression"
);
}
Ok(grouping_set.distinct_expr().len())
} else {
Ok(group_expr.len())
}
grouping_set_to_exprlist(group_expr).map(|exprs| exprs.len())
}

/// The [power set] (or powerset) of a set S is the set of all subsets of S, \
Expand Down Expand Up @@ -260,7 +252,11 @@ pub fn grouping_set_to_exprlist(group_expr: &[Expr]) -> Result<Vec<&Expr>> {
}
Ok(grouping_set.distinct_expr())
} else {
Ok(group_expr.iter().collect())
Ok(group_expr
.iter()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering if we can get rid of the double collection iteration?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The double iteration is used to deduplicate the values

.collect::<IndexSet<_>>()
.into_iter()
.collect())
}
}

Expand Down
4 changes: 2 additions & 2 deletions datafusion/optimizer/tests/optimizer_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ fn select_wildcard_with_repeated_column() {
let sql = "SELECT *, col_int32 FROM test";
let err = test_sql(sql).expect_err("query should have failed");
assert_eq!(
"expand_wildcard_rule\ncaused by\nError during planning: Projections require unique expression names but the expression \"test.col_int32\" at position 0 and \"test.col_int32\" at position 7 have the same name. Consider aliasing (\"AS\") one of them.",
"Schema error: Schema contains duplicate qualified field name test.col_int32",
err.strip_backtrace()
);
}
Expand Down Expand Up @@ -396,7 +396,7 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
.with_udaf(count_udaf())
.with_udaf(avg_udaf());
let sql_to_rel = SqlToRel::new(&context_provider);
let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap();
let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?;

let config = OptimizerContext::new().with_skip_failing_rules(false);
let analyzer = Analyzer::new();
Expand Down
6 changes: 3 additions & 3 deletions datafusion/sql/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,9 @@ impl PlannerContext {

/// extends the FROM schema, returning the existing one, if any
pub fn extend_outer_from_schema(&mut self, schema: &DFSchemaRef) -> Result<()> {
self.outer_from_schema = match self.outer_from_schema.as_ref() {
Some(from_schema) => Some(Arc::new(from_schema.join(schema)?)),
None => Some(Arc::clone(schema)),
match self.outer_from_schema.as_mut() {
Some(from_schema) => Arc::make_mut(from_schema).merge(schema),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIL Arc::make_mut 📓

None => self.outer_from_schema = Some(Arc::clone(schema)),
Comment on lines -200 to +202
Copy link
Contributor Author

@eejbyfeldt eejbyfeldt Sep 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@aalexandrov Can you help verify that this is correct?

I am not fully following your comment here: #11456 (comment) At least in the current code merge will still have both values for j1_id since they will have different qualifiers.

};
Ok(())
}
Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/join.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1215,14 +1215,14 @@ statement ok
create table t1(v1 int) as values(100);

## Query with Ambiguous column reference
query error DataFusion error: Schema error: Ambiguous reference to unqualified field v1
query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1
select count(*)
from t1
right outer join t1
on t1.v1 > 0;

query error DataFusion error: Schema error: Ambiguous reference to unqualified field v1
query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1
select t1.v1 from t1 join t1 using(v1) cross join (select struct('foo' as v1) as t1);

statement ok
drop table t1;
drop table t1;