
Commit cec318a

alamb and comphead committed

Add standalone example of using the SQL frontend (apache#11088)

* Add standalone example of using the SQL frontend
* Disable debug info for all example jobs
* Revert "Disable debug info for all example jobs"
  This reverts commit f222a10.
* fix type
* Int32 --> Int8
* Use assert_eq rather than println
* Update datafusion-examples/examples/sql_frontend.rs

Co-authored-by: Oleks V <[email protected]>

1 parent d1af056 · commit cec318a

File tree: 3 files changed, +219 -8 lines


datafusion-examples/README.md

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ cargo run --example csv_sql
 - [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
 - [`simple_udwf.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
 - [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with DataFusion structures
+- [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only) from sql strings
 - [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser`
 - [`to_char.rs`](examples/to_char.rs): Examples of using the to_char function
 - [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using to_timestamp functions
datafusion-examples/examples/sql_frontend.rs

Lines changed: 207 additions & 0 deletions
@@ -0,0 +1,207 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use datafusion_common::config::ConfigOptions;
use datafusion_common::{plan_err, Result};
use datafusion_expr::{
    AggregateUDF, Expr, LogicalPlan, ScalarUDF, TableProviderFilterPushDown, TableSource,
    WindowUDF,
};
use datafusion_optimizer::{
    Analyzer, AnalyzerRule, Optimizer, OptimizerConfig, OptimizerContext, OptimizerRule,
};
use datafusion_sql::planner::{ContextProvider, SqlToRel};
use datafusion_sql::sqlparser::dialect::PostgreSqlDialect;
use datafusion_sql::sqlparser::parser::Parser;
use datafusion_sql::TableReference;
use std::any::Any;
use std::sync::Arc;

/// This example shows how to use DataFusion's SQL planner to parse SQL text and
/// build `LogicalPlan`s without executing them.
///
/// For example, if you need a SQL planner and optimizer like Apache Calcite,
/// but do not want a Java runtime dependency for some reason, you could use
/// DataFusion as a SQL frontend.
///
/// Normally, users interact with DataFusion via `SessionContext`. However, using
/// `SessionContext` requires depending on the full `datafusion` core crate.
///
/// In this example, we demonstrate how to use the lower level APIs directly,
/// which require only the smaller `datafusion-sql`, `datafusion-expr`, and
/// `datafusion-optimizer` crates rather than the full `datafusion` crate.
pub fn main() -> Result<()> {
    // First, we parse the SQL string using the `sqlparser-rs` parser
    // (re-exported by `datafusion-sql`). DataFusion's own `DFParser` wraps it
    // and adds DataFusion specific syntax such as `CREATE EXTERNAL TABLE`
    let dialect = PostgreSqlDialect {};
    let sql = "SELECT name FROM person WHERE age BETWEEN 21 AND 32";
    let statements = Parser::parse_sql(&dialect, sql)?;

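    // (Aside, not part of the original example: `parse_sql` returns one parsed
    // statement per semicolon-separated query, so a frontend accepting user
    // input might sanity-check the count before planning)
    assert_eq!(statements.len(), 1);
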
    // Now, use DataFusion's SQL planner, called `SqlToRel`, to create a
    // `LogicalPlan` from the parsed statement
    //
    // To invoke SqlToRel we must provide it schema and function information
    // via an object that implements the `ContextProvider` trait
    let context_provider = MyContextProvider::default();
    let sql_to_rel = SqlToRel::new(&context_provider);
    let logical_plan = sql_to_rel.sql_statement_to_plan(statements[0].clone())?;

    // Here is the logical plan that was generated:
    assert_eq!(
        logical_plan.display_indent().to_string(),
        "Projection: person.name\
        \n  Filter: person.age BETWEEN Int64(21) AND Int64(32)\
        \n    TableScan: person"
    );

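    // (Aside, not part of the original example: `LogicalPlan` has other display
    // helpers too; `display_indent_schema()` also shows each node's schema, and
    // `display_graphviz()` emits a graph suitable for visualization.)
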
    // The initial LogicalPlan is a mechanical translation from the parsed SQL
    // and often cannot run without the Analyzer passes.
    //
    // In this example, `person.age` is a different data type (UInt8) than the
    // values it is compared to, which are Int64. Most execution engines,
    // including DataFusion's, will fail if you provide such a plan.
    //
    // To prepare it to run, we must apply type coercion to align types, and
    // check for other semantic errors. In DataFusion this is done by a
    // component called the Analyzer.
    let config = OptimizerContext::default().with_skip_failing_rules(false);
    let analyzed_plan = Analyzer::new().execute_and_check(
        logical_plan,
        config.options(),
        observe_analyzer,
    )?;

    // Note that the Analyzer has added a CAST to the plan to align the types
    assert_eq!(
        analyzed_plan.display_indent().to_string(),
        "Projection: person.name\
        \n  Filter: CAST(person.age AS Int64) BETWEEN Int64(21) AND Int64(32)\
        \n    TableScan: person",
    );

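    // (Aside, not part of the original example: the coercion above is performed
    // by the Analyzer's built-in rules, such as type coercion. `Analyzer::new()`
    // installs the default rule set, while `Analyzer::with_rules` accepts a
    // custom list of `AnalyzerRule`s instead.)
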
    // As we can see, the Analyzer added a CAST so the types are the same
    // (Int64). However, this plan is not as efficient as it could be, as it
    // will require casting *each row* of the input to Int64 before comparison
    // to 21 and 32. To optimize this query's performance, it is better to cast
    // the constants once at plan time to UInt8.
    //
    // Query optimization is handled in DataFusion by a component called the
    // Optimizer, which we now invoke
    //
    let optimized_plan =
        Optimizer::new().optimize(analyzed_plan, &config, observe_optimizer)?;

    // Show the fully optimized plan. Note that the optimizer did several things
    // to prepare this plan for execution:
    //
    // 1. Removed casts from person.age as we described above
    // 2. Converted BETWEEN to two single column inequalities (which are typically faster to execute)
    // 3. Pushed the projection of `name` down to the scan (so the scan only returns that column)
    // 4. Pushed the filter into the scan
    // 5. Removed the projection as it was only serving to pass through the name column
    assert_eq!(
        optimized_plan.display_indent().to_string(),
        "TableScan: person projection=[name], full_filters=[person.age >= UInt8(21), person.age <= UInt8(32)]"
    );

    Ok(())
}
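
// (Aside, not part of the original example: a small helper sketching how the
// parse and plan steps above could be wrapped into one reusable function. The
// `plan_sql` name is hypothetical; it uses only APIs already imported above.)
#[allow(dead_code)]
fn plan_sql(sql: &str, context_provider: &MyContextProvider) -> Result<LogicalPlan> {
    let dialect = PostgreSqlDialect {};
    // Parse, then insist on exactly one statement before planning
    let mut statements = Parser::parse_sql(&dialect, sql)?;
    if statements.len() != 1 {
        return plan_err!("expected one statement, got {}", statements.len());
    }
    SqlToRel::new(context_provider).sql_statement_to_plan(statements.pop().unwrap())
}
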
// Note that both the optimizer and the analyzer take a callback, called an
// "observer", that is invoked after each pass. We do not do anything with these
// callbacks in this example.

fn observe_analyzer(_plan: &LogicalPlan, _rule: &dyn AnalyzerRule) {}
fn observe_optimizer(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {}

/// Implements the `ContextProvider` trait required to plan SQL
#[derive(Default)]
struct MyContextProvider {
    options: ConfigOptions,
}

impl ContextProvider for MyContextProvider {
    fn get_table_source(&self, name: TableReference) -> Result<Arc<dyn TableSource>> {
        if name.table() == "person" {
            Ok(Arc::new(MyTableSource {
                schema: Arc::new(Schema::new(vec![
                    Field::new("name", DataType::Utf8, false),
                    Field::new("age", DataType::UInt8, false),
                ])),
            }))
        } else {
            plan_err!("Table {} not found", name.table())
        }
    }

    fn get_function_meta(&self, _name: &str) -> Option<Arc<ScalarUDF>> {
        None
    }

    fn get_aggregate_meta(&self, _name: &str) -> Option<Arc<AggregateUDF>> {
        None
    }

    fn get_variable_type(&self, _variable_names: &[String]) -> Option<DataType> {
        None
    }

    fn get_window_meta(&self, _name: &str) -> Option<Arc<WindowUDF>> {
        None
    }

    fn options(&self) -> &ConfigOptions {
        &self.options
    }

    fn udf_names(&self) -> Vec<String> {
        Vec::new()
    }

    fn udaf_names(&self) -> Vec<String> {
        Vec::new()
    }

    fn udwf_names(&self) -> Vec<String> {
        Vec::new()
    }
}
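
// (Aside, not part of the original example: `get_table_source` is the only
// lookup this particular query exercises. The function and variable lookups
// return `None` and the name lists are empty because the query uses no UDFs
// or variables; a provider backed by a real catalog would resolve those too.)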

/// TableSource is the part of TableProvider needed for creating a LogicalPlan.
struct MyTableSource {
    schema: SchemaRef,
}

impl TableSource for MyTableSource {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }

    // For this example, we report to the DataFusion optimizer that
    // this provider can apply filters during the scan
    fn supports_filters_pushdown(
        &self,
        filters: &[&Expr],
    ) -> Result<Vec<TableProviderFilterPushDown>> {
        Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
    }
}
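
// (Aside, not part of the original example: returning
// `TableProviderFilterPushDown::Exact` promises the optimizer that the source
// applies each filter completely, which is why the optimized plan above has no
// separate Filter node. Returning `Inexact` would push the filter into the
// scan but keep the Filter node to re-check rows, and `Unsupported` would not
// push the filter down at all.)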

datafusion/expr/src/table_source.rs

Lines changed: 11 additions & 8 deletions
@@ -71,14 +71,17 @@ impl std::fmt::Display for TableType {
     }
 }
 
-/// The TableSource trait is used during logical query planning and optimizations and
-/// provides access to schema information and filter push-down capabilities. This trait
-/// provides a subset of the functionality of the TableProvider trait in the core
-/// datafusion crate. The TableProvider trait provides additional capabilities needed for
-/// physical query execution (such as the ability to perform a scan). The reason for
-/// having two separate traits is to avoid having the logical plan code be dependent
-/// on the DataFusion execution engine. Other projects may want to use DataFusion's
-/// logical plans and have their own execution engine.
+/// Access schema information and filter push-down capabilities.
+///
+/// The TableSource trait is used during logical query planning and
+/// optimizations and provides a subset of the functionality of the
+/// `TableProvider` trait in the (core) `datafusion` crate. The `TableProvider`
+/// trait provides additional capabilities needed for physical query execution
+/// (such as the ability to perform a scan).
+///
+/// The reason for having two separate traits is to avoid having the logical
+/// plan code be dependent on the DataFusion execution engine. Some projects use
+/// DataFusion's logical plans and have their own execution engine.
 pub trait TableSource: Sync + Send {
     fn as_any(&self) -> &dyn Any;
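
As an aside (not part of this commit): most other TableSource methods, such as supports_filters_pushdown, have default implementations, so the smallest implementor needs only as_any and schema. A minimal sketch under that assumption, with a hypothetical MinimalSource type and the same imports as the sql_frontend example above:

struct MinimalSource {
    schema: SchemaRef,
}

impl TableSource for MinimalSource {
    // Allow callers to downcast back to the concrete type
    fn as_any(&self) -> &dyn Any {
        self
    }

    // Report the table schema used for name and type resolution
    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }
}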
