Add "Extended" clickbench queries (#8861)

alamb · web-flow · commit 08de64d3778e · 2024-01-16T09:51:19.000+01:00
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
@@ -74,6 +74,7 @@ parquet:                Benchmark of parquet reader's filtering speed
 sort:                   Benchmark of sorting speed
 clickbench_1:           ClickBench queries against a single parquet file
 clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
+clickbench_extended:    ClickBench "inspired" queries against a single parquet (DataFusion specific)
 
 **********
 * Supported Configuration (Environment Variables)
@@ -155,6 +156,9 @@ main() {
                 clickbench_partitioned)
                     data_clickbench_partitioned
                     ;;
+                clickbench_extended)
+                    data_clickbench_1
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
                     usage
@@ -193,6 +197,7 @@ main() {
                     run_sort
                     run_clickbench_1
                     run_clickbench_partitioned
+                    run_clickbench_extended
                     ;;
                 tpch)
                     run_tpch "1"
@@ -218,6 +223,9 @@ main() {
                 clickbench_partitioned)
                     run_clickbench_partitioned
                     ;;
+                clickbench_extended)
+                    run_clickbench_extended
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for run"
                     usage
@@ -401,6 +409,15 @@ run_clickbench_partitioned() {
     $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
 }
 
+# Runs the clickbench "extended" benchmark with a single large parquet file
+run_clickbench_extended() {
+    # Results are written as JSON under RESULTS_DIR.
+    RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running clickbench (1 file) extended benchmark..."
+    # NOTE(review): $CARGO_COMMAND is left unquoted, presumably because it holds
+    # a command plus its arguments and relies on word splitting -- confirm.
+    # The output path is quoted so a results directory containing whitespace
+    # is passed to dfbench as a single argument.
+    $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
+}
+
+
 compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
     BRANCH1="${ARG2}"
diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md
@@ -0,0 +1,33 @@
+# ClickBench queries
+
+This directory contains queries for the ClickBench benchmark https://benchmark.clickhouse.com/
+
+ClickBench is focused on aggregation and filtering performance (though it has no joins).
+
+## Files:
+* `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
+* `extended.sql` - "Extended" DataFusion specific queries. 
+
+[ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql
+
+## "Extended" Queries 
+The "extended" queries are not part of the official ClickBench benchmark.
+Instead they are used to test other DataFusion features that are not
+covered by the standard benchmark.
+
+Each description below is for the corresponding line in `extended.sql` (line 1
+is `Q0`, line 2 is `Q1`, etc.)  
+
+### Q0
+Models initial data exploration, to understand some statistics of the data.
+Important Query Properties: multiple `COUNT DISTINCT` on strings
+
+```sql
+SELECT 
+    COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") 
+FROM hits;
+```
+
+
+
+
diff --git a/benchmarks/queries/clickbench/README.txt b/benchmarks/queries/clickbench/README.txt
diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql
@@ -0,0 +1 @@
+SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs
@@ -15,13 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::path::Path;
 use std::{path::PathBuf, time::Instant};
 
 use datafusion::{
-    common::exec_err,
     error::{DataFusionError, Result},
     prelude::SessionContext,
 };
+use datafusion_common::exec_datafusion_err;
 use structopt::StructOpt;
 
 use crate::{BenchmarkRun, CommonOpt};
@@ -69,15 +70,49 @@ pub struct RunOpt {
     output_path: Option<PathBuf>,
 }
 
-const CLICKBENCH_QUERY_START_ID: usize = 0;
-const CLICKBENCH_QUERY_END_ID: usize = 42;
+/// All queries read from a ClickBench-style query file.
+///
+/// The file holds one query per line; query ids are zero based, so query
+/// `N` is the text of line `N + 1` of the file.
+struct AllQueries {
+    /// Query text indexed by query id. Never empty when built by `try_new`.
+    queries: Vec<String>,
+}
+
+impl AllQueries {
+    /// Reads every line of the file at `path` as one query.
+    ///
+    /// # Errors
+    /// Returns an execution error if the file cannot be read, or if it
+    /// contains no lines at all (there would be no valid query id).
+    fn try_new(path: &Path) -> Result<Self> {
+        // ClickBench has all queries in a single file identified by line number
+        let all_queries = std::fs::read_to_string(path)
+            .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
+        let queries: Vec<String> = all_queries.lines().map(String::from).collect();
+        // Reject an empty file up front: otherwise `max_query_id` has no
+        // meaningful value and the default query range would be bogus.
+        if queries.is_empty() {
+            return Err(exec_datafusion_err!("No queries found in {path:?}"));
+        }
+        Ok(Self { queries })
+    }
+
+    /// Returns the text of query `query_id`
+    ///
+    /// # Errors
+    /// Returns an execution error naming the valid id range when
+    /// `query_id` is out of bounds.
+    fn get_query(&self, query_id: usize) -> Result<&str> {
+        self.queries
+            .get(query_id)
+            .ok_or_else(|| {
+                let min_id = self.min_query_id();
+                let max_id = self.max_query_id();
+                exec_datafusion_err!(
+                    "Invalid query id {query_id}. Must be between {min_id} and {max_id}"
+                )
+            })
+            .map(|s| s.as_str())
+    }
+
+    /// Smallest valid query id (ids are zero based).
+    fn min_query_id(&self) -> usize {
+        0
+    }
 
+    /// Largest valid query id.
+    ///
+    /// Uses a saturating subtraction so an empty query list can never
+    /// underflow (panic in debug builds, wrap to `usize::MAX` in release).
+    fn max_query_id(&self) -> usize {
+        self.queries.len().saturating_sub(1)
+    }
+}
 impl RunOpt {
     pub async fn run(self) -> Result<()> {
         println!("Running benchmarks with the following options: {self:?}");
+        let queries = AllQueries::try_new(self.queries_path.as_path())?;
         let query_range = match self.query {
             Some(query_id) => query_id..=query_id,
-            None => CLICKBENCH_QUERY_START_ID..=CLICKBENCH_QUERY_END_ID,
+            None => queries.min_query_id()..=queries.max_query_id(),
         };
 
         let config = self.common.config();
@@ -88,12 +123,12 @@ impl RunOpt {
         let mut benchmark_run = BenchmarkRun::new();
         for query_id in query_range {
             benchmark_run.start_new_case(&format!("Query {query_id}"));
-            let sql = self.get_query(query_id)?;
+            let sql = queries.get_query(query_id)?;
             println!("Q{query_id}: {sql}");
 
             for i in 0..iterations {
                 let start = Instant::now();
-                let results = ctx.sql(&sql).await?.collect().await?;
+                let results = ctx.sql(sql).await?.collect().await?;
                 let elapsed = start.elapsed();
                 let ms = elapsed.as_secs_f64() * 1000.0;
                 let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
@@ -120,23 +155,4 @@ impl RunOpt {
                 )
             })
     }
-
-    /// Returns the text of query `query_id`
-    fn get_query(&self, query_id: usize) -> Result<String> {
-        if query_id > CLICKBENCH_QUERY_END_ID {
-            return exec_err!(
-                "Invalid query id {query_id}. Must be between {CLICKBENCH_QUERY_START_ID} and {CLICKBENCH_QUERY_END_ID}"
-            );
-        }
-
-        let path = self.queries_path.as_path();
-
-        // ClickBench has all queries in a single file identified by line number
-        let all_queries = std::fs::read_to_string(path).map_err(|e| {
-            DataFusionError::Execution(format!("Could not open {path:?}: {e}"))
-        })?;
-        let all_queries: Vec<_> = all_queries.lines().collect();
-
-        Ok(all_queries.get(query_id).map(|s| s.to_string()).unwrap())
-    }
 }

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;`