Skip to content

Commit 08de64d

Browse files
authored
Add "Extended" clickbench queries (#8861)
1 parent 4cde998 commit 08de64d

File tree

5 files changed

+92
-26
lines changed

5 files changed

+92
-26
lines changed

benchmarks/bench.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ parquet: Benchmark of parquet reader's filtering speed
7474
sort: Benchmark of sorting speed
7575
clickbench_1: ClickBench queries against a single parquet file
7676
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
77+
clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific)
7778
7879
**********
7980
* Supported Configuration (Environment Variables)
@@ -155,6 +156,9 @@ main() {
155156
clickbench_partitioned)
156157
data_clickbench_partitioned
157158
;;
159+
clickbench_extended)
160+
data_clickbench_1
161+
;;
158162
*)
159163
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
160164
usage
@@ -193,6 +197,7 @@ main() {
193197
run_sort
194198
run_clickbench_1
195199
run_clickbench_partitioned
200+
run_clickbench_extended
196201
;;
197202
tpch)
198203
run_tpch "1"
@@ -218,6 +223,9 @@ main() {
218223
clickbench_partitioned)
219224
run_clickbench_partitioned
220225
;;
226+
clickbench_extended)
227+
run_clickbench_extended
228+
;;
221229
*)
222230
echo "Error: unknown benchmark '$BENCHMARK' for run"
223231
usage
@@ -401,6 +409,15 @@ run_clickbench_partitioned() {
401409
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
402410
}
403411

412+
# Runs the clickbench "extended" benchmark with a single large parquet file
413+
run_clickbench_extended() {
    # Results for this benchmark are written to a JSON file under RESULTS_DIR.
    RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running clickbench (1 file) extended benchmark..."
    # CARGO_COMMAND is left unquoted on purpose: it presumably holds a
    # multi-word command that must be word-split (confirm against its definition).
    # RESULTS_FILE is quoted so a results path containing spaces is passed as one argument.
    $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
}
419+
420+
404421
compare_benchmarks() {
405422
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
406423
BRANCH1="${ARG2}"
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# ClickBench queries
2+
3+
This directory contains queries for the ClickBench benchmark https://benchmark.clickhouse.com/
4+
5+
ClickBench focuses on aggregation and filtering performance (it contains no joins).
6+
7+
## Files:
8+
* `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
9+
* `extended.sql` - "Extended" DataFusion specific queries.
10+
11+
[ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql
12+
13+
## "Extended" Queries
14+
The "extended" queries are not part of the official ClickBench benchmark.
15+
Instead they are used to test other DataFusion features that are not
16+
covered by the standard benchmark.
17+
18+
Each description below is for the corresponding line in `extended.sql` (line 1
19+
is `Q0`, line 2 is `Q1`, etc.)
20+
21+
### Q0
22+
Models initial data exploration, to understand some statistics of the data.
23+
Important Query Properties: multiple `COUNT DISTINCT` on strings
24+
25+
```sql
26+
SELECT
27+
COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel")
28+
FROM hits;
29+
```
30+
31+
32+
33+

benchmarks/queries/clickbench/README.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;

benchmarks/src/clickbench.rs

Lines changed: 41 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,14 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::path::Path;
1819
use std::{path::PathBuf, time::Instant};
1920

2021
use datafusion::{
21-
common::exec_err,
2222
error::{DataFusionError, Result},
2323
prelude::SessionContext,
2424
};
25+
use datafusion_common::exec_datafusion_err;
2526
use structopt::StructOpt;
2627

2728
use crate::{BenchmarkRun, CommonOpt};
@@ -69,15 +70,49 @@ pub struct RunOpt {
6970
output_path: Option<PathBuf>,
7071
}
7172

72-
const CLICKBENCH_QUERY_START_ID: usize = 0;
73-
const CLICKBENCH_QUERY_END_ID: usize = 42;
73+
struct AllQueries {
74+
queries: Vec<String>,
75+
}
76+
77+
impl AllQueries {
78+
fn try_new(path: &Path) -> Result<Self> {
79+
// ClickBench has all queries in a single file identified by line number
80+
let all_queries = std::fs::read_to_string(path)
81+
.map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
82+
Ok(Self {
83+
queries: all_queries.lines().map(|s| s.to_string()).collect(),
84+
})
85+
}
86+
87+
/// Returns the text of query `query_id`
88+
fn get_query(&self, query_id: usize) -> Result<&str> {
89+
self.queries
90+
.get(query_id)
91+
.ok_or_else(|| {
92+
let min_id = self.min_query_id();
93+
let max_id = self.max_query_id();
94+
exec_datafusion_err!(
95+
"Invalid query id {query_id}. Must be between {min_id} and {max_id}"
96+
)
97+
})
98+
.map(|s| s.as_str())
99+
}
100+
101+
fn min_query_id(&self) -> usize {
102+
0
103+
}
74104

105+
fn max_query_id(&self) -> usize {
106+
self.queries.len() - 1
107+
}
108+
}
75109
impl RunOpt {
76110
pub async fn run(self) -> Result<()> {
77111
println!("Running benchmarks with the following options: {self:?}");
112+
let queries = AllQueries::try_new(self.queries_path.as_path())?;
78113
let query_range = match self.query {
79114
Some(query_id) => query_id..=query_id,
80-
None => CLICKBENCH_QUERY_START_ID..=CLICKBENCH_QUERY_END_ID,
115+
None => queries.min_query_id()..=queries.max_query_id(),
81116
};
82117

83118
let config = self.common.config();
@@ -88,12 +123,12 @@ impl RunOpt {
88123
let mut benchmark_run = BenchmarkRun::new();
89124
for query_id in query_range {
90125
benchmark_run.start_new_case(&format!("Query {query_id}"));
91-
let sql = self.get_query(query_id)?;
126+
let sql = queries.get_query(query_id)?;
92127
println!("Q{query_id}: {sql}");
93128

94129
for i in 0..iterations {
95130
let start = Instant::now();
96-
let results = ctx.sql(&sql).await?.collect().await?;
131+
let results = ctx.sql(sql).await?.collect().await?;
97132
let elapsed = start.elapsed();
98133
let ms = elapsed.as_secs_f64() * 1000.0;
99134
let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
@@ -120,23 +155,4 @@ impl RunOpt {
120155
)
121156
})
122157
}
123-
124-
/// Returns the text of query `query_id`
125-
fn get_query(&self, query_id: usize) -> Result<String> {
126-
if query_id > CLICKBENCH_QUERY_END_ID {
127-
return exec_err!(
128-
"Invalid query id {query_id}. Must be between {CLICKBENCH_QUERY_START_ID} and {CLICKBENCH_QUERY_END_ID}"
129-
);
130-
}
131-
132-
let path = self.queries_path.as_path();
133-
134-
// ClickBench has all queries in a single file identified by line number
135-
let all_queries = std::fs::read_to_string(path).map_err(|e| {
136-
DataFusionError::Execution(format!("Could not open {path:?}: {e}"))
137-
})?;
138-
let all_queries: Vec<_> = all_queries.lines().collect();
139-
140-
Ok(all_queries.get(query_id).map(|s| s.to_string()).unwrap())
141-
}
142158
}

0 commit comments

Comments
 (0)