Skip to content

Commit 6546479

Browse files
authored
Add JOB benchmark dataset [1/N] (imdb dataset) (#12497)
* imdb dataset * cargo fmt * we should also extract the tar after download * we should not skip last col
1 parent ba8a759 commit 6546479

File tree

5 files changed

+450
-0
lines changed

5 files changed

+450
-0
lines changed

benchmarks/bench.sh

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ main() {
142142
data_tpch "10"
143143
data_clickbench_1
144144
data_clickbench_partitioned
145+
data_imdb
145146
;;
146147
tpch)
147148
data_tpch "1"
@@ -166,6 +167,9 @@ main() {
166167
clickbench_extended)
167168
data_clickbench_1
168169
;;
170+
imdb)
171+
data_imdb
172+
;;
169173
*)
170174
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
171175
usage
@@ -430,6 +434,85 @@ run_clickbench_extended() {
430434
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
431435
}
432436

437+
# Downloads the IMDB dataset (csv.gz files) from Peter Boncz's homepage
# (one of the JOB paper authors) and converts it to parquet:
# http://homepages.cwi.nl/~boncz/job/imdb.tgz
data_imdb() {
    local imdb_dir="${DATA_DIR}/imdb"
    local imdb_temp_gz="${imdb_dir}/imdb.tgz"
    local imdb_url="https://homepages.cwi.nl/~boncz/job/imdb.tgz"

    # imdb has 21 files, we just separate them into 3 groups for better readability
    local first_required_files=(
        "aka_name.parquet"
        "aka_title.parquet"
        "cast_info.parquet"
        "char_name.parquet"
        "comp_cast_type.parquet"
        "company_name.parquet"
        "company_type.parquet"
    )

    local second_required_files=(
        "complete_cast.parquet"
        "info_type.parquet"
        "keyword.parquet"
        "kind_type.parquet"
        "link_type.parquet"
        "movie_companies.parquet"
        "movie_info.parquet"
    )

    local third_required_files=(
        "movie_info_idx.parquet"
        "movie_keyword.parquet"
        "movie_link.parquet"
        "name.parquet"
        "person_info.parquet"
        "role_type.parquet"
        "title.parquet"
    )

    # Combine the three arrays into one
    local required_files=("${first_required_files[@]}" "${second_required_files[@]}" "${third_required_files[@]}")

    # Create directory if it doesn't exist
    mkdir -p "${imdb_dir}"

    # Conversion is needed if any of the required parquet files is missing
    local convert_needed=false
    for file in "${required_files[@]}"; do
        if [ ! -f "${imdb_dir}/${file}" ]; then
            convert_needed=true
            break
        fi
    done

    if [ "$convert_needed" = true ]; then
        # Download the archive only if it is not already present.
        # --fail makes curl return a non-zero exit status on HTTP errors
        # instead of saving the error page as the archive.
        if [ ! -f "${imdb_temp_gz}" ]; then
            echo "Downloading IMDB dataset..."
            curl --fail -o "${imdb_temp_gz}" "${imdb_url}"
        else
            echo "IMDB.tgz already exists."
        fi

        # Extract the dataset and convert the csv files to parquet
        # (previously duplicated verbatim in both branches of the if/else)
        tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
        $CARGO_COMMAND --bin imdb -- convert --input "${imdb_dir}" --output "${imdb_dir}" --format parquet
        echo "IMDB dataset downloaded and extracted."
    else
        echo "IMDB dataset already exists and contains required parquet files."
    fi
}
512+
513+
514+
515+
433516
compare_benchmarks() {
434517
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
435518
BRANCH1="$1"

benchmarks/src/bin/imdb.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! IMDB binary entrypoint
19+
20+
use datafusion::error::Result;
21+
use datafusion_benchmarks::imdb;
22+
use structopt::StructOpt;
23+
24+
// The two optional allocator features are mutually exclusive: only one
// `#[global_allocator]` may exist per binary, so fail the build early with
// a clear message instead of a confusing duplicate-item error.
#[cfg(all(feature = "snmalloc", feature = "mimalloc"))]
compile_error!(
    "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time"
);

// Use snmalloc as the global allocator when that feature is enabled.
#[cfg(feature = "snmalloc")]
#[global_allocator]
static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;

// Use mimalloc as the global allocator when that feature is enabled.
#[cfg(feature = "mimalloc")]
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
36+
37+
/// Command-line interface of the IMDB benchmark helper binary.
#[derive(Debug, StructOpt)]
#[structopt(name = "IMDB", about = "IMDB Dataset Processing.")]
enum ImdbOpt {
    /// Convert the raw IMDB csv files into csv or parquet output
    Convert(imdb::ConvertOpt),
}
42+
43+
#[tokio::main]
44+
pub async fn main() -> Result<()> {
45+
env_logger::init();
46+
match ImdbOpt::from_args() {
47+
ImdbOpt::Convert(opt) => opt.run().await,
48+
}
49+
}

benchmarks/src/imdb/convert.rs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use datafusion::dataframe::DataFrameWriteOptions;
19+
use datafusion_common::instant::Instant;
20+
use std::path::PathBuf;
21+
22+
use datafusion::error::Result;
23+
use datafusion::prelude::*;
24+
use structopt::StructOpt;
25+
26+
use datafusion::common::not_impl_err;
27+
28+
use super::get_imdb_table_schema;
29+
use super::IMDB_TABLES;
30+
31+
/// Options for converting the raw IMDB csv files into csv or parquet output.
#[derive(Debug, StructOpt)]
pub struct ConvertOpt {
    /// Path to csv files
    #[structopt(parse(from_os_str), required = true, short = "i", long = "input")]
    input_path: PathBuf,

    /// Output path
    #[structopt(parse(from_os_str), required = true, short = "o", long = "output")]
    output_path: PathBuf,

    /// Output file format: `csv` or `parquet`
    #[structopt(short = "f", long = "format")]
    file_format: String,

    /// Batch size when reading CSV or Parquet files
    #[structopt(short = "s", long = "batch-size", default_value = "8192")]
    batch_size: usize,
}
49+
50+
impl ConvertOpt {
51+
pub async fn run(self) -> Result<()> {
52+
let input_path = self.input_path.to_str().unwrap();
53+
let output_path = self.output_path.to_str().unwrap();
54+
55+
for table in IMDB_TABLES {
56+
let start = Instant::now();
57+
let schema = get_imdb_table_schema(table);
58+
59+
let input_path = format!("{input_path}/{table}.csv");
60+
let output_path = format!("{output_path}/{table}.parquet");
61+
let options = CsvReadOptions::new()
62+
.schema(&schema)
63+
.has_header(false)
64+
.delimiter(b',')
65+
.escape(b'\\')
66+
.file_extension(".csv");
67+
68+
let config = SessionConfig::new().with_batch_size(self.batch_size);
69+
let ctx = SessionContext::new_with_config(config);
70+
71+
let mut csv = ctx.read_csv(&input_path, options).await?;
72+
73+
// Select all apart from the padding column
74+
let selection = csv
75+
.schema()
76+
.iter()
77+
.take(schema.fields.len())
78+
.map(Expr::from)
79+
.collect();
80+
81+
csv = csv.select(selection)?;
82+
83+
println!(
84+
"Converting '{}' to {} files in directory '{}'",
85+
&input_path, self.file_format, &output_path
86+
);
87+
match self.file_format.as_str() {
88+
"csv" => {
89+
csv.write_csv(
90+
output_path.as_str(),
91+
DataFrameWriteOptions::new(),
92+
None,
93+
)
94+
.await?;
95+
}
96+
"parquet" => {
97+
csv.write_parquet(
98+
output_path.as_str(),
99+
DataFrameWriteOptions::new(),
100+
None,
101+
)
102+
.await?;
103+
}
104+
other => {
105+
return not_impl_err!("Invalid output format: {other}");
106+
}
107+
}
108+
println!("Conversion completed in {} ms", start.elapsed().as_millis());
109+
}
110+
Ok(())
111+
}
112+
}

0 commit comments

Comments
 (0)