-
Notifications
You must be signed in to change notification settings - Fork 1.8k
feat: Add a configuration to make parquet encryption optional #16649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e668b99
d38dba4
5a2b456
c972676
ec3f828
3538a27
a754992
e430672
7fcba70
d6b1fca
3353186
e4bc0e3
f52e79c
5615ac8
61bc78e
a81855f
4cf12b3
f29bec3
86fe04b
d4ea63f
0fcc4a5
86db3a5
b34441a
668d728
ec1e8da
e233408
9ffaae4
8e244e9
2871d51
c405167
506801e
3058a90
e7e521a
bbeecfe
4ceb072
c998378
7780b33
219d0b3
3af85bc
3255293
6a77842
9972ecb
068e65d
ecd5f93
d79fdf2
f3e6945
a7005a3
48166f4
29821d4
6add82c
0129130
511c94b
efd38d1
69f7c57
81375dc
7fcfd4d
5f0f1b0
6de2af0
464e74d
b0cad10
321596f
411d0a5
3414141
842aa89
60d37ac
cca1cd5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| // Support optional features for encryption in Parquet files. | ||
| //! This module provides types and functions related to encryption in Parquet files. | ||
| #[cfg(feature = "parquet_encryption")] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than having so may
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. That's an interesting idea, but I will stick with the current approach for now.
|
||
| pub use parquet::encryption::decrypt::FileDecryptionProperties; | ||
| #[cfg(feature = "parquet_encryption")] | ||
| pub use parquet::encryption::encrypt::FileEncryptionProperties; | ||
|
|
||
| #[cfg(not(feature = "parquet_encryption"))] | ||
| pub struct FileDecryptionProperties; | ||
| #[cfg(not(feature = "parquet_encryption"))] | ||
| pub struct FileEncryptionProperties; | ||
|
|
||
| #[cfg(feature = "parquet")] | ||
| use crate::config::ParquetEncryptionOptions; | ||
| pub use crate::config::{ConfigFileDecryptionProperties, ConfigFileEncryptionProperties}; | ||
| #[cfg(feature = "parquet")] | ||
| use parquet::file::properties::WriterPropertiesBuilder; | ||
|
|
||
| #[cfg(feature = "parquet")] | ||
| pub fn add_crypto_to_writer_properties( | ||
| #[allow(unused)] crypto: &ParquetEncryptionOptions, | ||
| #[allow(unused_mut)] mut builder: WriterPropertiesBuilder, | ||
| ) -> WriterPropertiesBuilder { | ||
| #[cfg(feature = "parquet_encryption")] | ||
| if let Some(file_encryption_properties) = &crypto.file_encryption { | ||
| builder = builder | ||
| .with_file_encryption_properties(file_encryption_properties.clone().into()); | ||
| } | ||
| builder | ||
| } | ||
|
|
||
| #[cfg(feature = "parquet_encryption")] | ||
| pub fn map_encryption_to_config_encryption( | ||
| encryption: Option<&FileEncryptionProperties>, | ||
| ) -> Option<ConfigFileEncryptionProperties> { | ||
| encryption.map(|fe| fe.into()) | ||
| } | ||
|
|
||
| #[cfg(not(feature = "parquet_encryption"))] | ||
| pub fn map_encryption_to_config_encryption( | ||
| _encryption: Option<&FileEncryptionProperties>, | ||
| ) -> Option<ConfigFileEncryptionProperties> { | ||
| None | ||
| } | ||
|
|
||
| #[cfg(feature = "parquet_encryption")] | ||
| pub fn map_config_decryption_to_decryption( | ||
| decryption: Option<&ConfigFileDecryptionProperties>, | ||
| ) -> Option<FileDecryptionProperties> { | ||
| decryption.map(|fd| fd.clone().into()) | ||
| } | ||
|
|
||
| #[cfg(not(feature = "parquet_encryption"))] | ||
| pub fn map_config_decryption_to_decryption( | ||
| _decryption: Option<&ConfigFileDecryptionProperties>, | ||
| ) -> Option<FileDecryptionProperties> { | ||
| None | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -61,13 +61,21 @@ default = [ | |||
| "unicode_expressions", | ||||
| "compression", | ||||
| "parquet", | ||||
| "parquet_encryption", | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this really be a default feature? It's still marked experimental in arrow-rs.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure. I set it to the default so that users can have this feature enabled as per the updated docs and available in the CLI.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably a question for @alamb
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would personally prefer if it was not part of the default features but given it is already on by default on main, I think it is fine to leave it on by default in this PR We can discuss disabling the feature by default as a follow on ticket / PR perhaps Here are some notes I wrote about how to make it a non default feature:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this would make for a good follow-up PR. Thanks for the helpful notes.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've made an issue for this for further discussion: #16777 |
||||
| "recursive_protection", | ||||
| ] | ||||
| encoding_expressions = ["datafusion-functions/encoding_expressions"] | ||||
| # Used for testing ONLY: causes all values to hash to the same value (test for collisions) | ||||
| force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"] | ||||
| math_expressions = ["datafusion-functions/math_expressions"] | ||||
| parquet = ["datafusion-common/parquet", "dep:parquet", "datafusion-datasource-parquet"] | ||||
| parquet_encryption = [ | ||||
| "dep:parquet", | ||||
| "parquet/encryption", | ||||
| "datafusion-common/parquet_encryption", | ||||
| "datafusion-datasource-parquet/parquet_encryption", | ||||
| "dep:hex", | ||||
| ] | ||||
| pyarrow = ["datafusion-common/pyarrow", "parquet"] | ||||
| regex_expressions = [ | ||||
| "datafusion-functions/regex_expressions", | ||||
|
|
@@ -127,6 +135,7 @@ datafusion-session = { workspace = true } | |||
| datafusion-sql = { workspace = true } | ||||
| flate2 = { version = "1.1.2", optional = true } | ||||
| futures = { workspace = true } | ||||
| hex = { workspace = true, optional = true } | ||||
| itertools = { workspace = true } | ||||
| log = { workspace = true } | ||||
| object_store = { workspace = true } | ||||
|
|
||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,7 @@ use crate::test_util::{aggr_test_schema, arrow_test_data}; | |
| use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; | ||
| use arrow::datatypes::{DataType, Field, Schema}; | ||
| use arrow::record_batch::RecordBatch; | ||
| #[cfg(feature = "compression")] | ||
| use datafusion_common::DataFusionError; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is only used by compression. Got warning via Figured I may as well fix it while I am here. |
||
| use datafusion_datasource::source::DataSourceExec; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we can move all the crypto code into its own module, so we only have to
#cfgthe module rather than so many individual lines 🤔There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wasn't able to do this entirely, but I have refactored most of this into methods in encryption.rs, which cleans this up more.