Skip to content

Commit 06631c2

Browse files
authored
feat: mapping sql Char/Text/String default to Utf8View (#16290)
* feat: mapping sql Char/Text/String default to Utf8View
* Add support utf8view for sort merge join
* fix binary utf8view union with int32
* fix slt order
* fix
* fmt
* Fix test
* fix
* fix test
* fix
* clean
* Address comments
* Fix test
* support md5 for utf8view
1 parent 324a271 commit 06631c2

File tree

25 files changed

+170
-194
lines changed

25 files changed

+170
-194
lines changed

datafusion/common/src/config.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,10 +259,10 @@ config_namespace! {
259259
/// string length and thus DataFusion can not enforce such limits.
260260
pub support_varchar_with_length: bool, default = true
261261

262-
/// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning.
263-
/// If false, `VARCHAR` is mapped to `Utf8` during SQL planning.
264-
/// Default is false.
265-
pub map_varchar_to_utf8view: bool, default = true
262+
/// If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning.
263+
/// If false, they are mapped to `Utf8`.
264+
/// Default is true.
265+
pub map_string_types_to_utf8view: bool, default = true
266266

267267
/// When set to true, the source locations relative to the original SQL
268268
/// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected

datafusion/core/src/execution/session_state.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ impl SessionState {
494494
enable_options_value_normalization: sql_parser_options
495495
.enable_options_value_normalization,
496496
support_varchar_with_length: sql_parser_options.support_varchar_with_length,
497-
map_varchar_to_utf8view: sql_parser_options.map_varchar_to_utf8view,
497+
map_string_types_to_utf8view: sql_parser_options.map_string_types_to_utf8view,
498498
collect_spans: sql_parser_options.collect_spans,
499499
}
500500
}

datafusion/core/tests/sql/create_drop.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ async fn create_external_table_with_ddl() -> Result<()> {
6161
assert_eq!(3, table_schema.fields().len());
6262

6363
assert_eq!(&DataType::Int32, table_schema.field(0).data_type());
64-
assert_eq!(&DataType::Utf8, table_schema.field(1).data_type());
64+
assert_eq!(&DataType::Utf8View, table_schema.field(1).data_type());
6565
assert_eq!(&DataType::Boolean, table_schema.field(2).data_type());
6666

6767
Ok(())

datafusion/core/tests/user_defined/user_defined_scalar_functions.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1181,7 +1181,7 @@ async fn create_scalar_function_from_sql_statement_postgres_syntax() -> Result<(
11811181
quote_style: None,
11821182
span: Span::empty(),
11831183
}),
1184-
data_type: DataType::Utf8,
1184+
data_type: DataType::Utf8View,
11851185
default_expr: None,
11861186
}]),
11871187
return_type: Some(DataType::Int32),

datafusion/expr-common/src/type_coercion/binary.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ pub fn type_union_resolution(data_types: &[DataType]) -> Option<DataType> {
462462

463463
// If all the data_types are null, return string
464464
if data_types.iter().all(|t| t == &DataType::Null) {
465-
return Some(DataType::Utf8);
465+
return Some(DataType::Utf8View);
466466
}
467467

468468
// Ignore Nulls, if any data_type category is not the same, return None
@@ -1202,7 +1202,8 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
12021202
fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
12031203
use arrow::datatypes::DataType::*;
12041204
match (lhs_type, rhs_type) {
1205-
(Utf8 | LargeUtf8, other_type) | (other_type, Utf8 | LargeUtf8)
1205+
(Utf8 | LargeUtf8 | Utf8View, other_type)
1206+
| (other_type, Utf8 | LargeUtf8 | Utf8View)
12061207
if other_type.is_numeric() =>
12071208
{
12081209
Some(other_type.clone())

datafusion/functions/src/crypto/basic.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use arrow::array::{
2121
Array, ArrayRef, BinaryArray, BinaryArrayType, BinaryViewArray, GenericBinaryArray,
2222
OffsetSizeTrait,
2323
};
24-
use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray};
24+
use arrow::array::{AsArray, GenericStringArray, StringViewArray};
2525
use arrow::datatypes::DataType;
2626
use blake2::{Blake2b512, Blake2s256, Digest};
2727
use blake3::Hasher as Blake3;
@@ -169,18 +169,18 @@ pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
169169
let [data] = take_function_args("md5", args)?;
170170
let value = digest_process(data, DigestAlgorithm::Md5)?;
171171

172-
// md5 requires special handling because of its unique utf8 return type
172+
// md5 requires special handling because of its unique utf8view return type
173173
Ok(match value {
174174
ColumnarValue::Array(array) => {
175175
let binary_array = as_binary_array(&array)?;
176-
let string_array: StringArray = binary_array
176+
let string_array: StringViewArray = binary_array
177177
.iter()
178178
.map(|opt| opt.map(hex_encode::<_>))
179179
.collect();
180180
ColumnarValue::Array(Arc::new(string_array))
181181
}
182182
ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
183-
ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>)))
183+
ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode::<_>)))
184184
}
185185
_ => return exec_err!("Impossibly got invalid results from digest"),
186186
})

datafusion/functions/src/crypto/md5.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,12 @@ impl ScalarUDFImpl for Md5Func {
9292
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
9393
use DataType::*;
9494
Ok(match &arg_types[0] {
95-
LargeUtf8 | LargeBinary => Utf8,
96-
Utf8View | Utf8 | Binary | BinaryView => Utf8,
95+
LargeUtf8 | LargeBinary => Utf8View,
96+
Utf8View | Utf8 | Binary | BinaryView => Utf8View,
9797
Null => Null,
9898
Dictionary(_, t) => match **t {
99-
LargeUtf8 | LargeBinary => Utf8,
100-
Utf8 | Binary | BinaryView => Utf8,
99+
LargeUtf8 | LargeBinary => Utf8View,
100+
Utf8 | Binary | BinaryView => Utf8View,
101101
Null => Null,
102102
_ => {
103103
return plan_err!(

datafusion/physical-plan/src/joins/sort_merge_join.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2492,6 +2492,7 @@ fn compare_join_arrays(
24922492
DataType::Float32 => compare_value!(Float32Array),
24932493
DataType::Float64 => compare_value!(Float64Array),
24942494
DataType::Utf8 => compare_value!(StringArray),
2495+
DataType::Utf8View => compare_value!(StringViewArray),
24952496
DataType::LargeUtf8 => compare_value!(LargeStringArray),
24962497
DataType::Decimal128(..) => compare_value!(Decimal128Array),
24972498
DataType::Timestamp(time_unit, None) => match time_unit {
@@ -2559,6 +2560,7 @@ fn is_join_arrays_equal(
25592560
DataType::Float32 => compare_value!(Float32Array),
25602561
DataType::Float64 => compare_value!(Float64Array),
25612562
DataType::Utf8 => compare_value!(StringArray),
2563+
DataType::Utf8View => compare_value!(StringViewArray),
25622564
DataType::LargeUtf8 => compare_value!(LargeStringArray),
25632565
DataType::Decimal128(..) => compare_value!(Decimal128Array),
25642566
DataType::Timestamp(time_unit, None) => match time_unit {

datafusion/sql/src/planner.rs

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ pub struct ParserOptions {
5252
pub enable_options_value_normalization: bool,
5353
/// Whether to collect spans
5454
pub collect_spans: bool,
55-
/// Whether `VARCHAR` is mapped to `Utf8View` during SQL planning.
56-
pub map_varchar_to_utf8view: bool,
55+
/// Whether string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning.
56+
pub map_string_types_to_utf8view: bool,
5757
}
5858

5959
impl ParserOptions {
@@ -72,7 +72,7 @@ impl ParserOptions {
7272
parse_float_as_decimal: false,
7373
enable_ident_normalization: true,
7474
support_varchar_with_length: true,
75-
map_varchar_to_utf8view: true,
75+
map_string_types_to_utf8view: true,
7676
enable_options_value_normalization: false,
7777
collect_spans: false,
7878
}
@@ -112,9 +112,9 @@ impl ParserOptions {
112112
self
113113
}
114114

115-
/// Sets the `map_varchar_to_utf8view` option.
116-
pub fn with_map_varchar_to_utf8view(mut self, value: bool) -> Self {
117-
self.map_varchar_to_utf8view = value;
115+
/// Sets the `map_string_types_to_utf8view` option.
116+
pub fn with_map_string_types_to_utf8view(mut self, value: bool) -> Self {
117+
self.map_string_types_to_utf8view = value;
118118
self
119119
}
120120

@@ -143,7 +143,7 @@ impl From<&SqlParserOptions> for ParserOptions {
143143
parse_float_as_decimal: options.parse_float_as_decimal,
144144
enable_ident_normalization: options.enable_ident_normalization,
145145
support_varchar_with_length: options.support_varchar_with_length,
146-
map_varchar_to_utf8view: options.map_varchar_to_utf8view,
146+
map_string_types_to_utf8view: options.map_string_types_to_utf8view,
147147
enable_options_value_normalization: options
148148
.enable_options_value_normalization,
149149
collect_spans: options.collect_spans,
@@ -577,7 +577,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
577577
please set `support_varchar_with_length` to be true"
578578
),
579579
_ => {
580-
if self.options.map_varchar_to_utf8view {
580+
if self.options.map_string_types_to_utf8view {
581581
Ok(DataType::Utf8View)
582582
} else {
583583
Ok(DataType::Utf8)
@@ -601,7 +601,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
601601
)
602602
}
603603
SQLDataType::Char(_) | SQLDataType::Text | SQLDataType::String(_) => {
604-
Ok(DataType::Utf8)
604+
if self.options.map_string_types_to_utf8view {
605+
Ok(DataType::Utf8View)
606+
} else {
607+
Ok(DataType::Utf8)
608+
}
605609
}
606610
SQLDataType::Timestamp(precision, tz_info)
607611
if precision.is_none() || [0, 3, 6, 9].contains(&precision.unwrap()) =>

datafusion/sql/tests/cases/params.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -746,31 +746,31 @@ fn test_prepare_statement_to_plan_multi_params() {
746746
assert_snapshot!(
747747
plan,
748748
@r#"
749-
Prepare: "my_plan" [Int32, Utf8, Float64, Int32, Float64, Utf8]
749+
Prepare: "my_plan" [Int32, Utf8View, Float64, Int32, Float64, Utf8View]
750750
Projection: person.id, person.age, $6
751751
Filter: person.age IN ([$1, $4]) AND person.salary > $3 AND person.salary < $5 OR person.first_name < $2
752752
TableScan: person
753753
"#
754754
);
755-
assert_snapshot!(dt, @r#"[Int32, Utf8, Float64, Int32, Float64, Utf8]"#);
755+
assert_snapshot!(dt, @r#"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"#);
756756

757757
///////////////////
758758
// replace params with values
759759
let param_values = vec![
760760
ScalarValue::Int32(Some(10)),
761-
ScalarValue::from("abc"),
761+
ScalarValue::Utf8View(Some("abc".into())),
762762
ScalarValue::Float64(Some(100.0)),
763763
ScalarValue::Int32(Some(20)),
764764
ScalarValue::Float64(Some(200.0)),
765-
ScalarValue::from("xyz"),
765+
ScalarValue::Utf8View(Some("xyz".into())),
766766
];
767767

768768
let plan_with_params = plan.with_param_values(param_values).unwrap();
769769
assert_snapshot!(
770770
plan_with_params,
771771
@r#"
772-
Projection: person.id, person.age, Utf8("xyz") AS $6
773-
Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8("abc")
772+
Projection: person.id, person.age, Utf8View("xyz") AS $6
773+
Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8View("abc")
774774
TableScan: person
775775
"#
776776
);

0 commit comments

Comments
 (0)