Skip to content

Commit dd8f41e

Browse files
committed
Support automatic Binary/LargeBinary --> Utf8 coercion
1 parent 2a4e7f7 commit dd8f41e

File tree

3 files changed

+21
-8
lines changed

3 files changed

+21
-8
lines changed

datafusion/expr/src/built_in_function.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,10 +1558,9 @@ macro_rules! make_utf8_to_return_type {
15581558
}
15591559
};
15601560
}
1561-
1562-
/// `utf8_to_str_type` returns either a Utf8 or LargeUtf8 based on the input type size.
1561+
// `utf8_to_str_type` returns either a Utf8 or LargeUtf8 based on the input type size.
15631562
make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);
1564-
/// `utf8_to_str_type` returns either a Int32 or Int64 based on the input type size.
1563+
// `utf8_to_int_type` returns either an Int32 or Int64 based on the input type size.
15651564
make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
15661565

15671566
fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) -> Result<DataType> {

datafusion/expr/src/type_coercion/binary.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -650,13 +650,17 @@ fn string_concat_internal_coercion(
650650

651651
/// Coercion rules for Strings: the type that both lhs and rhs can be
652652
/// cast to for the purpose of a string computation
653+
///
654+
/// Note this also permits coercing `Binary` and `LargeBinary` types to
655+
/// `Utf8` and `LargeUtf8`, which will actually generate an error at runtime
656+
/// if the binary field holds invalid Utf8 data
653657
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
654658
use arrow::datatypes::DataType::*;
655659
match (lhs_type, rhs_type) {
656-
(Utf8, Utf8) => Some(Utf8),
657-
(LargeUtf8, Utf8) => Some(LargeUtf8),
658-
(Utf8, LargeUtf8) => Some(LargeUtf8),
659-
(LargeUtf8, LargeUtf8) => Some(LargeUtf8),
660+
(Utf8, Utf8 | Binary) => Some(Utf8),
661+
(Utf8 | Binary, LargeUtf8) => Some(LargeUtf8),
662+
(LargeUtf8 | LargeBinary, Utf8 | Binary) => Some(LargeUtf8),
663+
(LargeUtf8 | LargeBinary, LargeUtf8 | LargeBinary) => Some(LargeUtf8),
660664
// TODO: cast between array elements (#6558)
661665
(List(_), List(_)) => Some(lhs_type.clone()),
662666
(List(_), _) => Some(lhs_type.clone()),

datafusion/sqllogictest/test_files/binary.slt

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,8 +221,9 @@ NULL
221221
query error DataFusion error: type_coercion
222222
SELECT binary FROM t where binary LIKE '%F';
223223

224-
query error DataFusion error: type_coercion
224+
query ?
225225
SELECT largebinary FROM t where largebinary LIKE '%F';
226+
----
226227

227228

228229
# character_length function
@@ -239,6 +240,15 @@ NULL NULL NULL NULL
239240
Bar 3 Bar 3
240241
FooBar 6 FooBar 6
241242

243+
query I
244+
SELECT character_length(X'20');
245+
----
246+
1
247+
248+
# still errors on values that cannot be coerced to UTF-8
249+
query error Encountered non UTF\-8 data: invalid utf\-8 sequence of 1 bytes from index 0
250+
SELECT character_length(X'c328');
251+
242252
# regexp_replace
243253
query TTTT
244254
SELECT

0 commit comments

Comments
 (0)