diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index 6fcb9c6f0840..7b367174006d 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -32,7 +32,7 @@ use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; /// /// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`, /// -/// If the input type is `Utf8View` the return type is `Utf8View`, +/// If the input type is `Utf8View` the return type is `$utf8Type`, macro_rules! get_optimal_return_type { ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => { pub(crate) fn $FUNC(arg_type: &DataType, name: &str) -> Result { @@ -41,8 +41,8 @@ macro_rules! get_optimal_return_type { DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, // Binary inputs are automatically coerced to Utf8 DataType::Utf8 | DataType::Binary => $utf8Type, - // Utf8View inputs will yield Utf8View outputs - DataType::Utf8View => DataType::Utf8View, + // Utf8View max offset size is u32::MAX, the same as UTF8 + DataType::Utf8View | DataType::BinaryView => $utf8Type, DataType::Null => DataType::Null, DataType::Dictionary(_, value_type) => match **value_type { DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, @@ -183,6 +183,21 @@ pub mod test { }; } + use arrow::datatypes::DataType; #[allow(unused_imports)] pub(crate) use test_function; + + use super::*; + + #[test] + fn string_to_int_type() { + let v = utf8_to_int_type(&DataType::Utf8, "test").unwrap(); + assert_eq!(v, DataType::Int32); + + let v = utf8_to_int_type(&DataType::Utf8View, "test").unwrap(); + assert_eq!(v, DataType::Int32); + + let v = utf8_to_int_type(&DataType::LargeUtf8, "test").unwrap(); + assert_eq!(v, DataType::Int64); + } } diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 8169401874e6..1150d059e7d5 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ 
b/datafusion/physical-plan/src/coalesce_batches.rs @@ -270,14 +270,14 @@ impl RecordBatchStream for CoalesceBatchesStream { } } -/// Heuristically compact [`StringViewArray`]s to reduce memory usage, if needed +/// Heuristically compact `StringViewArray`s to reduce memory usage, if needed /// /// This function decides when to consolidate the StringView into a new buffer /// to reduce memory usage and improve string locality for better performance. /// -/// This differs from [`StringViewArray::gc`] because: +/// This differs from `StringViewArray::gc` because: /// 1. It may not compact the array depending on a heuristic. -/// 2. It uses a larger default block size (2MB) to reduce the number of buffers to track. +/// 2. It uses a precise block size to reduce the number of buffers to track. /// /// # Heuristic ///