Skip to content

Commit 19c59b3

Browse files
doki23 authored and mcheshkov committed
Support for casting Utf8 and LargeUtf8 --> Interval (apache#3762)
* cast string to interval * cast string to interval * unit tests * fix * update * code clean * update unit tests and align_interval_parts * fix ut * make clippy happy * Update arrow-cast/src/parse.rs Co-authored-by: Raphael Taylor-Davies <[email protected]> * change return types of calculate_from_part and fix bug of align_interval_parts * make clippy happy * remove useless overflow check * remove the "convert to higher units" logic --------- Co-authored-by: Raphael Taylor-Davies <[email protected]> Can drop this after rebase on commit 14544fb "Support for casting Utf8 and LargeUtf8 --> Interval (apache#3762)", first released in 35.0.0
1 parent be83d41 commit 19c59b3

File tree

3 files changed

+625
-6
lines changed

3 files changed

+625
-6
lines changed

arrow/src/compute/kernels/cast.rs

Lines changed: 274 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ use std::sync::Arc;
4141
use crate::buffer::MutableBuffer;
4242
use crate::compute::kernels::arithmetic::{divide, multiply};
4343
use crate::compute::kernels::arity::unary;
44-
use crate::compute::kernels::cast_utils::string_to_timestamp_nanos;
44+
use crate::compute::kernels::cast_utils::{
45+
parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month,
46+
string_to_timestamp_nanos,
47+
};
4548
use crate::datatypes::*;
4649
use crate::error::{ArrowError, Result};
4750
use crate::{array::*, compute::take};
@@ -176,9 +179,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
176179

177180
(Utf8, LargeUtf8) => true,
178181
(LargeUtf8, Utf8) => true,
179-
(Utf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None)) => true,
182+
(Utf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None) | Interval(_)) => true,
180183
(Utf8, _) => DataType::is_numeric(to_type),
181-
(LargeUtf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None)) => true,
184+
(LargeUtf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None) | Interval(_)) => true,
182185
(LargeUtf8, _) => DataType::is_numeric(to_type),
183186
(Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true,
184187
(Date32, Utf8) | (Date32, LargeUtf8) => true,
@@ -764,6 +767,15 @@ pub fn cast_with_options(
764767
Timestamp(TimeUnit::Nanosecond, None) => {
765768
cast_string_to_timestamp_ns::<i32>(&**array, cast_options)
766769
}
770+
Interval(IntervalUnit::YearMonth) => {
771+
cast_string_to_year_month_interval::<i32>(&**array, cast_options)
772+
}
773+
Interval(IntervalUnit::DayTime) => {
774+
cast_string_to_day_time_interval::<i32>(&**array, cast_options)
775+
}
776+
Interval(IntervalUnit::MonthDayNano) => {
777+
cast_string_to_month_day_nano_interval::<i32>(&**array, cast_options)
778+
}
767779
_ => Err(ArrowError::CastError(format!(
768780
"Casting from {:?} to {:?} not supported",
769781
from_type, to_type,
@@ -898,6 +910,15 @@ pub fn cast_with_options(
898910
Timestamp(TimeUnit::Nanosecond, None) => {
899911
cast_string_to_timestamp_ns::<i64>(&**array, cast_options)
900912
}
913+
Interval(IntervalUnit::YearMonth) => {
914+
cast_string_to_year_month_interval::<i64>(&**array, cast_options)
915+
}
916+
Interval(IntervalUnit::DayTime) => {
917+
cast_string_to_day_time_interval::<i64>(&**array, cast_options)
918+
}
919+
Interval(IntervalUnit::MonthDayNano) => {
920+
cast_string_to_month_day_nano_interval::<i64>(&**array, cast_options)
921+
}
901922
_ => Err(ArrowError::CastError(format!(
902923
"Casting from {:?} to {:?} not supported",
903924
from_type, to_type,
@@ -1757,6 +1778,105 @@ fn cast_string_to_timestamp_ns<Offset: StringOffsetSizeTrait>(
17571778
Ok(Arc::new(array) as ArrayRef)
17581779
}
17591780

1781+
/// Casts a `GenericStringArray<Offset>` to an `IntervalYearMonthArray`,
/// parsing every non-null string with `parse_interval_year_month`.
///
/// Null inputs stay null. When `cast_options.safe` is set, strings that fail
/// to parse also become null; otherwise the first parse error is returned.
///
/// # Errors
/// Returns the error from `parse_interval_year_month` when
/// `cast_options.safe` is `false` and a value cannot be parsed.
///
/// # Panics
/// Panics if `array` is not a `GenericStringArray<Offset>` (the downcast is
/// unwrapped).
fn cast_string_to_year_month_interval<Offset: StringOffsetSizeTrait>(
1782+
array: &dyn Array,
1783+
cast_options: &CastOptions,
1784+
) -> Result<ArrayRef> {
1785+
let string_array = array
1786+
.as_any()
1787+
.downcast_ref::<GenericStringArray<Offset>>()
1788+
.unwrap();
1789+
let interval_array = if cast_options.safe {
// Safe mode: `.ok()` discards the parse error so the slot becomes null.
1790+
let iter = string_array
1791+
.iter()
1792+
.map(|v| v.and_then(|v| parse_interval_year_month(v).ok()));
1793+
1794+
// Benefit:
1795+
// 20% performance improvement
1796+
// Soundness:
1797+
// The iterator is trustedLen because it comes from a `StringArray`.
1798+
unsafe { IntervalYearMonthArray::from_trusted_len_iter(iter) }
1799+
} else {
// Strict mode: `transpose` turns `Option<Result<_>>` into `Result<Option<_>>`
// so that collecting into `Result<Vec<_>>` stops at the first parse error.
1800+
let vec = string_array
1801+
.iter()
1802+
.map(|v| v.map(parse_interval_year_month).transpose())
1803+
.collect::<Result<Vec<_>>>()?;
1804+
1805+
// Benefit:
1806+
// 20% performance improvement
1807+
// Soundness:
1808+
// `vec` is a fully collected `Vec`, so iterating it yields exactly `vec.len()` items.
1809+
unsafe { IntervalYearMonthArray::from_trusted_len_iter(vec) }
1810+
};
1811+
Ok(Arc::new(interval_array) as ArrayRef)
1812+
}
1813+
1814+
/// Casts a `GenericStringArray<Offset>` to an `IntervalDayTimeArray`,
/// parsing every non-null string with `parse_interval_day_time`.
///
/// Null inputs stay null. When `cast_options.safe` is set, strings that fail
/// to parse also become null; otherwise the first parse error is returned.
///
/// # Errors
/// Returns the error from `parse_interval_day_time` when
/// `cast_options.safe` is `false` and a value cannot be parsed.
///
/// # Panics
/// Panics if `array` is not a `GenericStringArray<Offset>` (the downcast is
/// unwrapped).
fn cast_string_to_day_time_interval<Offset: StringOffsetSizeTrait>(
1815+
array: &dyn Array,
1816+
cast_options: &CastOptions,
1817+
) -> Result<ArrayRef> {
1818+
let string_array = array
1819+
.as_any()
1820+
.downcast_ref::<GenericStringArray<Offset>>()
1821+
.unwrap();
1822+
let interval_array = if cast_options.safe {
// Safe mode: `.ok()` discards the parse error so the slot becomes null.
1823+
let iter = string_array
1824+
.iter()
1825+
.map(|v| v.and_then(|v| parse_interval_day_time(v).ok()));
1826+
1827+
// Benefit:
1828+
// 20% performance improvement
1829+
// Soundness:
1830+
// The iterator is trustedLen because it comes from a `StringArray`.
1831+
unsafe { IntervalDayTimeArray::from_trusted_len_iter(iter) }
1832+
} else {
// Strict mode: `transpose` turns `Option<Result<_>>` into `Result<Option<_>>`
// so that collecting into `Result<Vec<_>>` stops at the first parse error.
1833+
let vec = string_array
1834+
.iter()
1835+
.map(|v| v.map(parse_interval_day_time).transpose())
1836+
.collect::<Result<Vec<_>>>()?;
1837+
1838+
// Benefit:
1839+
// 20% performance improvement
1840+
// Soundness:
1841+
// `vec` is a fully collected `Vec`, so iterating it yields exactly `vec.len()` items.
1842+
unsafe { IntervalDayTimeArray::from_trusted_len_iter(vec) }
1843+
};
1844+
Ok(Arc::new(interval_array) as ArrayRef)
1845+
}
1846+
1847+
/// Casts a `GenericStringArray<Offset>` to an `IntervalMonthDayNanoArray`,
/// parsing every non-null string with `parse_interval_month_day_nano`.
///
/// Null inputs stay null. When `cast_options.safe` is set, strings that fail
/// to parse also become null; otherwise the first parse error is returned.
///
/// # Errors
/// Returns the error from `parse_interval_month_day_nano` when
/// `cast_options.safe` is `false` and a value cannot be parsed.
///
/// # Panics
/// Panics if `array` is not a `GenericStringArray<Offset>` (the downcast is
/// unwrapped).
fn cast_string_to_month_day_nano_interval<Offset: StringOffsetSizeTrait>(
1848+
array: &dyn Array,
1849+
cast_options: &CastOptions,
1850+
) -> Result<ArrayRef> {
1851+
let string_array = array
1852+
.as_any()
1853+
.downcast_ref::<GenericStringArray<Offset>>()
1854+
.unwrap();
1855+
let interval_array = if cast_options.safe {
// Safe mode: `.ok()` discards the parse error so the slot becomes null.
1856+
let iter = string_array
1857+
.iter()
1858+
.map(|v| v.and_then(|v| parse_interval_month_day_nano(v).ok()));
1859+
1860+
// Benefit:
1861+
// 20% performance improvement
1862+
// Soundness:
1863+
// The iterator is trustedLen because it comes from a `StringArray`.
1864+
unsafe { IntervalMonthDayNanoArray::from_trusted_len_iter(iter) }
1865+
} else {
// Strict mode: `transpose` turns `Option<Result<_>>` into `Result<Option<_>>`
// so that collecting into `Result<Vec<_>>` stops at the first parse error.
1866+
let vec = string_array
1867+
.iter()
1868+
.map(|v| v.map(parse_interval_month_day_nano).transpose())
1869+
.collect::<Result<Vec<_>>>()?;
1870+
1871+
// Benefit:
1872+
// 20% performance improvement
1873+
// Soundness:
1874+
// `vec` is a fully collected `Vec`, so iterating it yields exactly `vec.len()` items.
1875+
unsafe { IntervalMonthDayNanoArray::from_trusted_len_iter(vec) }
1876+
};
1877+
Ok(Arc::new(interval_array) as ArrayRef)
1878+
}
1879+
17601880
/// Casts Utf8 to Boolean
17611881
fn cast_utf8_to_boolean(from: &ArrayRef, cast_options: &CastOptions) -> Result<ArrayRef> {
17621882
let array = as_string_array(from);
@@ -3005,6 +3125,157 @@ mod tests {
30053125
}
30063126
}
30073127

3128+
// Test helper: builds a `StringArray` from `$data_vec`, casts it to
// `DataType::Interval($interval_unit)` with `CastOptions { safe: true }`, and
// compares the string rendering of every output row with `$expect_vec`.
//
// An expected `None` is compared as the empty string, and a failed
// `array_value_to_string` call is likewise folded to the empty string.
// NOTE(review): this presumes null rows render as "" — confirm against
// `array_value_to_string`.
//
// `$array_ty` is accepted but never referenced in the expansion.
// Only rows `0..target_interval_array.len()` are checked; `$expect_vec` must be
// at least that long or the indexing panics.
macro_rules! test_safe_string_to_interval {
3129+
($data_vec:expr, $interval_unit:expr, $array_ty:ty, $expect_vec:expr) => {
3130+
let source_string_array =
3131+
Arc::new(StringArray::from($data_vec.clone())) as ArrayRef;
3132+
3133+
let options = CastOptions { safe: true };
3134+
3135+
// Safe casts are expected to succeed as a whole, hence the `unwrap`.
let target_interval_array = cast_with_options(
3136+
&source_string_array.clone(),
3137+
&DataType::Interval($interval_unit),
3138+
&options,
3139+
)
3140+
.unwrap();
3141+
3142+
for row in 0..target_interval_array.len() {
3143+
let interval_str = array_value_to_string(&target_interval_array, row);
3144+
assert_eq!(
3145+
$expect_vec[row].unwrap_or(""),
3146+
interval_str.unwrap_or("".to_string())
3147+
);
3148+
}
3149+
};
3150+
}
3151+
3152+
// Safe cast of strings to `Interval(YearMonth)`: inputs made only of year and
// month fields are converted; inputs carrying day or second fields, and
// unparseable text, are expected to come back as null.
#[test]
3153+
fn test_cast_string_to_interval_year_month() {
3154+
test_safe_string_to_interval!(
3155+
vec![
3156+
Some("1 year 1 month"),
3157+
// 1.5 years (18 months) + 13 months = 31 months, i.e. 2 years 7 months.
Some("1.5 years 13 month"),
3158+
Some("30 days"),
3159+
Some("31 days"),
3160+
Some("2 months 31 days"),
3161+
Some("2 months 31 days 1 second"),
3162+
Some("foobar"),
3163+
],
3164+
IntervalUnit::YearMonth,
3165+
IntervalYearMonthArray,
3166+
vec![
3167+
Some("1 years 1 mons 0 days 0 hours 0 mins 0.000 secs"),
3168+
Some("2 years 7 mons 0 days 0 hours 0 mins 0.000 secs"),
3169+
None,
3170+
None,
3171+
None,
3172+
None,
3173+
None,
3174+
]
3175+
);
3176+
}
3177+
3178+
// Safe cast of strings to `Interval(DayTime)`. The expected values show year
// and month fields being expressed in days (13 months -> 390 days, 31 months
// -> 930 days), while a fractional-millisecond input and unparseable text are
// expected to come back as null.
#[test]
3179+
fn test_cast_string_to_interval_day_time() {
3180+
test_safe_string_to_interval!(
3181+
vec![
3182+
Some("1 year 1 month"),
3183+
Some("1.5 years 13 month"),
3184+
Some("30 days"),
3185+
Some("1 day 2 second 3 milliseconds"),
3186+
// 3.5 ms is not a whole number of milliseconds; expected null below.
Some("1 day 2 second 3.5 milliseconds"),
3187+
Some("foobar"),
3188+
],
3189+
IntervalUnit::DayTime,
3190+
IntervalDayTimeArray,
3191+
vec![
3192+
Some("0 years 0 mons 390 days 0 hours 0 mins 0.000 secs"),
3193+
Some("0 years 0 mons 930 days 0 hours 0 mins 0.000 secs"),
3194+
Some("0 years 0 mons 30 days 0 hours 0 mins 0.000 secs"),
3195+
Some("0 years 0 mons 1 days 0 hours 0 mins 2.003 secs"),
3196+
None,
3197+
None,
3198+
]
3199+
);
3200+
}
3201+
3202+
// Safe cast of strings to `Interval(MonthDayNano)`. Null inputs stay null and
// unparseable text becomes null. The expected strings keep months un-normalised
// (e.g. "0 years 13 mons") while sub-day fields are combined
// (e.g. "6 minutes 120 second" renders as 8 mins).
#[test]
3203+
fn test_cast_string_to_interval_month_day_nano() {
3204+
test_safe_string_to_interval!(
3205+
vec![
3206+
Some("1 year 1 month 1 day"),
3207+
None,
3208+
Some("1.5 years 13 month 35 days 1.4 milliseconds"),
3209+
Some("3 days"),
3210+
Some("8 seconds"),
3211+
None,
3212+
Some("1 day 29800 milliseconds"),
3213+
Some("3 months 1 second"),
3214+
Some("6 minutes 120 second"),
3215+
// 1 min + 83 s + 399.222 s = 542.222 s, i.e. 9 mins 2.222 secs.
Some("2 years 39 months 9 days 19 hours 1 minute 83 seconds 399222 milliseconds"),
3216+
Some("foobar"),
3217+
],
3218+
IntervalUnit::MonthDayNano,
3219+
IntervalMonthDayNanoArray,
3220+
vec![
3221+
Some("0 years 13 mons 1 days 0 hours 0 mins 0.000000000 secs"),
3222+
None,
3223+
Some("0 years 31 mons 35 days 0 hours 0 mins 0.001400000 secs"),
3224+
Some("0 years 0 mons 3 days 0 hours 0 mins 0.000000000 secs"),
3225+
Some("0 years 0 mons 0 days 0 hours 0 mins 8.000000000 secs"),
3226+
None,
3227+
Some("0 years 0 mons 1 days 0 hours 0 mins 29.800000000 secs"),
3228+
Some("0 years 3 mons 0 days 0 hours 0 mins 1.000000000 secs"),
3229+
Some("0 years 0 mons 0 days 0 hours 8 mins 0.000000000 secs"),
3230+
Some("0 years 63 mons 9 days 19 hours 9 mins 2.222000000 secs"),
3231+
None,
3232+
]
3233+
);
3234+
}
3235+
3236+
// Test helper: casts `$data_vec` (as a `StringArray`) to
// `DataType::Interval($interval_unit)` with `CastOptions { safe: false }`,
// requires the cast to fail, and checks that the error's `to_string()` output
// equals `$error_msg` exactly.
macro_rules! test_unsafe_string_to_interval_err {
3237+
($data_vec:expr, $interval_unit:expr, $error_msg:expr) => {
3238+
let string_array = Arc::new(StringArray::from($data_vec.clone())) as ArrayRef;
3239+
let options = CastOptions { safe: false };
3240+
// `unwrap_err` panics (failing the test) if the cast unexpectedly succeeds.
let arrow_err = cast_with_options(
3241+
&string_array.clone(),
3242+
&DataType::Interval($interval_unit),
3243+
&options,
3244+
)
3245+
.unwrap_err();
3246+
assert_eq!($error_msg, arrow_err.to_string());
3247+
};
3248+
}
3249+
3250+
// Non-safe casts must surface the parser's error instead of producing nulls.
// The first three cases feed unparseable text to each interval unit; the last
// two feed well-formed intervals that the target unit cannot represent.
#[test]
3251+
fn test_cast_string_to_interval_err() {
3252+
test_unsafe_string_to_interval_err!(
3253+
vec![Some("foobar")],
3254+
IntervalUnit::YearMonth,
3255+
r#"Not yet implemented: Unsupported Interval Expression with value "foobar""#
3256+
);
3257+
test_unsafe_string_to_interval_err!(
3258+
vec![Some("foobar")],
3259+
IntervalUnit::DayTime,
3260+
r#"Not yet implemented: Unsupported Interval Expression with value "foobar""#
3261+
);
3262+
test_unsafe_string_to_interval_err!(
3263+
vec![Some("foobar")],
3264+
IntervalUnit::MonthDayNano,
3265+
r#"Not yet implemented: Unsupported Interval Expression with value "foobar""#
3266+
);
3267+
// Day and second fields are rejected for YearMonth.
test_unsafe_string_to_interval_err!(
3268+
vec![Some("2 months 31 days 1 second")],
3269+
IntervalUnit::YearMonth,
3270+
r#"Cast error: Cannot cast 2 months 31 days 1 second to IntervalYearMonth. Only year and month fields are allowed."#
3271+
);
3272+
// A fractional millisecond is rejected for DayTime.
test_unsafe_string_to_interval_err!(
3273+
vec![Some("1 day 1.5 milliseconds")],
3274+
IntervalUnit::DayTime,
3275+
r#"Cast error: Cannot cast 1 day 1.5 milliseconds to IntervalDayTime because the nanos part isn't multiple of milliseconds"#
3276+
);
3277+
}
3278+
30083279
#[test]
30093280
fn test_cast_date32_to_int32() {
30103281
let a = Date32Array::from(vec![10000, 17890]);

0 commit comments

Comments
 (0)