Skip to content

Commit 15c65ee

Browse files
authored
Fix interval to use consistent units and arrow parser (#5806)
* Fix `interval` to use consistent units and arrow parser
* cleanup
* update datafusion-cli
1 parent b87871f commit 15c65ee

File tree

12 files changed

+237
-434
lines changed

12 files changed

+237
-434
lines changed

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ pub use error::{
3939
field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError,
4040
SharedResult,
4141
};
42-
pub use parsers::parse_interval;
4342
pub use scalar::{ScalarType, ScalarValue};
4443
pub use stats::{ColumnStatistics, Statistics};
4544
pub use table_reference::{OwnedTableReference, ResolvedTableReference, TableReference};

datafusion/common/src/parsers.rs

Lines changed: 0 additions & 338 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,9 @@
1818
//! Interval parsing logic
1919
use sqlparser::parser::ParserError;
2020

21-
use crate::{DataFusionError, Result, ScalarValue};
2221
use std::result;
2322
use std::str::FromStr;
2423

25-
/// Number of seconds in one hour, stored as `f64` so it can be multiplied directly
/// with fractional interval amounts.
const SECONDS_PER_HOUR: f64 = 3_600_f64;
26-
/// Number of nanoseconds in one second, stored as `f64` so it can be multiplied
/// directly with fractional interval amounts.
const NANOS_PER_SECOND: f64 = 1_000_000_000_f64;
27-
2824
/// Readable file compression type
2925
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
3026
pub enum CompressionTypeVariant {
@@ -76,337 +72,3 @@ impl CompressionTypeVariant {
7672
!matches!(self, &Self::UNCOMPRESSED)
7773
}
7874
}
79-
80-
/// Units that may follow a number inside an interval string.
///
/// Every variant owns a distinct bit of a `u16`, so the units already seen while
/// parsing one interval can be recorded in a single bitmask.
#[derive(Clone, Copy)]
#[repr(u16)]
enum IntervalType {
    Century = 1 << 0,
    Decade = 1 << 1,
    Year = 1 << 2,
    Month = 1 << 3,
    Week = 1 << 4,
    Day = 1 << 5,
    Hour = 1 << 6,
    Minute = 1 << 7,
    Second = 1 << 8,
    Millisecond = 1 << 9,
    Microsecond = 1 << 10,
    Nanosecond = 1 << 11,
}
97-
98-
impl FromStr for IntervalType {
99-
type Err = DataFusionError;
100-
101-
fn from_str(s: &str) -> Result<Self> {
102-
match s.to_lowercase().as_str() {
103-
"century" | "centuries" => Ok(Self::Century),
104-
"decade" | "decades" => Ok(Self::Decade),
105-
"year" | "years" => Ok(Self::Year),
106-
"month" | "months" => Ok(Self::Month),
107-
"week" | "weeks" => Ok(Self::Week),
108-
"day" | "days" => Ok(Self::Day),
109-
"hour" | "hours" => Ok(Self::Hour),
110-
"minute" | "minutes" => Ok(Self::Minute),
111-
"second" | "seconds" => Ok(Self::Second),
112-
"millisecond" | "milliseconds" => Ok(Self::Millisecond),
113-
"microsecond" | "microseconds" => Ok(Self::Microsecond),
114-
"nanosecond" | "nanoseconds" => Ok(Self::Nanosecond),
115-
_ => Err(DataFusionError::NotImplemented(format!(
116-
"Unknown interval type: {s}"
117-
))),
118-
}
119-
}
120-
}
121-
122-
/// Parses a string with an interval like `'0.5 MONTH'` to an
123-
/// appropriately typed [`ScalarValue`]
124-
pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
125-
// We are storing parts as integers, it's why we need to align parts fractional
126-
// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
127-
// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
128-
let align_interval_parts =
129-
|month_part: f64, mut day_part: f64, mut nanos_part: f64| -> (i64, i64, f64) {
130-
// Convert fractional month to days, It's not supported by Arrow types, but anyway
131-
day_part += (month_part - (month_part as i64) as f64) * 30_f64;
132-
133-
// Convert fractional days to hours
134-
nanos_part += (day_part - ((day_part as i64) as f64))
135-
* 24_f64
136-
* SECONDS_PER_HOUR
137-
* NANOS_PER_SECOND;
138-
139-
(month_part as i64, day_part as i64, nanos_part)
140-
};
141-
142-
let mut used_interval_types = 0;
143-
144-
let mut calculate_from_part = |interval_period_str: &str,
145-
interval_type: &str|
146-
-> Result<(i64, i64, f64)> {
147-
// @todo It's better to use Decimal in order to protect rounding errors
148-
// Wait https://github.com/apache/arrow/pull/9232
149-
let interval_period = match f64::from_str(interval_period_str) {
150-
Ok(n) => n,
151-
Err(_) => {
152-
return Err(DataFusionError::NotImplemented(format!(
153-
"Unsupported Interval Expression with value {value:?}"
154-
)));
155-
}
156-
};
157-
158-
if interval_period > (i64::MAX as f64) {
159-
return Err(DataFusionError::NotImplemented(format!(
160-
"Interval field value out of range: {value:?}"
161-
)));
162-
}
163-
164-
let it = IntervalType::from_str(interval_type).map_err(|_| {
165-
DataFusionError::NotImplemented(format!(
166-
"Invalid input syntax for type interval: {value:?}"
167-
))
168-
})?;
169-
170-
// Disallow duplicate interval types
171-
if used_interval_types & (it as u16) != 0 {
172-
return Err(DataFusionError::SQL(ParserError::ParserError(format!(
173-
"Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
174-
))));
175-
} else {
176-
used_interval_types |= it as u16;
177-
}
178-
179-
match it {
180-
IntervalType::Century => {
181-
Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
182-
}
183-
IntervalType::Decade => {
184-
Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
185-
}
186-
IntervalType::Year => {
187-
Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
188-
}
189-
IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
190-
IntervalType::Week => {
191-
Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
192-
}
193-
IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
194-
IntervalType::Hour => {
195-
Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
196-
}
197-
IntervalType::Minute => {
198-
Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
199-
}
200-
IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
201-
IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
202-
IntervalType::Microsecond => Ok((0, 0, interval_period * 1_000f64)),
203-
IntervalType::Nanosecond => Ok((0, 0, interval_period)),
204-
}
205-
};
206-
207-
let mut result_month: i64 = 0;
208-
let mut result_days: i64 = 0;
209-
let mut result_nanos: i128 = 0;
210-
211-
let mut parts = value.split_whitespace();
212-
213-
loop {
214-
let interval_period_str = parts.next();
215-
if interval_period_str.is_none() {
216-
break;
217-
}
218-
219-
let unit = parts.next().unwrap_or(leading_field);
220-
221-
let (diff_month, diff_days, diff_nanos) =
222-
calculate_from_part(interval_period_str.unwrap(), unit)?;
223-
224-
result_month += diff_month;
225-
226-
if result_month > (i32::MAX as i64) {
227-
return Err(DataFusionError::NotImplemented(format!(
228-
"Interval field value out of range: {value:?}"
229-
)));
230-
}
231-
232-
result_days += diff_days;
233-
234-
if result_days > (i32::MAX as i64) {
235-
return Err(DataFusionError::NotImplemented(format!(
236-
"Interval field value out of range: {value:?}"
237-
)));
238-
}
239-
240-
result_nanos += diff_nanos as i128;
241-
242-
if result_nanos > (i64::MAX as i128) {
243-
return Err(DataFusionError::NotImplemented(format!(
244-
"Interval field value out of range: {value:?}"
245-
)));
246-
}
247-
}
248-
249-
// Interval is tricky thing
250-
// 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month
251-
// The true way to store and calculate intervals is to store it as it defined
252-
// It's why we there are 3 different interval types in Arrow
253-
254-
// If have a unit smaller than milliseconds then must use IntervalMonthDayNano
255-
if (result_nanos % 1_000_000 != 0)
256-
|| (result_month != 0 && (result_days != 0 || result_nanos != 0))
257-
{
258-
let result: i128 = ((result_month as i128) << 96)
259-
// ensure discard high 32 bits of result_days before casting to i128
260-
| (((result_days & u32::MAX as i64) as i128) << 64)
261-
// ensure discard high 64 bits of result_nanos
262-
| (result_nanos & u64::MAX as i128);
263-
264-
return Ok(ScalarValue::IntervalMonthDayNano(Some(result)));
265-
}
266-
267-
// Month interval
268-
if result_month != 0 {
269-
return Ok(ScalarValue::IntervalYearMonth(Some(result_month as i32)));
270-
}
271-
272-
// IntervalMonthDayNano uses nanos, but IntervalDayTime uses millis
273-
let result: i64 =
274-
// ensure discard high 32 bits of milliseconds
275-
(result_days << 32) | ((result_nanos as i64 / 1_000_000) & (u32::MAX as i64));
276-
Ok(ScalarValue::IntervalDayTime(Some(result)))
277-
}
278-
279-
#[cfg(test)]
mod test {
    use super::*;
    use crate::assert_contains;

    const MILLIS_PER_SECOND: f64 = 1_000_f64;

    /// Parses each input with `"months"` as the leading field and checks that it
    /// yields the paired scalar.
    fn assert_parses(cases: Vec<(&str, ScalarValue)>) {
        for (input, expected) in cases {
            let parsed = parse_interval("months", input).unwrap();
            assert_eq!(parsed, expected, "input: {input:?}");
        }
    }

    /// Year/month-only inputs produce year-month intervals; a misspelled unit is rejected.
    #[test]
    fn test_parse_ym() {
        assert_parses(vec![
            ("1 month", ScalarValue::new_interval_ym(0, 1)),
            ("2 month", ScalarValue::new_interval_ym(0, 2)),
            ("3 year 1 month", ScalarValue::new_interval_ym(3, 1)),
            ("3 year -1 month", ScalarValue::new_interval_ym(3, -1)),
            ("-3 year -1 month", ScalarValue::new_interval_ym(-3, -1)),
            ("-3 year 1 month", ScalarValue::new_interval_ym(-3, 1)),
        ]);

        let message = parse_interval("months", "1 centurys 1 month")
            .unwrap_err()
            .to_string();
        assert_contains!(
            message,
            r#"Invalid input syntax for type interval: "1 centurys 1 month""#
        );
    }

    /// Day and whole-millisecond time inputs produce day-time intervals.
    #[test]
    fn test_dt() {
        assert_parses(vec![
            ("5 days", ScalarValue::new_interval_dt(5, 0)),
            (
                "7 days 3 hours",
                ScalarValue::new_interval_dt(
                    7,
                    (3.0 * SECONDS_PER_HOUR * MILLIS_PER_SECOND) as i32,
                ),
            ),
            (
                "7 days 5 minutes",
                ScalarValue::new_interval_dt(7, 5 * 60 * MILLIS_PER_SECOND as i32),
            ),
            (
                "7 days -5 minutes",
                ScalarValue::new_interval_dt(7, -5 * 60 * MILLIS_PER_SECOND as i32),
            ),
            (
                "-7 days 5 hours",
                ScalarValue::new_interval_dt(-7, 5 * 60 * 60 * MILLIS_PER_SECOND as i32),
            ),
            (
                "-7 days -5 hours -5 minutes -5 seconds",
                ScalarValue::new_interval_dt(
                    -7,
                    -(5 * 60 * 60 + 5 * 60 + 5) * MILLIS_PER_SECOND as i32,
                ),
            ),
        ]);
    }

    /// Months mixed with smaller units, or sub-millisecond precision, produce
    /// month-day-nano intervals.
    #[test]
    fn test_mdn() {
        assert_parses(vec![
            (
                "1 year 25 millisecond",
                ScalarValue::new_interval_mdn(12, 0, 25 * 1_000_000),
            ),
            (
                "1 year 1 day 0.000000001 seconds",
                ScalarValue::new_interval_mdn(12, 1, 1),
            ),
            (
                "1 year 1 day 0.1 milliseconds",
                ScalarValue::new_interval_mdn(12, 1, 1_00 * 1_000),
            ),
            (
                "1 year 1 day 1 microsecond",
                ScalarValue::new_interval_mdn(12, 1, 1_000),
            ),
            (
                "1 year 1 day 5 nanoseconds",
                ScalarValue::new_interval_mdn(12, 1, 5),
            ),
            (
                "1 month -1 second",
                ScalarValue::new_interval_mdn(1, 0, -1_000_000_000),
            ),
            (
                "-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond",
                ScalarValue::new_interval_mdn(
                    -13,
                    -8,
                    -(60 * 60 + 60 + 1) * NANOS_PER_SECOND as i64 - 1_110_000,
                ),
            ),
        ]);
    }

    /// Repeating a unit within one interval string is a parser error.
    #[test]
    fn test_duplicate_interval_type() {
        let rendered = format!(
            "{:?}",
            parse_interval("months", "1 month 1 second 1 second")
                .expect_err("parsing interval should have failed")
        );
        assert_eq!(
            r#"SQL(ParserError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'"))"#,
            rendered
        );
    }
}

0 commit comments

Comments
 (0)