|
18 | 18 | //! Interval parsing logic |
19 | 19 | use sqlparser::parser::ParserError; |
20 | 20 |
|
21 | | -use crate::{DataFusionError, Result, ScalarValue}; |
22 | 21 | use std::result; |
23 | 22 | use std::str::FromStr; |
24 | 23 |
|
25 | | -const SECONDS_PER_HOUR: f64 = 3_600_f64; |
26 | | -const NANOS_PER_SECOND: f64 = 1_000_000_000_f64; |
27 | | - |
28 | 24 | /// Readable file compression type |
29 | 25 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] |
30 | 26 | pub enum CompressionTypeVariant { |
@@ -76,337 +72,3 @@ impl CompressionTypeVariant { |
76 | 72 | !matches!(self, &Self::UNCOMPRESSED) |
77 | 73 | } |
78 | 74 | } |
79 | | - |
/// Units that may follow a number inside an interval expression.
///
/// Every variant owns exactly one bit of a `u16`, so `variant as u16` can be
/// OR-ed into a mask to remember which units have already been used.
#[rustfmt::skip]
#[derive(Clone, Copy)]
#[repr(u16)]
enum IntervalType {
    Century     = 1 << 0,
    Decade      = 1 << 1,
    Year        = 1 << 2,
    Month       = 1 << 3,
    Week        = 1 << 4,
    Day         = 1 << 5,
    Hour        = 1 << 6,
    Minute      = 1 << 7,
    Second      = 1 << 8,
    Millisecond = 1 << 9,
    Microsecond = 1 << 10,
    Nanosecond  = 1 << 11,
}
97 | | - |
98 | | -impl FromStr for IntervalType { |
99 | | - type Err = DataFusionError; |
100 | | - |
101 | | - fn from_str(s: &str) -> Result<Self> { |
102 | | - match s.to_lowercase().as_str() { |
103 | | - "century" | "centuries" => Ok(Self::Century), |
104 | | - "decade" | "decades" => Ok(Self::Decade), |
105 | | - "year" | "years" => Ok(Self::Year), |
106 | | - "month" | "months" => Ok(Self::Month), |
107 | | - "week" | "weeks" => Ok(Self::Week), |
108 | | - "day" | "days" => Ok(Self::Day), |
109 | | - "hour" | "hours" => Ok(Self::Hour), |
110 | | - "minute" | "minutes" => Ok(Self::Minute), |
111 | | - "second" | "seconds" => Ok(Self::Second), |
112 | | - "millisecond" | "milliseconds" => Ok(Self::Millisecond), |
113 | | - "microsecond" | "microseconds" => Ok(Self::Microsecond), |
114 | | - "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), |
115 | | - _ => Err(DataFusionError::NotImplemented(format!( |
116 | | - "Unknown interval type: {s}" |
117 | | - ))), |
118 | | - } |
119 | | - } |
120 | | -} |
121 | | - |
122 | | -/// Parses a string with an interval like `'0.5 MONTH'` to an |
123 | | -/// appropriately typed [`ScalarValue`] |
124 | | -pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> { |
125 | | - // We are storing parts as integers, it's why we need to align parts fractional |
126 | | - // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days |
127 | | - // INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours |
128 | | - let align_interval_parts = |
129 | | - |month_part: f64, mut day_part: f64, mut nanos_part: f64| -> (i64, i64, f64) { |
130 | | - // Convert fractional month to days, It's not supported by Arrow types, but anyway |
131 | | - day_part += (month_part - (month_part as i64) as f64) * 30_f64; |
132 | | - |
133 | | - // Convert fractional days to hours |
134 | | - nanos_part += (day_part - ((day_part as i64) as f64)) |
135 | | - * 24_f64 |
136 | | - * SECONDS_PER_HOUR |
137 | | - * NANOS_PER_SECOND; |
138 | | - |
139 | | - (month_part as i64, day_part as i64, nanos_part) |
140 | | - }; |
141 | | - |
142 | | - let mut used_interval_types = 0; |
143 | | - |
144 | | - let mut calculate_from_part = |interval_period_str: &str, |
145 | | - interval_type: &str| |
146 | | - -> Result<(i64, i64, f64)> { |
147 | | - // @todo It's better to use Decimal in order to protect rounding errors |
148 | | - // Wait https://github.com/apache/arrow/pull/9232 |
149 | | - let interval_period = match f64::from_str(interval_period_str) { |
150 | | - Ok(n) => n, |
151 | | - Err(_) => { |
152 | | - return Err(DataFusionError::NotImplemented(format!( |
153 | | - "Unsupported Interval Expression with value {value:?}" |
154 | | - ))); |
155 | | - } |
156 | | - }; |
157 | | - |
158 | | - if interval_period > (i64::MAX as f64) { |
159 | | - return Err(DataFusionError::NotImplemented(format!( |
160 | | - "Interval field value out of range: {value:?}" |
161 | | - ))); |
162 | | - } |
163 | | - |
164 | | - let it = IntervalType::from_str(interval_type).map_err(|_| { |
165 | | - DataFusionError::NotImplemented(format!( |
166 | | - "Invalid input syntax for type interval: {value:?}" |
167 | | - )) |
168 | | - })?; |
169 | | - |
170 | | - // Disallow duplicate interval types |
171 | | - if used_interval_types & (it as u16) != 0 { |
172 | | - return Err(DataFusionError::SQL(ParserError::ParserError(format!( |
173 | | - "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'" |
174 | | - )))); |
175 | | - } else { |
176 | | - used_interval_types |= it as u16; |
177 | | - } |
178 | | - |
179 | | - match it { |
180 | | - IntervalType::Century => { |
181 | | - Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)) |
182 | | - } |
183 | | - IntervalType::Decade => { |
184 | | - Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0)) |
185 | | - } |
186 | | - IntervalType::Year => { |
187 | | - Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0)) |
188 | | - } |
189 | | - IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)), |
190 | | - IntervalType::Week => { |
191 | | - Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0)) |
192 | | - } |
193 | | - IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)), |
194 | | - IntervalType::Hour => { |
195 | | - Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND)) |
196 | | - } |
197 | | - IntervalType::Minute => { |
198 | | - Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND)) |
199 | | - } |
200 | | - IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)), |
201 | | - IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)), |
202 | | - IntervalType::Microsecond => Ok((0, 0, interval_period * 1_000f64)), |
203 | | - IntervalType::Nanosecond => Ok((0, 0, interval_period)), |
204 | | - } |
205 | | - }; |
206 | | - |
207 | | - let mut result_month: i64 = 0; |
208 | | - let mut result_days: i64 = 0; |
209 | | - let mut result_nanos: i128 = 0; |
210 | | - |
211 | | - let mut parts = value.split_whitespace(); |
212 | | - |
213 | | - loop { |
214 | | - let interval_period_str = parts.next(); |
215 | | - if interval_period_str.is_none() { |
216 | | - break; |
217 | | - } |
218 | | - |
219 | | - let unit = parts.next().unwrap_or(leading_field); |
220 | | - |
221 | | - let (diff_month, diff_days, diff_nanos) = |
222 | | - calculate_from_part(interval_period_str.unwrap(), unit)?; |
223 | | - |
224 | | - result_month += diff_month; |
225 | | - |
226 | | - if result_month > (i32::MAX as i64) { |
227 | | - return Err(DataFusionError::NotImplemented(format!( |
228 | | - "Interval field value out of range: {value:?}" |
229 | | - ))); |
230 | | - } |
231 | | - |
232 | | - result_days += diff_days; |
233 | | - |
234 | | - if result_days > (i32::MAX as i64) { |
235 | | - return Err(DataFusionError::NotImplemented(format!( |
236 | | - "Interval field value out of range: {value:?}" |
237 | | - ))); |
238 | | - } |
239 | | - |
240 | | - result_nanos += diff_nanos as i128; |
241 | | - |
242 | | - if result_nanos > (i64::MAX as i128) { |
243 | | - return Err(DataFusionError::NotImplemented(format!( |
244 | | - "Interval field value out of range: {value:?}" |
245 | | - ))); |
246 | | - } |
247 | | - } |
248 | | - |
249 | | - // Interval is tricky thing |
250 | | - // 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month |
251 | | - // The true way to store and calculate intervals is to store it as it defined |
252 | | - // It's why we there are 3 different interval types in Arrow |
253 | | - |
254 | | - // If have a unit smaller than milliseconds then must use IntervalMonthDayNano |
255 | | - if (result_nanos % 1_000_000 != 0) |
256 | | - || (result_month != 0 && (result_days != 0 || result_nanos != 0)) |
257 | | - { |
258 | | - let result: i128 = ((result_month as i128) << 96) |
259 | | - // ensure discard high 32 bits of result_days before casting to i128 |
260 | | - | (((result_days & u32::MAX as i64) as i128) << 64) |
261 | | - // ensure discard high 64 bits of result_nanos |
262 | | - | (result_nanos & u64::MAX as i128); |
263 | | - |
264 | | - return Ok(ScalarValue::IntervalMonthDayNano(Some(result))); |
265 | | - } |
266 | | - |
267 | | - // Month interval |
268 | | - if result_month != 0 { |
269 | | - return Ok(ScalarValue::IntervalYearMonth(Some(result_month as i32))); |
270 | | - } |
271 | | - |
272 | | - // IntervalMonthDayNano uses nanos, but IntervalDayTime uses millis |
273 | | - let result: i64 = |
274 | | - // ensure discard high 32 bits of milliseconds |
275 | | - (result_days << 32) | ((result_nanos as i64 / 1_000_000) & (u32::MAX as i64)); |
276 | | - Ok(ScalarValue::IntervalDayTime(Some(result))) |
277 | | -} |
278 | | - |
#[cfg(test)]
mod test {
    use super::*;
    use crate::assert_contains;

    const MILLIS_PER_SECOND: f64 = 1_000_f64;

    /// Year/month-only inputs produce `IntervalYearMonth`; a misspelled unit is rejected.
    #[test]
    fn test_parse_ym() {
        // (input, (years, months))
        let cases = [
            ("1 month", (0, 1)),
            ("2 month", (0, 2)),
            ("3 year 1 month", (3, 1)),
            ("3 year -1 month", (3, -1)),
            ("-3 year -1 month", (-3, -1)),
            ("-3 year 1 month", (-3, 1)),
        ];
        for (input, (years, months)) in cases {
            assert_eq!(
                parse_interval("months", input).unwrap(),
                ScalarValue::new_interval_ym(years, months)
            );
        }

        let message = parse_interval("months", "1 centurys 1 month")
            .unwrap_err()
            .to_string();
        assert_contains!(
            message,
            r#"Invalid input syntax for type interval: "1 centurys 1 month""#
        );
    }

    /// Day/time inputs with whole milliseconds produce `IntervalDayTime`.
    #[test]
    fn test_dt() {
        // (input, days, milliseconds)
        let cases = [
            ("5 days", 5, 0),
            (
                "7 days 3 hours",
                7,
                (3.0 * SECONDS_PER_HOUR * MILLIS_PER_SECOND) as i32,
            ),
            ("7 days 5 minutes", 7, 5 * 60 * MILLIS_PER_SECOND as i32),
            ("7 days -5 minutes", 7, -5 * 60 * MILLIS_PER_SECOND as i32),
            ("-7 days 5 hours", -7, 5 * 60 * 60 * MILLIS_PER_SECOND as i32),
            (
                "-7 days -5 hours -5 minutes -5 seconds",
                -7,
                -(5 * 60 * 60 + 5 * 60 + 5) * MILLIS_PER_SECOND as i32,
            ),
        ];
        for (input, days, millis) in cases {
            assert_eq!(
                parse_interval("months", input).unwrap(),
                ScalarValue::new_interval_dt(days, millis)
            );
        }
    }

    /// Mixed month/day/time or sub-millisecond inputs produce `IntervalMonthDayNano`.
    #[test]
    fn test_mdn() {
        // (input, months, days, nanoseconds)
        let cases = [
            ("1 year 25 millisecond", 12, 0, 25 * 1_000_000),
            ("1 year 1 day 0.000000001 seconds", 12, 1, 1),
            ("1 year 1 day 0.1 milliseconds", 12, 1, 1_00 * 1_000),
            ("1 year 1 day 1 microsecond", 12, 1, 1_000),
            ("1 year 1 day 5 nanoseconds", 12, 1, 5),
            ("1 month -1 second", 1, 0, -1_000_000_000),
            (
                "-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond",
                -13,
                -8,
                -(60 * 60 + 60 + 1) * NANOS_PER_SECOND as i64 - 1_110_000,
            ),
        ];
        for (input, months, days, nanos) in cases {
            assert_eq!(
                parse_interval("months", input).unwrap(),
                ScalarValue::new_interval_mdn(months, days, nanos)
            );
        }
    }

    /// Repeating a unit is reported as a SQL parser error naming the unit.
    #[test]
    fn test_duplicate_interval_type() {
        let failure = parse_interval("months", "1 month 1 second 1 second")
            .expect_err("parsing interval should have failed");
        let rendered = format!("{failure:?}");
        assert_eq!(
            r#"SQL(ParserError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'"))"#,
            rendered
        );
    }
}
0 commit comments