Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 260 additions & 1 deletion datafusion/core/tests/sqllogictests/test_files/timestamps.slt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great test coverage and comments, thanks @NGA-TRAN 💯

Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ SELECT DATE_BIN(INTERVAL '5 microseconds', TIMESTAMP '2022-08-03 14:38:50.000006
2022-08-03T14:38:50.000005

# Does not support months for Month-Day-Nano interval
statement error This feature is not implemented: DATE_BIN stride does not support month intervals
statement error DataFusion error: This feature is not implemented: DATE_BIN stride does not support combination of month, day and nanosecond intervals
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new error message I have added in this PR

SELECT DATE_BIN(INTERVAL '1 month 5 nanoseconds', TIMESTAMP '2022-08-03 14:38:50.000000006Z', TIMESTAMP '1970-01-01T00:00:00Z')

# Can coerce string interval arguments
Expand Down Expand Up @@ -500,6 +500,265 @@ FROM (
(TIMESTAMP '2021-06-10 17:19:10Z', TIMESTAMP '2001-01-01T00:00:00Z', 0.3)
) as t (time, origin, val)


# month interval with INTERVAL keyword in date_bin with default start time
query P
select date_bin(INTERVAL '1 month', column1)
from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2022-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00')
) as sq
----
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-02-01T00:00:00
2022-02-01T00:00:00
2022-03-01T00:00:00


# year interval in date_bin with default start time
query P
select date_bin(INTERVAL '1 year', column1)
from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2023-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00')
) as sq
----
2022-01-01T00:00:00
2023-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00

query P
SELECT DATE_BIN('1 month', '2022-01-01 00:00:00Z', '1970-01-01T00:00:00Z');
----
2022-01-01T00:00:00


# Tests without INTERVAL keyword
# 1-month interval in date_bin with default start time
query P
select date_bin('1 month', column1)
from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2022-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00')
) as sq
----
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-02-01T00:00:00
2022-02-01T00:00:00
2022-03-01T00:00:00

# 2-month interval in date_bin with default start time
query P
select date_bin('2 month', column1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2022-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00')
) as sq
----
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-03-01T00:00:00


# month interval with start date end of the month plus some minutes
#
# The the return of `date_bin` is the start of the bin. The bin width is one year.
# The source data must be inside the bin.
# Since the origin is '1970-12-31T00:15:00Z', the start of the bins are
# '1970-12-31T00:15:00Z',
# '1971-12-31T00:15:00Z',
# ...,
# '2021-12-31T00:15:00Z',
# '2022-12-31T00:15:00Z',
# ...
#
# Note the datetime '2022-03-31 00:00:00'. Its bin is NOT '2022-03-31 00:15:00' which is after its time
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

# Its bin is '2022-02-28T00:15:00'
#
query P
select date_bin('1 month', column1, '1970-12-31T00:15:00Z')
from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2022-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00')
) as sq
----
2021-12-31T00:15:00
2021-12-31T00:15:00
2021-12-31T00:15:00
2022-01-31T00:15:00
2022-01-31T00:15:00
2022-02-28T00:15:00

# month interval with start date is end of the month plus some minutes
query P
select date_bin('2 months', column1, '1970-12-31T00:15:00Z')
from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2022-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00')
) as sq
----
2021-12-31T00:15:00
2021-12-31T00:15:00
2021-12-31T00:15:00
2021-12-31T00:15:00
2021-12-31T00:15:00
2022-02-28T00:15:00

# year interval in date_bin with default start time
query P
select date_bin('1 year', column1)
from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2022-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00'),
(timestamp '2023-10-28 01:33:00')
) as sq
----
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2022-01-01T00:00:00
2023-01-01T00:00:00

# year interval with start date is end of the month plus some minutes
query P
select date_bin('1 year', column1, '1970-12-31T00:15:00Z')
from (values
(timestamp '2022-01-01 00:00:00'),
(timestamp '2022-01-01 01:00:00'),
(timestamp '2022-01-02 00:00:00'),
(timestamp '2022-02-02 00:00:00'),
(timestamp '2022-02-15 00:00:00'),
(timestamp '2022-03-31 00:00:00'),
(timestamp '2023-03-31 00:00:00')
) as sq
----
2021-12-31T00:15:00
2021-12-31T00:15:00
2021-12-31T00:15:00
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why the timestamp

'2022-01-02 00:00:00'

would be binned into the prior year (2021)

2021-12-31

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return data is the start of the bin. The bin width is one year. The source data must be inside the bin. Since the origin is '1970-12-31T00:15:00Z', the start of the bins are '1970-12-31T00:15:00Z', '1971-12-31T00:15:00Z', ..., '2021-12-31T00:15:00Z', '2022-12-31T00:15:00Z', ...

'2022-01-02 00:00:00' is between '2021-12-31T00:15:00Z' and '2022-12-31T00:15:00Z' so it must be in the bin that starts with '2021-12-31T00:15:00Z'

2021-12-31T00:15:00
2021-12-31T00:15:00
2021-12-31T00:15:00
2022-12-31T00:15:00

# month interval on constant
query P
SELECT DATE_BIN('1 month', '2022-01-01 00:00:00Z');
----
2022-01-01T00:00:00

# five months interval on constant
query P
SELECT DATE_BIN('5 month', '2022-01-01T00:00:00Z');
----
2021-09-01T00:00:00

# month interval with default start time
query P
SELECT DATE_BIN('1 month', '2022-01-01 00:00:00Z');
----
2022-01-01T00:00:00

# origin on the first date but not at midnight
query P
SELECT DATE_BIN('1 month', '2022-04-01T00:00:00Z', '2021-05-01T00:04:00Z');
----
2022-03-01T00:04:00

# origin is May 31 (last date of the month) to produce bin on Feb 28
query P
SELECT DATE_BIN('3 month', '2022-04-01T00:00:00Z', '2021-05-31T00:04:00Z');
----
2022-02-28T00:04:00

# origin is on Feb 29 and interval is one month. The bins will be:
# '2000-02-29T00:00:00'
# '2000-01-29T00:00:00'
# '1999-12-29T00:00:00'
# ....
# Reason: Even though 29 (or 28 for non-leap year) is the last date of Feb but it
# is not last date of other month. Months' chrono consider a month before or after that
# will land on the same 29th date.
query P
select date_bin('1 month', timestamp '2000-01-31T00:00:00', timestamp '2000-02-29T00:00:00');
----
2000-01-29T00:00:00

# similar for the origin March 29
query P
select date_bin('1 month', timestamp '2000-01-31T00:00:00', timestamp '2000-03-29T00:00:00');
----
2000-01-29T00:00:00

# any value of origin
query P
SELECT DATE_BIN('3 month', '2022-01-01T00:00:00Z', '2021-05-05T17:56:21Z');
----
2021-11-05T17:56:21

# origin is later than source
query P
SELECT DATE_BIN('3 month', '2022-01-01T00:00:00Z', '2022-05-05T17:56:21Z');
----
2021-11-05T17:56:21

# year interval on constant
query P
SELECT DATE_BIN('1 year', '2022-01-01 00:00:00Z');
----
2022-01-01T00:00:00

# 3-year interval on constant
query P
SELECT DATE_BIN('3 year', '2022-01-01 00:00:00Z');
----
2021-01-01T00:00:00

# 3 year 1 months = 37 months
query P
SELECT DATE_BIN('3 years 1 months', '2022-09-01 00:00:00Z');
----
2022-06-01T00:00:00

###
## test date_trunc function
###
Expand Down
8 changes: 4 additions & 4 deletions datafusion/expr/src/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -474,21 +474,21 @@ pub fn signature(fun: &BuiltinScalarFunction) -> Signature {
BuiltinScalarFunction::DateBin => Signature::one_of(
vec![
TypeSignature::Exact(vec![
DataType::Interval(IntervalUnit::DayTime),
DataType::Interval(IntervalUnit::MonthDayNano),
DataType::Timestamp(TimeUnit::Nanosecond, None),
DataType::Timestamp(TimeUnit::Nanosecond, None),
]),
TypeSignature::Exact(vec![
DataType::Interval(IntervalUnit::MonthDayNano),
DataType::Interval(IntervalUnit::DayTime),
DataType::Timestamp(TimeUnit::Nanosecond, None),
DataType::Timestamp(TimeUnit::Nanosecond, None),
]),
TypeSignature::Exact(vec![
DataType::Interval(IntervalUnit::DayTime),
DataType::Interval(IntervalUnit::MonthDayNano),
DataType::Timestamp(TimeUnit::Nanosecond, None),
]),
TypeSignature::Exact(vec![
DataType::Interval(IntervalUnit::MonthDayNano),
DataType::Interval(IntervalUnit::DayTime),
DataType::Timestamp(TimeUnit::Nanosecond, None),
]),
],
Expand Down
Loading