|
19 | 19 |
|
20 | 20 | use std::any::type_name; |
21 | 21 | use std::collections::HashSet; |
| 22 | +use std::fmt::{Display, Formatter}; |
22 | 23 | use std::sync::Arc; |
23 | 24 |
|
24 | 25 | use arrow::array::*; |
@@ -1777,97 +1778,173 @@ macro_rules! to_string { |
1777 | 1778 | }}; |
1778 | 1779 | } |
1779 | 1780 |
|
1780 | | -fn union_generic_lists<OffsetSize: OffsetSizeTrait>( |
/// The element-wise set operation applied to a pair of list arrays.
///
/// The enum is fieldless, so it is cheap to copy and can be passed by value
/// to every helper that needs it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SetOp {
    /// Keep the distinct elements found in either list (`array_union`).
    Union,
    /// Keep the distinct elements found in both lists (`array_intersect`).
    Intersect,
}
| 1786 | + |
| 1787 | +impl Display for SetOp { |
| 1788 | + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
| 1789 | + match self { |
| 1790 | + SetOp::Union => write!(f, "array_union"), |
| 1791 | + SetOp::Intersect => write!(f, "array_intersect"), |
| 1792 | + } |
| 1793 | + } |
| 1794 | +} |
| 1795 | + |
| 1796 | +fn generic_set_lists<OffsetSize: OffsetSizeTrait>( |
1781 | 1797 | l: &GenericListArray<OffsetSize>, |
1782 | 1798 | r: &GenericListArray<OffsetSize>, |
1783 | | - field: &FieldRef, |
1784 | | -) -> Result<GenericListArray<OffsetSize>> { |
1785 | | - let converter = RowConverter::new(vec![SortField::new(l.value_type())])?; |
| 1799 | + field: Arc<Field>, |
| 1800 | + set_op: SetOp, |
| 1801 | +) -> Result<ArrayRef> { |
| 1802 | + if matches!(l.value_type(), DataType::Null) { |
| 1803 | + let field = Arc::new(Field::new("item", r.value_type(), true)); |
| 1804 | + return general_array_distinct::<OffsetSize>(r, &field); |
| 1805 | + } else if matches!(r.value_type(), DataType::Null) { |
| 1806 | + let field = Arc::new(Field::new("item", l.value_type(), true)); |
| 1807 | + return general_array_distinct::<OffsetSize>(l, &field); |
| 1808 | + } |
1786 | 1809 |
|
1787 | | - let nulls = NullBuffer::union(l.nulls(), r.nulls()); |
1788 | | - let l_values = l.values().clone(); |
1789 | | - let r_values = r.values().clone(); |
1790 | | - let l_values = converter.convert_columns(&[l_values])?; |
1791 | | - let r_values = converter.convert_columns(&[r_values])?; |
| 1810 | + if l.value_type() != r.value_type() { |
| 1811 | + return internal_err!("{set_op:?} is not implemented for '{l:?}' and '{r:?}'"); |
| 1812 | + } |
1792 | 1813 |
|
1793 | | - // Might be worth adding an upstream OffsetBufferBuilder |
1794 | | - let mut offsets = Vec::<OffsetSize>::with_capacity(l.len() + 1); |
1795 | | - offsets.push(OffsetSize::usize_as(0)); |
1796 | | - let mut rows = Vec::with_capacity(l_values.num_rows() + r_values.num_rows()); |
1797 | | - let mut dedup = HashSet::new(); |
1798 | | - for (l_w, r_w) in l.offsets().windows(2).zip(r.offsets().windows(2)) { |
1799 | | - let l_slice = l_w[0].as_usize()..l_w[1].as_usize(); |
1800 | | - let r_slice = r_w[0].as_usize()..r_w[1].as_usize(); |
1801 | | - for i in l_slice { |
1802 | | - let left_row = l_values.row(i); |
1803 | | - if dedup.insert(left_row) { |
1804 | | - rows.push(left_row); |
1805 | | - } |
1806 | | - } |
1807 | | - for i in r_slice { |
1808 | | - let right_row = r_values.row(i); |
1809 | | - if dedup.insert(right_row) { |
1810 | | - rows.push(right_row); |
| 1814 | + let dt = l.value_type(); |
| 1815 | + |
| 1816 | + let mut offsets = vec![OffsetSize::usize_as(0)]; |
| 1817 | + let mut new_arrays = vec![]; |
| 1818 | + |
| 1819 | + let converter = RowConverter::new(vec![SortField::new(dt)])?; |
| 1820 | + for (first_arr, second_arr) in l.iter().zip(r.iter()) { |
| 1821 | + if let (Some(first_arr), Some(second_arr)) = (first_arr, second_arr) { |
| 1822 | + let l_values = converter.convert_columns(&[first_arr])?; |
| 1823 | + let r_values = converter.convert_columns(&[second_arr])?; |
| 1824 | + |
| 1825 | + let l_iter = l_values.iter().sorted().dedup(); |
| 1826 | + let values_set: HashSet<_> = l_iter.clone().collect(); |
| 1827 | + let mut rows = if set_op == SetOp::Union { |
| 1828 | + l_iter.collect::<Vec<_>>() |
| 1829 | + } else { |
| 1830 | + vec![] |
| 1831 | + }; |
| 1832 | + for r_val in r_values.iter().sorted().dedup() { |
| 1833 | + match set_op { |
| 1834 | + SetOp::Union => { |
| 1835 | + if !values_set.contains(&r_val) { |
| 1836 | + rows.push(r_val); |
| 1837 | + } |
| 1838 | + } |
| 1839 | + SetOp::Intersect => { |
| 1840 | + if values_set.contains(&r_val) { |
| 1841 | + rows.push(r_val); |
| 1842 | + } |
| 1843 | + } |
| 1844 | + } |
1811 | 1845 | } |
| 1846 | + |
| 1847 | + let last_offset = match offsets.last().copied() { |
| 1848 | + Some(offset) => offset, |
| 1849 | + None => return internal_err!("offsets should not be empty"), |
| 1850 | + }; |
| 1851 | + offsets.push(last_offset + OffsetSize::usize_as(rows.len())); |
| 1852 | + let arrays = converter.convert_rows(rows)?; |
| 1853 | + let array = match arrays.first() { |
| 1854 | + Some(array) => array.clone(), |
| 1855 | + None => { |
| 1856 | + return internal_err!("{set_op}: failed to get array from rows"); |
| 1857 | + } |
| 1858 | + }; |
| 1859 | + new_arrays.push(array); |
1812 | 1860 | } |
1813 | | - offsets.push(OffsetSize::usize_as(rows.len())); |
1814 | | - dedup.clear(); |
1815 | 1861 | } |
1816 | 1862 |
|
1817 | | - let values = converter.convert_rows(rows)?; |
1818 | 1863 | let offsets = OffsetBuffer::new(offsets.into()); |
1819 | | - let result = values[0].clone(); |
1820 | | - Ok(GenericListArray::<OffsetSize>::new( |
1821 | | - field.clone(), |
1822 | | - offsets, |
1823 | | - result, |
1824 | | - nulls, |
1825 | | - )) |
| 1864 | + let new_arrays_ref = new_arrays.iter().map(|v| v.as_ref()).collect::<Vec<_>>(); |
| 1865 | + let values = compute::concat(&new_arrays_ref)?; |
| 1866 | + let arr = GenericListArray::<OffsetSize>::try_new(field, offsets, values, None)?; |
| 1867 | + Ok(Arc::new(arr)) |
1826 | 1868 | } |
1827 | 1869 |
|
1828 | | -/// Array_union SQL function |
1829 | | -pub fn array_union(args: &[ArrayRef]) -> Result<ArrayRef> { |
1830 | | - if args.len() != 2 { |
1831 | | - return exec_err!("array_union needs 2 arguments"); |
1832 | | - } |
1833 | | - let array1 = &args[0]; |
1834 | | - let array2 = &args[1]; |
| 1870 | +fn general_set_op( |
| 1871 | + array1: &ArrayRef, |
| 1872 | + array2: &ArrayRef, |
| 1873 | + set_op: SetOp, |
| 1874 | +) -> Result<ArrayRef> { |
| 1875 | + match (array1.data_type(), array2.data_type()) { |
| 1876 | + (DataType::Null, DataType::List(field)) => { |
| 1877 | + if set_op == SetOp::Intersect { |
| 1878 | + return Ok(new_empty_array(&DataType::Null)); |
| 1879 | + } |
| 1880 | + let array = as_list_array(&array2)?; |
| 1881 | + general_array_distinct::<i32>(array, field) |
| 1882 | + } |
1835 | 1883 |
|
1836 | | - fn union_arrays<O: OffsetSizeTrait>( |
1837 | | - array1: &ArrayRef, |
1838 | | - array2: &ArrayRef, |
1839 | | - l_field_ref: &Arc<Field>, |
1840 | | - r_field_ref: &Arc<Field>, |
1841 | | - ) -> Result<ArrayRef> { |
1842 | | - match (l_field_ref.data_type(), r_field_ref.data_type()) { |
1843 | | - (DataType::Null, _) => Ok(array2.clone()), |
1844 | | - (_, DataType::Null) => Ok(array1.clone()), |
1845 | | - (_, _) => { |
1846 | | - let list1 = array1.as_list::<O>(); |
1847 | | - let list2 = array2.as_list::<O>(); |
1848 | | - let result = union_generic_lists::<O>(list1, list2, l_field_ref)?; |
1849 | | - Ok(Arc::new(result)) |
| 1884 | + (DataType::List(field), DataType::Null) => { |
| 1885 | + if set_op == SetOp::Intersect { |
| 1886 | + return make_array(&[]); |
1850 | 1887 | } |
| 1888 | + let array = as_list_array(&array1)?; |
| 1889 | + general_array_distinct::<i32>(array, field) |
1851 | 1890 | } |
1852 | | - } |
| 1891 | + (DataType::Null, DataType::LargeList(field)) => { |
| 1892 | + if set_op == SetOp::Intersect { |
| 1893 | + return Ok(new_empty_array(&DataType::Null)); |
| 1894 | + } |
| 1895 | + let array = as_large_list_array(&array2)?; |
| 1896 | + general_array_distinct::<i64>(array, field) |
| 1897 | + } |
| 1898 | + (DataType::LargeList(field), DataType::Null) => { |
| 1899 | + if set_op == SetOp::Intersect { |
| 1900 | + return make_array(&[]); |
| 1901 | + } |
| 1902 | + let array = as_large_list_array(&array1)?; |
| 1903 | + general_array_distinct::<i64>(array, field) |
| 1904 | + } |
| 1905 | + (DataType::Null, DataType::Null) => Ok(new_empty_array(&DataType::Null)), |
1853 | 1906 |
|
1854 | | - match (array1.data_type(), array2.data_type()) { |
1855 | | - (DataType::Null, _) => Ok(array2.clone()), |
1856 | | - (_, DataType::Null) => Ok(array1.clone()), |
1857 | | - (DataType::List(l_field_ref), DataType::List(r_field_ref)) => { |
1858 | | - union_arrays::<i32>(array1, array2, l_field_ref, r_field_ref) |
| 1907 | + (DataType::List(field), DataType::List(_)) => { |
| 1908 | + let array1 = as_list_array(&array1)?; |
| 1909 | + let array2 = as_list_array(&array2)?; |
| 1910 | + generic_set_lists::<i32>(array1, array2, field.clone(), set_op) |
1859 | 1911 | } |
1860 | | - (DataType::LargeList(l_field_ref), DataType::LargeList(r_field_ref)) => { |
1861 | | - union_arrays::<i64>(array1, array2, l_field_ref, r_field_ref) |
| 1912 | + (DataType::LargeList(field), DataType::LargeList(_)) => { |
| 1913 | + let array1 = as_large_list_array(&array1)?; |
| 1914 | + let array2 = as_large_list_array(&array2)?; |
| 1915 | + generic_set_lists::<i64>(array1, array2, field.clone(), set_op) |
1862 | 1916 | } |
1863 | | - _ => { |
| 1917 | + (data_type1, data_type2) => { |
1864 | 1918 | internal_err!( |
1865 | | - "array_union only support list with offsets of type int32 and int64" |
| 1919 | + "{set_op} does not support types '{data_type1:?}' and '{data_type2:?}'" |
1866 | 1920 | ) |
1867 | 1921 | } |
1868 | 1922 | } |
1869 | 1923 | } |
1870 | 1924 |
|
| 1925 | +/// Array_union SQL function |
| 1926 | +pub fn array_union(args: &[ArrayRef]) -> Result<ArrayRef> { |
| 1927 | + if args.len() != 2 { |
| 1928 | + return exec_err!("array_union needs two arguments"); |
| 1929 | + } |
| 1930 | + let array1 = &args[0]; |
| 1931 | + let array2 = &args[1]; |
| 1932 | + |
| 1933 | + general_set_op(array1, array2, SetOp::Union) |
| 1934 | +} |
| 1935 | + |
| 1936 | +/// array_intersect SQL function |
| 1937 | +pub fn array_intersect(args: &[ArrayRef]) -> Result<ArrayRef> { |
| 1938 | + if args.len() != 2 { |
| 1939 | + return exec_err!("array_intersect needs two arguments"); |
| 1940 | + } |
| 1941 | + |
| 1942 | + let array1 = &args[0]; |
| 1943 | + let array2 = &args[1]; |
| 1944 | + |
| 1945 | + general_set_op(array1, array2, SetOp::Intersect) |
| 1946 | +} |
| 1947 | + |
1871 | 1948 | /// Array_to_string SQL function |
1872 | 1949 | pub fn array_to_string(args: &[ArrayRef]) -> Result<ArrayRef> { |
1873 | 1950 | if args.len() < 2 || args.len() > 3 { |
@@ -2228,7 +2305,7 @@ pub fn array_has(args: &[ArrayRef]) -> Result<ArrayRef> { |
2228 | 2305 | DataType::LargeList(_) => { |
2229 | 2306 | general_array_has_dispatch::<i64>(&args[0], &args[1], ComparisonType::Single) |
2230 | 2307 | } |
2231 | | - _ => internal_err!("array_has does not support type '{array_type:?}'."), |
| 2308 | + _ => exec_err!("array_has does not support type '{array_type:?}'."), |
2232 | 2309 | } |
2233 | 2310 | } |
2234 | 2311 |
|
@@ -2359,74 +2436,6 @@ pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef |
2359 | 2436 | Ok(Arc::new(list_array) as ArrayRef) |
2360 | 2437 | } |
2361 | 2438 |
|
2362 | | -/// array_intersect SQL function |
2363 | | -pub fn array_intersect(args: &[ArrayRef]) -> Result<ArrayRef> { |
2364 | | - if args.len() != 2 { |
2365 | | - return exec_err!("array_intersect needs two arguments"); |
2366 | | - } |
2367 | | - |
2368 | | - let first_array = &args[0]; |
2369 | | - let second_array = &args[1]; |
2370 | | - |
2371 | | - match (first_array.data_type(), second_array.data_type()) { |
2372 | | - (DataType::Null, _) => Ok(second_array.clone()), |
2373 | | - (_, DataType::Null) => Ok(first_array.clone()), |
2374 | | - _ => { |
2375 | | - let first_array = as_list_array(&first_array)?; |
2376 | | - let second_array = as_list_array(&second_array)?; |
2377 | | - |
2378 | | - if first_array.value_type() != second_array.value_type() { |
2379 | | - return internal_err!("array_intersect is not implemented for '{first_array:?}' and '{second_array:?}'"); |
2380 | | - } |
2381 | | - |
2382 | | - let dt = first_array.value_type(); |
2383 | | - |
2384 | | - let mut offsets = vec![0]; |
2385 | | - let mut new_arrays = vec![]; |
2386 | | - |
2387 | | - let converter = RowConverter::new(vec![SortField::new(dt.clone())])?; |
2388 | | - for (first_arr, second_arr) in first_array.iter().zip(second_array.iter()) { |
2389 | | - if let (Some(first_arr), Some(second_arr)) = (first_arr, second_arr) { |
2390 | | - let l_values = converter.convert_columns(&[first_arr])?; |
2391 | | - let r_values = converter.convert_columns(&[second_arr])?; |
2392 | | - |
2393 | | - let values_set: HashSet<_> = l_values.iter().collect(); |
2394 | | - let mut rows = Vec::with_capacity(r_values.num_rows()); |
2395 | | - for r_val in r_values.iter().sorted().dedup() { |
2396 | | - if values_set.contains(&r_val) { |
2397 | | - rows.push(r_val); |
2398 | | - } |
2399 | | - } |
2400 | | - |
2401 | | - let last_offset: i32 = match offsets.last().copied() { |
2402 | | - Some(offset) => offset, |
2403 | | - None => return internal_err!("offsets should not be empty"), |
2404 | | - }; |
2405 | | - offsets.push(last_offset + rows.len() as i32); |
2406 | | - let arrays = converter.convert_rows(rows)?; |
2407 | | - let array = match arrays.first() { |
2408 | | - Some(array) => array.clone(), |
2409 | | - None => { |
2410 | | - return internal_err!( |
2411 | | - "array_intersect: failed to get array from rows" |
2412 | | - ) |
2413 | | - } |
2414 | | - }; |
2415 | | - new_arrays.push(array); |
2416 | | - } |
2417 | | - } |
2418 | | - |
2419 | | - let field = Arc::new(Field::new("item", dt, true)); |
2420 | | - let offsets = OffsetBuffer::new(offsets.into()); |
2421 | | - let new_arrays_ref = |
2422 | | - new_arrays.iter().map(|v| v.as_ref()).collect::<Vec<_>>(); |
2423 | | - let values = compute::concat(&new_arrays_ref)?; |
2424 | | - let arr = Arc::new(ListArray::try_new(field, offsets, values, None)?); |
2425 | | - Ok(arr) |
2426 | | - } |
2427 | | - } |
2428 | | -} |
2429 | | - |
2430 | 2439 | pub fn general_array_distinct<OffsetSize: OffsetSizeTrait>( |
2431 | 2440 | array: &GenericListArray<OffsetSize>, |
2432 | 2441 | field: &FieldRef, |
|
0 commit comments