|
21 | 21 | //! [`filter`]: crate::filter::filter |
22 | 22 | //! [`take`]: crate::take::take |
23 | 23 | use crate::concat::concat_batches; |
24 | | -use arrow_array::StringViewArray; |
| 24 | +use crate::filter::filter_record_batch; |
25 | 25 | use arrow_array::{cast::AsArray, Array, ArrayRef, RecordBatch}; |
| 26 | +use arrow_array::{BooleanArray, StringViewArray}; |
26 | 27 | use arrow_data::ByteView; |
27 | 28 | use arrow_schema::{ArrowError, SchemaRef}; |
28 | 29 | use std::collections::VecDeque; |
29 | 30 | use std::sync::Arc; |
30 | | - |
31 | 31 | // Originally From DataFusion's coalesce module: |
32 | 32 | // https://github.com/apache/datafusion/blob/9d2f04996604e709ee440b65f41e7b882f50b788/datafusion/physical-plan/src/coalesce/mod.rs#L26-L25 |
33 | 33 |
|
@@ -155,9 +155,62 @@ impl BatchCoalescer { |
155 | 155 | Arc::clone(&self.schema) |
156 | 156 | } |
157 | 157 |
|
158 | | - /// Push next batch into the Coalescer |
| 158 | + /// Push a batch into the Coalescer after applying a filter |
| 159 | + /// |
| 160 | + /// This is semantically equivalent to calling [`Self::push_batch`] |
| 161 | + /// with the results from [`filter_record_batch`]; any error from |
| 162 | + /// either of those calls is returned to the caller. |
| 163 | + /// |
| 164 | + /// # Example |
| 165 | + /// ``` |
| 166 | + /// # use arrow_array::{record_batch, BooleanArray}; |
| 167 | + /// # use arrow_select::coalesce::BatchCoalescer; |
| 168 | + /// let batch1 = record_batch!(("a", Int32, [1, 2, 3])).unwrap(); |
| 169 | + /// let batch2 = record_batch!(("a", Int32, [4, 5, 6])).unwrap(); |
| 170 | + /// // Apply a filter to each batch to pick the first and last row |
| 171 | + /// let filter = BooleanArray::from(vec![true, false, true]); |
| 172 | + /// // create a new Coalescer that targets creating 1000 row batches |
| 173 | + /// let mut coalescer = BatchCoalescer::new(batch1.schema(), 1000); |
| 174 | + /// coalescer.push_batch_with_filter(batch1, &filter); |
| 175 | + /// coalescer.push_batch_with_filter(batch2, &filter); |
| 176 | + /// // finish and retrieve the created batch |
| 177 | + /// coalescer.finish_buffered_batch().unwrap(); |
| 178 | + /// let completed_batch = coalescer.next_completed_batch().unwrap(); |
| 179 | + /// // filtered out 2 and 5: |
| 180 | + /// let expected_batch = record_batch!(("a", Int32, [1, 3, 4, 6])).unwrap(); |
| 181 | + /// assert_eq!(completed_batch, expected_batch); |
| 182 | + /// ``` |
| 183 | + pub fn push_batch_with_filter( |
| 184 | + &mut self, |
| 185 | + batch: RecordBatch, |
| 186 | + filter: &BooleanArray, |
| 187 | + ) -> Result<(), ArrowError> { |
| 188 | + // TODO: optimize this to avoid materializing the filtered rows |
| 189 | + // (i.e. copying the results of the filter into a new batch) |
| 190 | + let filtered_batch = filter_record_batch(&batch, filter)?; |
| 191 | + self.push_batch(filtered_batch) |
| 192 | + } |
| 193 | + |
| 194 | + /// Push all the rows from `batch` into the Coalescer |
159 | 195 | /// |
160 | 196 | /// See [`Self::next_completed_batch()`] to retrieve any completed batches. |
| 197 | + /// |
| 198 | + /// # Example |
| 199 | + /// ``` |
| 200 | + /// # use arrow_array::record_batch; |
| 201 | + /// # use arrow_select::coalesce::BatchCoalescer; |
| 202 | + /// let batch1 = record_batch!(("a", Int32, [1, 2, 3])).unwrap(); |
| 203 | + /// let batch2 = record_batch!(("a", Int32, [4, 5, 6])).unwrap(); |
| 204 | + /// // create a new Coalescer that targets creating 1000 row batches |
| 205 | + /// let mut coalescer = BatchCoalescer::new(batch1.schema(), 1000); |
| 206 | + /// coalescer.push_batch(batch1); |
| 207 | + /// coalescer.push_batch(batch2); |
| 208 | + /// // finish and retrieve the created batch |
| 209 | + /// coalescer.finish_buffered_batch().unwrap(); |
| 210 | + /// let completed_batch = coalescer.next_completed_batch().unwrap(); |
| 211 | + /// let expected_batch = record_batch!(("a", Int32, [1, 2, 3, 4, 5, 6])).unwrap(); |
| 212 | + /// assert_eq!(completed_batch, expected_batch); |
| 213 | + /// ``` |
161 | 214 | pub fn push_batch(&mut self, batch: RecordBatch) -> Result<(), ArrowError> { |
162 | 215 | if batch.num_rows() == 0 { |
163 | 216 | // If the batch is empty, we don't need to do anything |
|
0 commit comments