@@ -66,43 +66,57 @@ use log::trace;
6666/// min_values("X") -> None
6767/// ```
6868pub trait PruningStatistics {
69- /// return the minimum values for the named column, if known.
70- /// Note: the returned array must contain `num_containers()` rows
69+ /// Return the minimum values for the named column, if known.
70+ ///
71+ /// If the minimum value for a particular container is not known, the
72+ /// returned array should have `null` in that row. If the minimum value is
73+ /// unknown for every row, return `None`.
74+ ///
75+ /// Note: the returned array must contain [`Self::num_containers`] rows
7176 fn min_values ( & self , column : & Column ) -> Option < ArrayRef > ;
7277
73- /// return the maximum values for the named column, if known.
74- /// Note: the returned array must contain `num_containers()` rows.
78+ /// Return the maximum values for the named column, if known.
79+ ///
80+ /// See [`Self::min_values`] for when to return `None` and null values.
81+ ///
82+ /// Note: the returned array must contain [`Self::num_containers`] rows
7583 fn max_values ( & self , column : & Column ) -> Option < ArrayRef > ;
7684
77- /// return the number of containers (e.g. row groups) being
78- /// pruned with these statistics
85+ /// Return the number of containers (e.g. row groups) being
86+ /// pruned with these statistics (the number of rows in each returned array)
7987 fn num_containers ( & self ) -> usize ;
8088
81- /// return the number of null values for the named column as an
89+ /// Return the number of null values for the named column as an
8290 /// `Option<UInt64Array>`.
8391 ///
84- /// Note: the returned array must contain `num_containers()` rows.
92+ /// See [`Self::min_values`] for when to return `None` and null values.
93+ ///
94+ /// Note: the returned array must contain [`Self::num_containers`] rows
8595 fn null_counts ( & self , column : & Column ) -> Option < ArrayRef > ;
8696}
8797
88- /// Evaluates filter expressions on statistics, rather than the actual data. If
89- /// no rows could possibly pass the filter entire containers can be "pruned"
90- /// ( skipped), without reading any actual data, leading to significant
98+ /// Evaluates filter expressions on statistics such as min/max values and null
99+ /// counts, attempting to prove a "container" (e.g. Parquet Row Group) can be
100+ /// skipped without reading the actual data, potentially leading to significant
91101/// performance improvements.
92102///
93- /// [`PruningPredicate`]s are used to prune (avoid scanning) Parquet Row Groups
103+ /// For example, [`PruningPredicate`]s are used to prune Parquet Row Groups
94104/// based on the min/max values found in the Parquet metadata. If the
95105/// `PruningPredicate` can guarantee that no rows in the Row Group match the
96106/// filter, the entire Row Group is skipped during query execution.
97107///
98- /// Note that this API is designed to be general, as it works:
108+ /// The `PruningPredicate` API is general, allowing it to be used for pruning
109+ /// other types of containers (e.g. files) based on statistics that may be
110+ /// known from external catalogs (e.g. Delta Lake) or other sources. Thus it
111+ /// supports:
99112///
100113/// 1. Arbitrary expressions (including user defined functions)
101114///
102- /// 2. Anything that implements the [`PruningStatistics`] trait, not just
103- /// Parquet metadata, allowing it to be used by other systems to prune entities
104- /// (e.g. entire files) if the statistics are known via some other source, such
105- /// as a catalog.
115+ /// 2. Vectorized evaluation (provide more than one set of statistics at a time)
116+ /// so it is suitable for pruning 1000s of containers.
117+ ///
118+ /// 3. Anything that implements the [`PruningStatistics`] trait, not just
119+ /// Parquet metadata.
106120///
107121/// # Example
108122///
@@ -122,6 +136,7 @@ pub trait PruningStatistics {
122136/// B: true (rows might match x = 5)
123137/// C: true (rows might match x = 5)
124138/// ```
139+ ///
125140/// See [`PruningPredicate::try_new`] and [`PruningPredicate::prune`] for more information.
126141#[ derive( Debug , Clone ) ]
127142pub struct PruningPredicate {
@@ -251,8 +266,12 @@ fn is_always_true(expr: &Arc<dyn PhysicalExpr>) -> bool {
251266 . unwrap_or_default ( )
252267}
253268
254- /// Records for which columns statistics are necessary to evaluate a
255- /// pruning predicate.
269+ /// Describes which column statistics are necessary to evaluate a
270+ /// [`PruningPredicate`].
271+ ///
272+ /// This structure permits reading and creating the minimum number of statistics,
273+ /// which is important since statistics may be non-trivial to read (e.g. large
274+ /// strings or when there are 1000s of columns).
256275///
257276/// Handles creating references to the min/max statistics
258277/// for columns as well as recording which statistics are needed
0 commit comments