@@ -94,7 +94,7 @@ pub trait Accumulator: Send + Sync + Debug {
9494 ///
9595 /// Intermediate state is used for "multi-phase" grouping in
9696 /// DataFusion, where an aggregate is computed in parallel with
97- /// multiple `Accumulator` instances, as illustrated below:
97+ /// multiple `Accumulator` instances, as described below:
9898 ///
9999 /// # MultiPhase Grouping
100100 ///
@@ -130,7 +130,7 @@ pub trait Accumulator: Send + Sync + Debug {
130130 /// `───────' `───────'
131131 /// ```
132132 ///
133- /// The partial state is serialied as `Arrays` and then combined
133+ /// The partial state is serialized as `Arrays` and then combined
134134 /// with other partial states from different instances of this
135135 /// Accumulator (that ran on different partitions, for example).
136136 ///
@@ -147,6 +147,107 @@ pub trait Accumulator: Send + Sync + Debug {
147147 /// Note that [`ScalarValue::List`] can be used to pass multiple
148148 /// values if the number of intermediate values is not known at
149149 /// planning time (e.g. for `MEDIAN`)
150+ ///
151+ /// # Multi-phase repartitioned Grouping
152+ ///
153+ /// Many multi-phase grouping plans also contain a Repartition operation,
154+ /// as shown below:
155+ ///
156+ /// ```text
157+ /// ▲ ▲
158+ /// │ │
159+ /// │ │
160+ /// │ │
161+ /// │ │
162+ /// │ │
163+ /// ┌───────────────────────┐ ┌───────────────────────┐ 4. Each AggregateMode::Final
164+ /// │GroupBy │ │GroupBy │ GroupBy has an entry for its
165+ /// │(AggregateMode::Final) │ │(AggregateMode::Final) │ subset of groups (in this case
166+ /// │ │ │ │ that means half the entries)
167+ /// └───────────────────────┘ └───────────────────────┘
168+ /// ▲ ▲
169+ /// │ │
170+ /// └─────────────┬────────────┘
171+ /// │
172+ /// │
173+ /// │
174+ /// ┌─────────────────────────┐ 3. Repartitioning by hash(group
175+ /// │ Repartition │ keys) ensures that each distinct
176+ /// │ HASH(x) │ group key now appears in exactly
177+ /// └─────────────────────────┘ one partition
178+ /// ▲
179+ /// │
180+ /// ┌───────────────┴─────────────┐
181+ /// │ │
182+ /// │ │
183+ /// ┌─────────────────────────┐ ┌──────────────────────────┐ 2. Each AggregateMode::Partial
184+ /// │ GroupBy  │ │ GroupBy  │ GroupBy has an entry for *all*
185+ /// │(AggregateMode::Partial) │ │ (AggregateMode::Partial) │ the groups
186+ /// └─────────────────────────┘ └──────────────────────────┘
187+ /// ▲ ▲
188+ /// │ ┌┘
189+ /// │ │
190+ /// .─────────. .─────────.
191+ /// ,─' '─. ,─' '─.
192+ /// ; Input : ; Input : 1. Since input data is
193+ /// : Partition 0 ; : Partition 1 ; arbitrarily or RoundRobin
194+ /// ╲ ╱ ╲ ╱ distributed, each partition
195+ /// '─. ,─' '─. ,─' likely has all distinct
196+ /// `───────' `───────' group keys
197+ /// ```
198+ ///
199+ /// This structure is used so that the `AggregateMode::Partial` accumulators
200+ /// reduce the cardinality of the input as soon as possible. Typically,
201+ /// each partial accumulator sees all groups in the input as the group keys
202+ /// are evenly distributed across the input.
203+ ///
204+ /// The final output is computed by repartitioning the result of
205+ /// [`Self::state`] from each Partial aggregate by `hash(group keys)` so
206+ /// that each distinct group key appears in exactly one of the
207+ /// `AggregateMode::Final` GroupBy nodes. The outputs of the final nodes are
208+ /// then unioned together to produce the overall final output.
209+ ///
210+ /// Here is an example that shows the distribution of groups in the
211+ /// different phases:
212+ ///
213+ /// ```text
214+ /// ┌─────┐ ┌─────┐
215+ /// │ 1 │ │ 3 │
216+ /// ├─────┤ ├─────┤
217+ /// │ 2 │ │ 4 │ After repartitioning by
218+ /// └─────┘ └─────┘ hash(group keys), each distinct
219+ /// ┌─────┐ ┌─────┐ group key now appears in exactly
220+ /// │ 1 │ │ 3 │ one partition
221+ /// ├─────┤ ├─────┤
222+ /// │ 2 │ │ 4 │
223+ /// └─────┘ └─────┘
224+ ///
225+ ///
226+ /// ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
227+ ///
228+ /// ┌─────┐ ┌─────┐
229+ /// │ 2 │ │ 2 │
230+ /// ├─────┤ ├─────┤
231+ /// │ 1 │ │ 2 │
232+ /// ├─────┤ ├─────┤
233+ /// │ 3 │ │ 3 │
234+ /// ├─────┤ ├─────┤
235+ /// │ 4 │ │ 1 │
236+ /// └─────┘ └─────┘ Input data is arbitrarily or
237+ /// ... ... RoundRobin distributed, each
238+ /// ┌─────┐ ┌─────┐ partition likely has all
239+ /// │ 1 │ │ 4 │ distinct group keys
240+ /// ├─────┤ ├─────┤
241+ /// │ 4 │ │ 3 │
242+ /// ├─────┤ ├─────┤
243+ /// │ 1 │ │ 1 │
244+ /// ├─────┤ ├─────┤
245+ /// │ 4 │ │ 3 │
246+ /// └─────┘ └─────┘
247+ ///
248+ /// group values group values
249+ /// in partition 0 in partition 1
250+ /// ```
150251 fn state ( & mut self ) -> Result < Vec < ScalarValue > > ;
151252
152253 /// Updates the accumulator's state from an `Array` containing one
0 commit comments