apache · Jefffrey · Sep 18, 2025 · Mar 24, 2025 · Mar 25, 2025 · Mar 25, 2025
diff --git a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+mod decimal;
 mod numeric;
 
+pub use decimal::DecimalDistinctAvgAccumulator;
 pub use numeric::Float64DistinctAvgAccumulator;
diff --git a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs
@@ -0,0 +1,192 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::{
+    array::{ArrayRef, ArrowNumericType},
+    datatypes::{i256, Decimal128Type, Decimal256Type, DecimalType},
+};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr_common::accumulator::Accumulator;
+use std::fmt::Debug;
+use std::mem::size_of_val;
+
+use crate::aggregate::sum_distinct::DistinctSumAccumulator;
+use crate::utils::DecimalAverager;
+
+/// `AVG(DISTINCT)` accumulator for decimal inputs, generic over the Arrow decimal type;
+/// its `evaluate` handles `Decimal128Type` and `Decimal256Type` sums.
+#[derive(Debug)]
+pub struct DecimalDistinctAvgAccumulator<T: DecimalType + Debug> {
+    sum_accumulator: DistinctSumAccumulator<T>, // inner accumulator: distinct count and sum
+    sum_scale: i8, // scale of the accumulated sum
+    target_precision: u8, // precision of the reported average
+    target_scale: i8, // scale of the reported average
+}
+
+impl<T: DecimalType + Debug> DecimalDistinctAvgAccumulator<T> {
+    /// Creates an accumulator whose sum type is `T::MAX_PRECISION` with `sum_scale`.
+    pub fn with_decimal_params(
+        sum_scale: i8,
+        target_precision: u8,
+        target_scale: i8,
+    ) -> Self {
+        let sum_type = T::TYPE_CONSTRUCTOR(T::MAX_PRECISION, sum_scale);
+        Self {
+            sum_accumulator: DistinctSumAccumulator::new(&sum_type),
+            sum_scale,
+            target_precision,
+            target_scale,
+        }
+    }
+}
+
+impl<T: DecimalType + ArrowNumericType + Debug> Accumulator
+    for DecimalDistinctAvgAccumulator<T>
+{
+    /// Returns the intermediate state reported by the inner distinct-sum accumulator.
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        self.sum_accumulator.state()
+    }
+
+    /// Forwards a batch of input arrays to the inner distinct-sum accumulator.
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.sum_accumulator.update_batch(values)
+    }
+
+    /// Forwards partial states to the inner distinct-sum accumulator for merging.
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.sum_accumulator.merge_batch(states)
+    }
+
+    /// Returns the average of the distinct values as a decimal with the target
+    /// precision and scale, or a null of that type when nothing was accumulated.
+    ///
+    /// # Errors
+    /// Propagates failures from the inner sum and from `DecimalAverager`; a sum that
+    /// is neither `Decimal128` nor `Decimal256` yields an internal error, not a panic.
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        let (precision, scale) = (self.target_precision, self.target_scale);
+        // Read the count once; the divisor is passed to `avg` as `i128` or `i256`.
+        let distinct = self.sum_accumulator.distinct_count() as i128;
+        if distinct == 0 {
+            return ScalarValue::new_primitive::<T>(
+                None,
+                &T::TYPE_CONSTRUCTOR(precision, scale),
+            );
+        }
+
+        match self.sum_accumulator.evaluate()? {
+            ScalarValue::Decimal128(Some(sum), _, _) => {
+                let averager = DecimalAverager::<Decimal128Type>::try_new(
+                    self.sum_scale,
+                    precision,
+                    scale,
+                )?;
+                let avg = averager.avg(sum, distinct)?;
+                Ok(ScalarValue::Decimal128(Some(avg), precision, scale))
+            }
+            ScalarValue::Decimal256(Some(sum), _, _) => {
+                let averager = DecimalAverager::<Decimal256Type>::try_new(
+                    self.sum_scale,
+                    precision,
+                    scale,
+                )?;
+                // Widen the count to `i256`, the native type of the Decimal256 sum.
+                let avg = averager.avg(sum, i256::from_i128(distinct))?;
+                Ok(ScalarValue::Decimal256(Some(avg), precision, scale))
+            }
+            // `T` is only bounded by `DecimalType`, so other decimal types could reach
+            // this arm; report them as an error instead of panicking.
+            other => {
+                datafusion_common::internal_err!("Unsupported decimal type: {:?}", other)
+            }
+        }
+    }
+
+    /// Reports `size_of_val(self)` plus the size reported by the inner accumulator.
+    /// NOTE(review): the inner struct may be counted twice if its `size()` includes itself.
+    fn size(&self) -> usize {
+        size_of_val(self) + self.sum_accumulator.size()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{Decimal128Array, Decimal256Array};
+    use std::sync::Arc;
+
+    /// Duplicates and nulls must not contribute to the Decimal128 average.
+    #[test]
+    fn test_decimal128_distinct_avg_accumulator() -> Result<()> {
+        let (input_precision, input_scale) = (10_u8, 4_i8);
+        let input = Decimal128Array::from(vec![
+            Some(100_0000),
+            Some(125_0000),
+            Some(175_0000),
+            Some(200_0000),
+            Some(200_0000),
+            Some(300_0000),
+            None,
+            None,
+        ])
+        .with_precision_and_scale(input_precision, input_scale)?;
+        let batch: ArrayRef = Arc::new(input);
+
+        let mut acc =
+            DecimalDistinctAvgAccumulator::<Decimal128Type>::with_decimal_params(
+                input_scale, 14, 8,
+            );
+        acc.update_batch(&[batch])?;
+
+        // Distinct values 100, 125, 175, 200, 300 sum to 900; 900 / 5 = 180 at scale 8.
+        let expected = ScalarValue::Decimal128(Some(180_00000000), 14, 8);
+        assert_eq!(acc.evaluate()?, expected);
+        Ok(())
+    }
+
+    /// Duplicates and nulls must not contribute to the Decimal256 average.
+    #[test]
+    fn test_decimal256_distinct_avg_accumulator() -> Result<()> {
+        let (input_precision, input_scale) = (50_u8, 2_i8);
+        let input = Decimal256Array::from(vec![
+            Some(i256::from_i128(10_000)),
+            Some(i256::from_i128(12_500)),
+            Some(i256::from_i128(17_500)),
+            Some(i256::from_i128(20_000)),
+            Some(i256::from_i128(20_000)),
+            Some(i256::from_i128(30_000)),
+            None,
+            None,
+        ])
+        .with_precision_and_scale(input_precision, input_scale)?;
+        let batch: ArrayRef = Arc::new(input);
+
+        let mut acc =
+            DecimalDistinctAvgAccumulator::<Decimal256Type>::with_decimal_params(
+                input_scale, 54, 6,
+            );
+        acc.update_batch(&[batch])?;
+
+        // Distinct values 100.00, 125.00, 175.00, 200.00, 300.00 sum to 900.00;
+        // 900 / 5 = 180, written at the target scale of 6.
+        let expected = ScalarValue::Decimal256(Some(i256::from_i128(180_000000)), 54, 6);
+        assert_eq!(acc.evaluate()?, expected);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs
@@ -27,6 +27,7 @@ use arrow::datatypes::{
     i256, ArrowNativeType, DataType, Decimal128Type, Decimal256Type, DecimalType,
     DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
     DurationSecondType, Field, FieldRef, Float64Type, TimeUnit, UInt64Type,
+    DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION,
 };
 use datafusion_common::{
     exec_err, not_impl_err, utils::take_function_args, Result, ScalarValue,
@@ -40,7 +41,9 @@ use datafusion_expr::{
     ReversedUDAF, Signature,
 };
 
-use datafusion_functions_aggregate_common::aggregate::avg_distinct::Float64DistinctAvgAccumulator;
+use datafusion_functions_aggregate_common::aggregate::avg_distinct::{
+    DecimalDistinctAvgAccumulator, Float64DistinctAvgAccumulator,
+};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState;
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::{
     filtered_null_mask, set_nulls,
@@ -120,13 +123,36 @@ impl AggregateUDFImpl for Avg {
 
         // instantiate specialized accumulator based for the type
         if acc_args.is_distinct {
-            match &data_type {
+            match (&data_type, acc_args.return_type()) {
                 // Numeric types are converted to Float64 via `coerce_avg_type` during logical plan creation
-                Float64 => Ok(Box::new(Float64DistinctAvgAccumulator::default())),
-                _ => exec_err!("AVG(DISTINCT) for {} not supported", data_type),
+                (Float64, _) => Ok(Box::new(Float64DistinctAvgAccumulator::default())),
+
+                (
+                    Decimal128(_, scale),
+                    Decimal128(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalDistinctAvgAccumulator::<Decimal128Type>::with_decimal_params(
+                    *scale,
+                    *target_precision,
+                    *target_scale,
+                ))),
+
+                (
+                    Decimal256(_, scale),
+                    Decimal256(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalDistinctAvgAccumulator::<Decimal256Type>::with_decimal_params(
+                    *scale,
+                    *target_precision,
+                    *target_scale,
+                ))),
+
+                (dt, return_type) => exec_err!(
+                    "AVG(DISTINCT) for ({} --> {}) not supported",
+                    dt,
+                    return_type
+                ),
             }
         } else {
-            match (&data_type, acc_args.return_field.data_type()) {
+            match (&data_type, acc_args.return_type()) {
                 (Float64, Float64) => Ok(Box::<AvgAccumulator>::default()),
                 (
                     Decimal128(sum_precision, sum_scale),
@@ -161,22 +187,31 @@ impl AggregateUDFImpl for Avg {
                     }))
                 }
 
-                _ => exec_err!(
-                    "AvgAccumulator for ({} --> {})",
-                    &data_type,
-                    acc_args.return_field.data_type()
-                ),
+                (dt, return_type) => {
+                    exec_err!("AvgAccumulator for ({} --> {})", dt, return_type)
+                }
             }
         }
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         if args.is_distinct {
-            // Copied from datafusion_functions_aggregate::sum::Sum::state_fields
+            // Decimal accumulator actually uses a different precision during accumulation,
+            // see DecimalDistinctAvgAccumulator::with_decimal_params
+            let dt = match args.input_fields[0].data_type() {
+                DataType::Decimal128(_, scale) => {
+                    DataType::Decimal128(DECIMAL128_MAX_PRECISION, *scale)
+                }
+                DataType::Decimal256(_, scale) => {
+                    DataType::Decimal256(DECIMAL256_MAX_PRECISION, *scale)
+                }
+                _ => args.return_type().clone(),
+            };
+            // Similar to datafusion_functions_aggregate::sum::Sum::state_fields
             // since the accumulator uses DistinctSumAccumulator internally.
             Ok(vec![Field::new_list(
                 format_state_name(args.name, "avg distinct"),
-                Field::new_list_field(args.return_type().clone(), true),
+                Field::new_list_field(dt, true),
                 false,
             )
             .into()])