Add validation for Decimal256 (#2113)

viirya · web-flow · commit 34f58f96d421 · 2022-07-22T10:02:24.000-07:00
* Add validation for decimal256

* Add validation in array data
diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs
@@ -352,7 +352,7 @@ impl From<ArrayData> for Decimal256Array {
         assert_eq!(
             data.buffers().len(),
             1,
-            "Decimal128Array data should contain 1 buffer only (values)"
+            "Decimal256Array data should contain 1 buffer only (values)"
         );
         let values = data.buffers()[0].as_ptr();
         let (precision, scale) = match data.data_type() {
diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use num::BigInt;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -25,7 +26,7 @@ use crate::array::{ArrayBuilder, FixedSizeBinaryBuilder};
 
 use crate::error::{ArrowError, Result};
 
-use crate::datatypes::validate_decimal_precision;
+use crate::datatypes::{validate_decimal256_precision, validate_decimal_precision};
 use crate::util::decimal::{BasicDecimal, Decimal256};
 
 /// Array Builder for [`Decimal128Array`]
@@ -51,6 +52,10 @@ pub struct Decimal256Builder {
     builder: FixedSizeBinaryBuilder,
     precision: usize,
     scale: usize,
+
+    /// Should decimal values be validated for compatibility with scale and precision?
+    /// defaults to true
+    value_validation: bool,
 }
 
 impl Decimal128Builder {
@@ -163,14 +168,35 @@ impl Decimal256Builder {
             builder: FixedSizeBinaryBuilder::new(capacity, Self::BYTE_LENGTH),
             precision,
             scale,
+            value_validation: true,
         }
     }
 
+    /// Disable validation
+    ///
+    /// # Safety
+    ///
+    /// After disabling validation, caller must ensure that appended values are compatible
+    /// for the specified precision and scale.
+    pub unsafe fn disable_value_validation(&mut self) {
+        self.value_validation = false;
+    }
+
     /// Appends a [`Decimal256`] number into the builder.
     ///
     /// Returns an error if `value` has different precision, scale or length in bytes than this builder
     #[inline]
     pub fn append_value(&mut self, value: &Decimal256) -> Result<()> {
+        let value = if self.value_validation {
+            let raw_bytes = value.raw_value();
+            let integer = BigInt::from_signed_bytes_le(raw_bytes);
+            let value_str = integer.to_string();
+            validate_decimal256_precision(&value_str, self.precision)?;
+            value
+        } else {
+            value
+        };
+
         if self.precision != value.precision() || self.scale != value.scale() {
             return Err(ArrowError::InvalidArgumentError(
                 "Decimal value does not have the same precision or scale as Decimal256Builder".to_string()
@@ -206,9 +232,10 @@ impl Decimal256Builder {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use num::Num;
 
     use crate::array::array_decimal::{BasicDecimalArray, Decimal128Array};
-    use crate::array::Array;
+    use crate::array::{array_decimal, Array};
     use crate::datatypes::DataType;
     use crate::util::decimal::Decimal128;
 
@@ -298,4 +325,40 @@ mod tests {
         let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap();
         builder.append_value(&value).unwrap();
     }
+
+    #[test]
+    #[should_panic(
+        expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 76. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999"
+    )]
+    fn test_decimal256_builder_out_of_range_precision_scale() {
+        let mut builder = Decimal256Builder::new(30, 76, 6);
+
+        let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap();
+        let bytes = big_value.to_signed_bytes_le();
+        let value = Decimal256::try_new_from_bytes(76, 6, &bytes).unwrap();
+        builder.append_value(&value).unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 76. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999"
+    )]
+    fn test_decimal256_data_validation() {
+        let mut builder = Decimal256Builder::new(30, 76, 6);
+        // Disable validation at builder
+        unsafe {
+            builder.disable_value_validation();
+        }
+
+        let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap();
+        let bytes = big_value.to_signed_bytes_le();
+        let value = Decimal256::try_new_from_bytes(76, 6, &bytes).unwrap();
+        builder
+            .append_value(&value)
+            .expect("should not validate invalid value at builder");
+
+        let array = builder.finish();
+        let array_data = array_decimal::BasicDecimalArray::data(&array);
+        array_data.validate_values().unwrap();
+    }
 }
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
@@ -18,14 +18,18 @@
 //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates
 //! common attributes and operations for Arrow array.
 
-use crate::datatypes::{validate_decimal_precision, DataType, IntervalUnit, UnionMode};
+use crate::datatypes::{
+    validate_decimal256_precision, validate_decimal_precision, DataType, IntervalUnit,
+    UnionMode,
+};
 use crate::error::{ArrowError, Result};
 use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};
 use crate::{
     buffer::{Buffer, MutableBuffer},
     util::bit_util,
 };
 use half::f16;
+use num::BigInt;
 use std::convert::TryInto;
 use std::mem;
 use std::ops::Range;
@@ -1018,6 +1022,17 @@ impl ArrayData {
                 }
                 Ok(())
             }
+            DataType::Decimal256(p, _) => {
+                let values = self.buffers()[0].as_slice();
+                for pos in 0..self.len() {
+                    let offset = pos * 32;
+                    let raw_bytes = &values[offset..offset + 32];
+                    let integer = BigInt::from_signed_bytes_le(raw_bytes);
+                    let value_str = integer.to_string();
+                    validate_decimal256_precision(&value_str, *p)?;
+                }
+                Ok(())
+            }
             DataType::Utf8 => self.validate_utf8::<i32>(),
             DataType::LargeUtf8 => self.validate_utf8::<i64>(),
             DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use num::{BigInt, Num, ToPrimitive};
 use std::fmt;
 
 use serde_derive::{Deserialize, Serialize};
@@ -300,6 +301,49 @@ pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [
     99999999999999999999999999999999999999,
 ];
 
+/// `MAX_DECIMAL_FOR_LARGER_PRECISION[p]` holds the maximum integer value
+/// that can be stored in [DataType::Decimal256] value of precision `p` > 38
+pub const MAX_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [
+    "99999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999999999999999999999",
+    "9999999999999999999999999999999999999999999999999999999999999999999999999",
+    "99999999999999999999999999999999999999999999999999999999999999999999999999",
+    "999999999999999999999999999999999999999999999999999999999999999999999999999",
+];
+
 /// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value
 /// that can be stored in a [DataType::Decimal] value of precision `p`
 pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [
@@ -343,13 +387,62 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [
     -99999999999999999999999999999999999999,
 ];
 
+/// `MIN_DECIMAL_FOR_LARGER_PRECISION[p]` holds the minimum integer value
+/// that can be stored in a [DataType::Decimal256] value of precision `p` > 38
+pub const MIN_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [
+    "-99999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999999999999999999999",
+    "-9999999999999999999999999999999999999999999999999999999999999999999999999",
+    "-99999999999999999999999999999999999999999999999999999999999999999999999999",
+    "-999999999999999999999999999999999999999999999999999999999999999999999999999",
+];
+
 /// The maximum precision for [DataType::Decimal] values
 pub const DECIMAL_MAX_PRECISION: usize = 38;
 
 /// The maximum scale for [DataType::Decimal] values
 pub const DECIMAL_MAX_SCALE: usize = 38;
 
-/// The default scale for [DataType::Decimal] values
+/// The maximum precision for [DataType::Decimal256] values
+pub const DECIMAL256_MAX_PRECISION: usize = 76;
+
+/// The maximum scale for [DataType::Decimal256] values
+pub const DECIMAL256_MAX_SCALE: usize = 76;
+
+/// The default scale for [DataType::Decimal] and [DataType::Decimal256] values
 pub const DECIMAL_DEFAULT_SCALE: usize = 10;
 
 /// Validates that the specified `i128` value can be properly
@@ -379,6 +472,55 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Resul
     }
 }
 
+/// Validates that the specified string value can be properly
+/// interpreted as a Decimal256 number with precision `precision`
+#[inline]
+pub(crate) fn validate_decimal256_precision(
+    value: &str,
+    precision: usize,
+) -> Result<BigInt> {
+    if precision > 38 {
+        let max_str = MAX_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1];
+        let min_str = MIN_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1];
+
+        let max = BigInt::from_str_radix(max_str, 10).unwrap();
+        let min = BigInt::from_str_radix(min_str, 10).unwrap();
+
+        let value = BigInt::from_str_radix(value, 10).unwrap();
+        if value > max {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "{} is too large to store in a Decimal256 of precision {}. Max is {}",
+                value, precision, max
+            )))
+        } else if value < min {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "{} is too small to store in a Decimal256 of precision {}. Min is {}",
+                value, precision, min
+            )))
+        } else {
+            Ok(value)
+        }
+    } else {
+        let max = MAX_DECIMAL_FOR_EACH_PRECISION[precision - 1];
+        let min = MIN_DECIMAL_FOR_EACH_PRECISION[precision - 1];
+        let value = BigInt::from_str_radix(value, 10).unwrap();
+
+        if value.to_i128().unwrap() > max {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "{} is too large to store in a Decimal256 of precision {}. Max is {}",
+                value, precision, max
+            )))
+        } else if value.to_i128().unwrap() < min {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "{} is too small to store in a Decimal256 of precision {}. Min is {}",
+                value, precision, min
+            )))
+        } else {
+            Ok(value)
+        }
+    }
+}
+
 impl DataType {
     /// Parse a data type from a JSON representation.
     pub(crate) fn from(json: &Value) -> Result<DataType> {