Merged
2 changes: 2 additions & 0 deletions arrow-avro/Cargo.toml
@@ -39,6 +39,7 @@ all-features = true
default = ["deflate", "snappy", "zstd", "bzip2", "xz"]
deflate = ["flate2"]
snappy = ["snap", "crc"]
canonical_extension_types = ["arrow-schema/canonical_extension_types"]

[dependencies]
arrow-schema = { workspace = true }
@@ -52,6 +53,7 @@ zstd = { version = "0.13", default-features = false, optional = true }
bzip2 = { version = "0.4.4", default-features = false, optional = true }
xz = { version = "0.1", default-features = false, optional = true }
crc = { version = "3.0", optional = true }
uuid = "1.17"

[dev-dependencies]
rand = { version = "0.9.1", default-features = false, features = ["std", "std_rng", "thread_rng"] }
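The new unconditional `uuid` dependency presumably backs the conversion of Avro's string-encoded `uuid` logical type into the 16-byte values that end up in a `FixedSizeBinary(16)` column. The decoder itself is not part of this diff, so the following is only a minimal sketch of that conversion using the `uuid` crate; the helper name is hypothetical.

```rust
use arrow_schema::ArrowError;

/// Hypothetical helper: parse the textual UUID carried by an Avro `string`
/// with the `uuid` logical type into the 16 bytes stored in a
/// `FixedSizeBinary(16)` column.
fn uuid_string_to_bytes(s: &str) -> Result<[u8; 16], ArrowError> {
    uuid::Uuid::try_parse(s)
        .map(|u| *u.as_bytes())
        .map_err(|e| ArrowError::ParseError(format!("Invalid UUID string '{s}': {e}")))
}

fn main() -> Result<(), ArrowError> {
    // Matches the first UUID in the new test below.
    let bytes = uuid_string_to_bytes("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66")?;
    assert_eq!(bytes[0], 0xfe);
    Ok(())
}
```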
32 changes: 28 additions & 4 deletions arrow-avro/src/codec.rs
@@ -37,6 +37,14 @@ pub enum Nullability {
NullSecond,
}

#[cfg(feature = "canonical_extension_types")]
fn with_extension_type(codec: &Codec, field: Field) -> Field {
match codec {
Codec::Uuid => field.with_extension_type(arrow_schema::extension::Uuid),
_ => field,
}
}

/// An Avro datatype mapped to the arrow data model
#[derive(Debug, Clone)]
pub struct AvroDataType {
@@ -61,8 +69,13 @@ impl AvroDataType {

/// Returns an arrow [`Field`] with the given name
pub fn field_with_name(&self, name: &str) -> Field {
let d = self.codec.data_type();
Field::new(name, d, self.nullability.is_some()).with_metadata(self.metadata.clone())
let nullable = self.nullability.is_some();
let data_type = self.codec.data_type();
let field = Field::new(name, data_type, nullable).with_metadata(self.metadata.clone());
#[cfg(feature = "canonical_extension_types")]
return with_extension_type(&self.codec, field);
#[cfg(not(feature = "canonical_extension_types"))]
field
}

/// Returns a reference to the codec used by this data type
@@ -200,7 +213,7 @@ pub enum Codec {
/// - `scale` (`Option<usize>`): Number of fractional digits.
/// - `fixed_size` (`Option<usize>`): Size in bytes if backed by a `fixed` type, otherwise `None`.
Decimal(usize, Option<usize>, Option<usize>),
/// Represents Avro Uuid type, a FixedSizeBinary with a length of 16
/// Represents Avro Uuid type, a FixedSizeBinary with a length of 16.
Uuid,
/// Represents an Avro enum, maps to Arrow's Dictionary(Int32, Utf8) type.
///
@@ -479,6 +492,18 @@ fn make_data_type<'a>(
codec: Codec::Decimal(precision, Some(scale), Some(size as usize)),
}
}
Some("duration") => {
if size != 12 {
return Err(ArrowError::ParseError(format!(
"Invalid fixed size for Duration: {size}, must be 12"
)));
};
AvroDataType {
nullability: None,
metadata: md,
codec: Codec::Interval,
}
}
_ => AvroDataType {
nullability: None,
metadata: md,
@@ -543,7 +568,6 @@ fn make_data_type<'a>(
(Some("local-timestamp-micros"), c @ Codec::Int64) => {
*c = Codec::TimestampMicros(false)
}
(Some("duration"), c @ Codec::Fixed(12)) => *c = Codec::Interval,
(Some("uuid"), c @ Codec::Utf8) => *c = Codec::Uuid,
(Some(logical), _) => {
// Insert unrecognized logical type into metadata map
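With the `canonical_extension_types` feature enabled, `field_with_name` now tags UUID columns with the canonical `arrow.uuid` extension type. A minimal sketch of what that annotation looks like on a plain `Field`, mirroring the new `with_extension_type` call above (the metadata-key check is shown here because `ARROW:extension:name` is the standard key the annotation is stored under):

```rust
use arrow_schema::extension::Uuid; // requires the `canonical_extension_types` feature
use arrow_schema::{DataType, Field};

fn main() {
    // Mirror what `field_with_name` does for `Codec::Uuid`: a 16-byte
    // fixed-size binary field annotated with the canonical UUID extension type.
    let field = Field::new("uuid_field", DataType::FixedSizeBinary(16), false)
        .with_extension_type(Uuid);

    // The annotation is carried in the field metadata under the
    // `ARROW:extension:name` key defined by the Arrow format.
    assert_eq!(
        field.metadata().get("ARROW:extension:name").map(String::as_str),
        Some("arrow.uuid")
    );
}
```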
65 changes: 63 additions & 2 deletions arrow-avro/src/reader/mod.rs
@@ -397,9 +397,9 @@ mod test {
use crate::reader::vlq::VLQDecoder;
use crate::reader::{read_header, Decoder, ReaderBuilder};
use crate::test_util::arrow_test_data;
use arrow_array::types::Int32Type;
use arrow_array::types::{Int32Type, IntervalMonthDayNanoType};
use arrow_array::*;
use arrow_schema::{ArrowError, DataType, Field, Schema};
use arrow_schema::{ArrowError, DataType, Field, IntervalUnit, Schema};
use bytes::{Buf, BufMut, Bytes};
use futures::executor::block_on;
use futures::{stream, Stream, StreamExt, TryStreamExt};
@@ -796,4 +796,65 @@ mod test {
assert_eq!(actual2, expected);
}
}

#[test]
fn test_duration_uuid() {
let batch = read_file("test/data/duration_uuid.avro", 4, false);
let schema = batch.schema();
let fields = schema.fields();
assert_eq!(fields.len(), 2);
assert_eq!(fields[0].name(), "duration_field");
assert_eq!(
fields[0].data_type(),
&DataType::Interval(IntervalUnit::MonthDayNano)
);
assert_eq!(fields[1].name(), "uuid_field");
assert_eq!(fields[1].data_type(), &DataType::FixedSizeBinary(16));
assert_eq!(batch.num_rows(), 4);
assert_eq!(batch.num_columns(), 2);
let duration_array = batch
.column(0)
.as_any()
.downcast_ref::<IntervalMonthDayNanoArray>()
.unwrap();
let expected_duration_array: IntervalMonthDayNanoArray = [
Some(IntervalMonthDayNanoType::make_value(1, 15, 500_000_000)),
Some(IntervalMonthDayNanoType::make_value(0, 5, 2_500_000_000)),
Some(IntervalMonthDayNanoType::make_value(2, 0, 0)),
Some(IntervalMonthDayNanoType::make_value(12, 31, 999_000_000)),
]
.iter()
.copied()
.collect();
assert_eq!(&expected_duration_array, duration_array);
let uuid_array = batch
.column(1)
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
let expected_uuid_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
[
Some([
0xfe, 0x7b, 0xc3, 0x0b, 0x4c, 0xe8, 0x4c, 0x5e, 0xb6, 0x7c, 0x22, 0x34, 0xa2,
0xd3, 0x8e, 0x66,
]),
Some([
0xb3, 0x3f, 0x2a, 0xd7, 0x97, 0xb4, 0x4d, 0xe1, 0x8b, 0xfe, 0x94, 0x94, 0x1d,
0x60, 0x15, 0x6e,
]),
Some([
0x5f, 0x74, 0x92, 0x64, 0x07, 0x4b, 0x40, 0x05, 0x84, 0xbf, 0x11, 0x5e, 0xa8,
0x4e, 0xd2, 0x0a,
]),
Some([
0x08, 0x26, 0xcc, 0x06, 0xd2, 0xe3, 0x45, 0x99, 0xb4, 0xad, 0xaf, 0x5f, 0xa6,
0x90, 0x5c, 0xdb,
]),
]
.into_iter(),
16,
)
.unwrap();
assert_eq!(&expected_uuid_array, uuid_array);
}
}
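The expected `IntervalMonthDayNano` values above follow from Avro's `duration` encoding: a 12-byte `fixed` holding three little-endian `u32`s for months, days, and milliseconds, with milliseconds widened to nanoseconds on the Arrow side. The reader's actual decode path is not shown in this diff, so this is only a sketch of that mapping:

```rust
use arrow_array::types::IntervalMonthDayNanoType;
use arrow_buffer::IntervalMonthDayNano;

/// Decode a 12-byte Avro `duration` (three little-endian u32s: months,
/// days, milliseconds) into Arrow's IntervalMonthDayNano representation.
fn decode_avro_duration(buf: &[u8; 12]) -> IntervalMonthDayNano {
    let months = u32::from_le_bytes(buf[0..4].try_into().unwrap());
    let days = u32::from_le_bytes(buf[4..8].try_into().unwrap());
    let millis = u32::from_le_bytes(buf[8..12].try_into().unwrap());
    // Milliseconds are widened to nanoseconds for the Arrow interval type.
    IntervalMonthDayNanoType::make_value(months as i32, days as i32, millis as i64 * 1_000_000)
}

fn main() {
    // 1 month, 15 days, 500 ms -- matches the first expected row in the test.
    let mut buf = [0u8; 12];
    buf[0..4].copy_from_slice(&1u32.to_le_bytes());
    buf[4..8].copy_from_slice(&15u32.to_le_bytes());
    buf[8..12].copy_from_slice(&500u32.to_le_bytes());
    assert_eq!(
        decode_avro_duration(&buf),
        IntervalMonthDayNanoType::make_value(1, 15, 500_000_000)
    );
}
```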