Skip to content

Commit 66dd25a

Browse files
committed
Add canonical UUID support and clean up UUID parsing:
- Introduced the `canonical_extension_types` feature for standardized UUID handling.
- Added the `uuid` crate as a dependency for parsing and validating UUIDs.
- Updated the `field_with_name` method to support the canonical UUID representation.
- Removed the custom UUID parsing logic and replaced it with `uuid` crate functionality.
- Updated `Cargo.toml` accordingly.
1 parent ebcbea4 commit 66dd25a

File tree

3 files changed

+27
-44
lines changed

3 files changed

+27
-44
lines changed

arrow-avro/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ all-features = true
3939
default = ["deflate", "snappy", "zstd"]
4040
deflate = ["flate2"]
4141
snappy = ["snap", "crc"]
42+
canonical_extension_types = ["arrow-schema/canonical_extension_types"]
4243

4344
[dependencies]
4445
arrow-schema = { workspace = true }
@@ -50,6 +51,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend"
5051
snap = { version = "1.0", default-features = false, optional = true }
5152
zstd = { version = "0.13", default-features = false, optional = true }
5253
crc = { version = "3.0", optional = true }
54+
uuid = "1.17"
5355

5456
[dev-dependencies]
5557
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }

arrow-avro/src/codec.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,25 @@ impl AvroDataType {
6161
}
6262

6363
/// Returns an arrow [`Field`] with the given name
///
/// The field is nullable when this data type has a nullability setting, and it
/// carries a copy of this data type's metadata. When the codec is
/// [`Codec::Uuid`], the field is additionally tagged with the canonical
/// `arrow_schema::extension::Uuid` extension type.
///
/// Only compiled when the `canonical_extension_types` feature is enabled.
// NOTE(review): `Field::with_extension_type` is expected to panic if the data type
// is not supported by the extension type; this assumes `Codec::Uuid` always maps to
// a data type the UUID extension accepts. Confirm against `Codec::data_type`.
#[cfg(feature = "canonical_extension_types")]
pub fn field_with_name(&self, name: &str) -> Field {
    let nullable = self.nullability.is_some();
    let data_type = self.codec.data_type();
    // Attach the stored metadata FIRST. `Field::with_metadata` replaces the whole
    // metadata map, and the extension type is recorded as metadata entries, so
    // setting the metadata afterwards would silently drop the extension type.
    let field = Field::new(name, data_type, nullable).with_metadata(self.metadata.clone());
    match self.codec {
        Codec::Uuid => field.with_extension_type(arrow_schema::extension::Uuid),
        _ => field,
    }
}
75+
76+
/// Returns an arrow [`Field`] with the given name
77+
#[cfg(not(feature = "canonical_extension_types"))]
78+
pub fn field_with_name(&self, name: &str) -> Field {
79+
let nullable = self.nullability.is_some();
80+
let data_type = self.codec.data_type();
81+
let field = Field::new(name, data_type, nullable);
82+
field.with_metadata(self.metadata.clone())
6783
}
6884

6985
/// Returns a reference to the codec used by this data type

arrow-avro/src/reader/record.rs

Lines changed: 7 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ use std::cmp::Ordering;
3636
use std::collections::HashMap;
3737
use std::io::Read;
3838
use std::sync::Arc;
39+
use uuid::Uuid;
3940

4041
const DEFAULT_CAPACITY: usize = 1024;
4142

@@ -321,8 +322,12 @@ impl Decoder {
321322
}
322323
Self::Uuid(values) => {
323324
let s_bytes = buf.get_bytes()?;
324-
let uuid_bytes = parse_uuid_bytes(s_bytes)?;
325-
values.extend_from_slice(&uuid_bytes);
325+
let s = std::str::from_utf8(s_bytes).map_err(|e| {
326+
ArrowError::ParseError(format!("UUID bytes are not valid UTF-8: {e}"))
327+
})?;
328+
let uuid = Uuid::try_parse(s)
329+
.map_err(|e| ArrowError::ParseError(format!("Failed to parse uuid: {e}")))?;
330+
values.extend_from_slice(uuid.as_bytes());
326331
}
327332
Self::Array(_, off, encoding) => {
328333
let total_items = read_blocks(buf, |cursor| encoding.decode(cursor))?;
@@ -619,46 +624,6 @@ fn sign_extend_to<const N: usize>(raw: &[u8]) -> Result<[u8; N], ArrowError> {
619624
Ok(arr)
620625
}
621626

622-
#[inline]
623-
fn hex_char_to_u8(c: u8) -> Result<u8, ArrowError> {
624-
match c {
625-
b'0'..=b'9' => Ok(c - b'0'),
626-
b'a'..=b'f' => Ok(c - b'a' + 10),
627-
b'A'..=b'F' => Ok(c - b'A' + 10),
628-
_ => Err(ArrowError::ParseError(format!(
629-
"Invalid hex character '{c}' in UUID string",
630-
))),
631-
}
632-
}
633-
634-
#[inline]
635-
fn parse_uuid_bytes(s_bytes: &[u8]) -> Result<[u8; 16], ArrowError> {
636-
if s_bytes.len() != 36 {
637-
return Err(ArrowError::ParseError(format!(
638-
"Invalid UUID string length: expected 36, got {}",
639-
s_bytes.len()
640-
)));
641-
}
642-
let mut bytes = [0u8; 16];
643-
let mut str_idx = 0;
644-
for byte_chunk in bytes.iter_mut() {
645-
if str_idx == 8 || str_idx == 13 || str_idx == 18 || str_idx == 23 {
646-
if s_bytes[str_idx] != b'-' {
647-
return Err(ArrowError::ParseError(format!(
648-
"Invalid UUID format: expected hyphen at index {str_idx}"
649-
)));
650-
}
651-
str_idx += 1;
652-
}
653-
let high_nibble = hex_char_to_u8(s_bytes[str_idx])?;
654-
str_idx += 1;
655-
let low_nibble = hex_char_to_u8(s_bytes[str_idx])?;
656-
str_idx += 1;
657-
*byte_chunk = (high_nibble << 4) | low_nibble;
658-
}
659-
Ok(bytes)
660-
}
661-
662627
#[cfg(test)]
663628
mod tests {
664629
use super::*;

0 commit comments

Comments
 (0)