From 4e8c2ee54a67162f062f0a0857c46ba32cdba9d8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 13:24:59 -0500 Subject: [PATCH 1/8] add definitions --- parquet/src/arrow/schema/primitive.rs | 4 +- parquet/src/basic.rs | 111 ++++++++++++++++++++++---- parquet/src/schema/printer.rs | 15 +++- parquet/src/schema/types.rs | 4 +- 4 files changed, 113 insertions(+), 21 deletions(-) diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 24b18b39bc18..2362f12019f4 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -276,8 +276,8 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result Ok(DataType::Utf8), (Some(LogicalType::Bson), _) => Ok(DataType::Binary), (Some(LogicalType::Enum), _) => Ok(DataType::Binary), - (Some(LogicalType::Geometry), _) => Ok(DataType::Binary), - (Some(LogicalType::Geography), _) => Ok(DataType::Binary), + (Some(LogicalType::Geometry { .. }), _) => Ok(DataType::Binary), + (Some(LogicalType::Geography { .. }), _) => Ok(DataType::Binary), (None, ConvertedType::NONE) => Ok(DataType::Binary), (None, ConvertedType::JSON) => Ok(DataType::Utf8), (None, ConvertedType::BSON) => Ok(DataType::Binary), diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 6353b5f4ee63..2d4589dc8428 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -28,8 +28,8 @@ use crate::errors::{ParquetError, Result}; // Re-export crate::format types used in this module pub use crate::format::{ - BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, NullType, - StringType, TimeType, TimeUnit, TimestampType, UUIDType, + BsonType, DateType, DecimalType, EnumType, GeographyType, GeometryType, IntType, JsonType, + ListType, MapType, NullType, StringType, TimeType, TimeUnit, TimestampType, UUIDType, }; // ---------------------------------------------------------------------- @@ -231,9 +231,18 @@ pub enum LogicalType { /// A Variant value. Variant, /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation. - Geometry, + Geometry { + /// A custom CRS. If unset, it defaults to "OGC:CRS84", which means that the geometries + /// must be stored in longitude, latitude based on the WGS84 datum. + crs: Option, + }, /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation. - Geography, + Geography { + /// A custom CRS. If unset, the CRS defaults to "OGC:CRS84". + crs: Option, + /// Edge interpolation method. + algorithm: Option, + }, } // ---------------------------------------------------------------------- @@ -345,6 +354,31 @@ pub enum Encoding { BYTE_STREAM_SPLIT, } +// ---------------------------------------------------------------------- +// Mirrors `parquet::EdgeInterpolationAlgorithm` + +/// Edge interpolation algorithm for Geography logical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum EdgeInterpolationAlgorithm { + /// Edges are interpolated as geodesics on a sphere. + SPHERICAL, + + /// + VINCENTY, + + /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970 + THOMAS, + + /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. + ANDOYER, + + /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55 + KARNEY, + + /// An unknown/unrecognized algorithm + UNKNOWN(i32), +} + impl FromStr for Encoding { type Err = ParquetError; @@ -584,9 +618,9 @@ impl ColumnOrder { LogicalType::Unknown => SortOrder::UNDEFINED, LogicalType::Uuid => SortOrder::UNSIGNED, LogicalType::Float16 => SortOrder::SIGNED, - LogicalType::Variant | LogicalType::Geometry | LogicalType::Geography => { - SortOrder::UNDEFINED - } + LogicalType::Variant + | LogicalType::Geometry { .. } + | LogicalType::Geography { .. } => SortOrder::UNDEFINED, }, // Fall back to converted type None => Self::get_converted_sort_order(converted_type, physical_type), @@ -850,8 +884,28 @@ impl From for LogicalType { parquet::LogicalType::UUID(_) => LogicalType::Uuid, parquet::LogicalType::FLOAT16(_) => LogicalType::Float16, parquet::LogicalType::VARIANT(_) => LogicalType::Variant, - parquet::LogicalType::GEOMETRY(_) => LogicalType::Geometry, - parquet::LogicalType::GEOGRAPHY(_) => LogicalType::Geography, + parquet::LogicalType::GEOMETRY(t) => LogicalType::Geometry { crs: t.crs }, + parquet::LogicalType::GEOGRAPHY(t) => LogicalType::Geography { + crs: t.crs, + algorithm: t.algorithm.map(|algorithm| match algorithm { + parquet::EdgeInterpolationAlgorithm::SPHERICAL => { + EdgeInterpolationAlgorithm::ANDOYER + } + parquet::EdgeInterpolationAlgorithm::VINCENTY => { + EdgeInterpolationAlgorithm::VINCENTY + } + parquet::EdgeInterpolationAlgorithm::ANDOYER => { + EdgeInterpolationAlgorithm::ANDOYER + } + parquet::EdgeInterpolationAlgorithm::THOMAS => { + EdgeInterpolationAlgorithm::THOMAS + } + parquet::EdgeInterpolationAlgorithm::KARNEY => { + EdgeInterpolationAlgorithm::KARNEY + } + _ => EdgeInterpolationAlgorithm::UNKNOWN(algorithm.0), + }), + }, } } } @@ -894,8 +948,32 @@ impl From for parquet::LogicalType { LogicalType::Uuid => parquet::LogicalType::UUID(Default::default()), LogicalType::Float16 => parquet::LogicalType::FLOAT16(Default::default()), LogicalType::Variant => parquet::LogicalType::VARIANT(Default::default()), - LogicalType::Geometry => parquet::LogicalType::GEOMETRY(Default::default()), - LogicalType::Geography => parquet::LogicalType::GEOGRAPHY(Default::default()), + LogicalType::Geometry { crs } => parquet::LogicalType::GEOMETRY(GeometryType { crs }), + LogicalType::Geography { crs, algorithm } => { + parquet::LogicalType::GEOGRAPHY(GeographyType { + crs, + algorithm: algorithm.map(|algorithm| match algorithm { + EdgeInterpolationAlgorithm::SPHERICAL => { + parquet::EdgeInterpolationAlgorithm::SPHERICAL + } + EdgeInterpolationAlgorithm::VINCENTY => { + parquet::EdgeInterpolationAlgorithm::VINCENTY + } + EdgeInterpolationAlgorithm::THOMAS => { + parquet::EdgeInterpolationAlgorithm::THOMAS + } + EdgeInterpolationAlgorithm::ANDOYER => { + parquet::EdgeInterpolationAlgorithm::ANDOYER + } + EdgeInterpolationAlgorithm::KARNEY => { + parquet::EdgeInterpolationAlgorithm::KARNEY + } + EdgeInterpolationAlgorithm::UNKNOWN(code) => { + parquet::EdgeInterpolationAlgorithm(code) + } + }), + }) + } } } } @@ -950,8 +1028,8 @@ impl From> for ConvertedType { LogicalType::Uuid | LogicalType::Float16 | LogicalType::Variant - | LogicalType::Geometry - | LogicalType::Geography + | LogicalType::Geometry { .. } + | LogicalType::Geography { .. } | LogicalType::Unknown => ConvertedType::NONE, }, None => ConvertedType::NONE, @@ -1201,8 +1279,11 @@ impl str::FromStr for LogicalType { "Interval parquet logical type not yet supported" )), "FLOAT16" => Ok(LogicalType::Float16), - "GEOMETRY" => Ok(LogicalType::Geometry), - "GEOGRAPHY" => Ok(LogicalType::Geography), + "GEOMETRY" => Ok(LogicalType::Geometry { crs: None }), + "GEOGRAPHY" => Ok(LogicalType::Geography { + crs: None, + algorithm: None, + }), other => Err(general_err!("Invalid parquet logical type {}", other)), } } diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 5ef068da915b..c83c68184d57 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -327,8 +327,19 @@ fn print_logical_and_converted( LogicalType::Map => "MAP".to_string(), LogicalType::Float16 => "FLOAT16".to_string(), LogicalType::Variant => "VARIANT".to_string(), - LogicalType::Geometry => "GEOMETRY".to_string(), - LogicalType::Geography => "GEOGRAPHY".to_string(), + LogicalType::Geometry { crs } => { + if let Some(crs) = crs { + format!("GEOMETRY({crs})") + } else { + "GEOMETRY".to_string() + } + } + LogicalType::Geography { crs, algorithm } => match (crs, algorithm) { + (None, None) => "GEOGRAPHY".to_string(), + (None, Some(algorithm)) => format!("GEOGRAPHY(, {algorithm:?})"), + (Some(crs), None) => format!("GEOGRAPHY({crs})"), + (Some(crs), Some(algorithm)) => format!("GEOGRAPHY({crs}, {algorithm:?})"), + }, LogicalType::Unknown => "UNKNOWN".to_string(), }, None => { diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 2f6131571ecc..3b8d1328ac00 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -401,8 +401,8 @@ impl<'a> PrimitiveTypeBuilder<'a> { (LogicalType::String, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {} - (LogicalType::Geometry, PhysicalType::BYTE_ARRAY) => {} - (LogicalType::Geography, PhysicalType::BYTE_ARRAY) => {} + (LogicalType::Geometry { .. }, PhysicalType::BYTE_ARRAY) => {} + (LogicalType::Geography { .. }, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {} (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => { return Err(general_err!( From 40e6871f29b7c4c7d135e2b7b4300e4c1e3c2058 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 13:56:59 -0500 Subject: [PATCH 2/8] printing tests --- parquet/src/basic.rs | 83 ++++++++++++++++++++--------- parquet/src/schema/printer.rs | 98 +++++++++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 30 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 2d4589dc8428..584cd98be3e6 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -354,31 +354,6 @@ pub enum Encoding { BYTE_STREAM_SPLIT, } -// ---------------------------------------------------------------------- -// Mirrors `parquet::EdgeInterpolationAlgorithm` - -/// Edge interpolation algorithm for Geography logical type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum EdgeInterpolationAlgorithm { - /// Edges are interpolated as geodesics on a sphere. - SPHERICAL, - - /// - VINCENTY, - - /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970 - THOMAS, - - /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. - ANDOYER, - - /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55 - KARNEY, - - /// An unknown/unrecognized algorithm - UNKNOWN(i32), -} - impl FromStr for Encoding { type Err = ParquetError; @@ -694,6 +669,31 @@ impl ColumnOrder { } } +// ---------------------------------------------------------------------- +// Mirrors `parquet::EdgeInterpolationAlgorithm` + +/// Edge interpolation algorithm for Geography logical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum EdgeInterpolationAlgorithm { + /// Edges are interpolated as geodesics on a sphere. + SPHERICAL, + + /// + VINCENTY, + + /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970 + THOMAS, + + /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. + ANDOYER, + + /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55 + KARNEY, + + /// An unknown/unrecognized algorithm + UNKNOWN(i32), +} + impl fmt::Display for Type { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{self:?}") @@ -742,6 +742,12 @@ impl fmt::Display for ColumnOrder { } } +impl fmt::Display for EdgeInterpolationAlgorithm { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{self:?}") + } +} + // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion @@ -1933,6 +1939,17 @@ mod tests { ConvertedType::from(Some(LogicalType::Float16)), ConvertedType::NONE ); + assert_eq!( + ConvertedType::from(Some(LogicalType::Geometry { crs: None })), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::Geography { + crs: None, + algorithm: None + })), + ConvertedType::NONE + ); assert_eq!( ConvertedType::from(Some(LogicalType::Unknown)), ConvertedType::NONE @@ -2509,4 +2526,20 @@ mod tests { "Parquet error: unknown encoding: gzip(-10)" ); } + + #[test] + fn test_display_edge_algorithm() { + assert_eq!( + EdgeInterpolationAlgorithm::SPHERICAL.to_string(), + "SPHERICAL" + ); + assert_eq!(EdgeInterpolationAlgorithm::VINCENTY.to_string(), "VINCENTY"); + assert_eq!(EdgeInterpolationAlgorithm::THOMAS.to_string(), "THOMAS"); + assert_eq!(EdgeInterpolationAlgorithm::ANDOYER.to_string(), "ANDOYER"); + assert_eq!(EdgeInterpolationAlgorithm::KARNEY.to_string(), "KARNEY"); + assert_eq!( + EdgeInterpolationAlgorithm::UNKNOWN(99).to_string(), + "UNKNOWN(99)" + ); + } } diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index c83c68184d57..36395b86003a 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -329,16 +329,18 @@ fn print_logical_and_converted( LogicalType::Variant => "VARIANT".to_string(), LogicalType::Geometry { crs } => { if let Some(crs) = crs { - format!("GEOMETRY({crs})") + // TODO: replace control characters quotes and backslashes + // with unicode escapes for CRS values + format!(r#"GEOMETRY("{crs}")"#) } else { "GEOMETRY".to_string() } } LogicalType::Geography { crs, algorithm } => match (crs, algorithm) { (None, None) => "GEOGRAPHY".to_string(), - (None, Some(algorithm)) => format!("GEOGRAPHY(, {algorithm:?})"), - (Some(crs), None) => format!("GEOGRAPHY({crs})"), - (Some(crs), Some(algorithm)) => format!("GEOGRAPHY({crs}, {algorithm:?})"), + (Some(crs), None) => format!(r#"GEOGRAPHY("{crs}")"#), + (None, Some(algorithm)) => format!("GEOGRAPHY(None, {algorithm})"), + (Some(crs), Some(algorithm)) => format!(r#"GEOGRAPHY("{crs}", {algorithm})"#), }, LogicalType::Unknown => "UNKNOWN".to_string(), }, @@ -460,7 +462,7 @@ mod tests { use std::sync::Arc; - use crate::basic::{Repetition, Type as PhysicalType}; + use crate::basic::{EdgeInterpolationAlgorithm, Repetition, Type as PhysicalType}; use crate::errors::Result; use crate::schema::parser::parse_message_type; @@ -790,6 +792,92 @@ mod tests { .unwrap(), "REQUIRED BYTE_ARRAY field [42] (STRING);", ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geometry { crs: None }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOMETRY);", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geometry { + crs: Some("non-missing CRS".to_string()), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + r#"REQUIRED BYTE_ARRAY field (GEOMETRY("non-missing CRS"));"#, + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: None, + algorithm: None, + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOGRAPHY);", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: Some("non-missing CRS".to_string()), + algorithm: None, + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY("non-missing CRS"));"#, + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::KARNEY), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY(None, KARNEY));"#, + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: Some("non-missing CRS".to_string()), + algorithm: Some(EdgeInterpolationAlgorithm::KARNEY), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY("non-missing CRS", KARNEY));"#, + ), ]; types_and_strings.into_iter().for_each(|(field, expected)| { From 8990f61d39264147038d796764a15c62322d44ff Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 14:26:33 -0500 Subject: [PATCH 3/8] make edge interpolation non optional --- parquet/src/basic.rs | 69 +++++++++++++++++++---------------- parquet/src/schema/printer.rs | 53 +++++++-------------------- 2 files changed, 51 insertions(+), 71 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 584cd98be3e6..94054fae3704 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -241,7 +241,7 @@ pub enum LogicalType { /// A custom CRS. If unset, the CRS defaults to "OGC:CRS84". crs: Option, /// Edge interpolation method. - algorithm: Option, + algorithm: EdgeInterpolationAlgorithm, }, } @@ -694,6 +694,12 @@ pub enum EdgeInterpolationAlgorithm { UNKNOWN(i32), } +impl Default for EdgeInterpolationAlgorithm { + fn default() -> Self { + Self::SPHERICAL + } +} + impl fmt::Display for Type { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{self:?}") @@ -893,24 +899,27 @@ impl From for LogicalType { parquet::LogicalType::GEOMETRY(t) => LogicalType::Geometry { crs: t.crs }, parquet::LogicalType::GEOGRAPHY(t) => LogicalType::Geography { crs: t.crs, - algorithm: t.algorithm.map(|algorithm| match algorithm { - parquet::EdgeInterpolationAlgorithm::SPHERICAL => { - EdgeInterpolationAlgorithm::ANDOYER - } - parquet::EdgeInterpolationAlgorithm::VINCENTY => { - EdgeInterpolationAlgorithm::VINCENTY - } - parquet::EdgeInterpolationAlgorithm::ANDOYER => { - EdgeInterpolationAlgorithm::ANDOYER - } - parquet::EdgeInterpolationAlgorithm::THOMAS => { - EdgeInterpolationAlgorithm::THOMAS - } - parquet::EdgeInterpolationAlgorithm::KARNEY => { - EdgeInterpolationAlgorithm::KARNEY - } - _ => EdgeInterpolationAlgorithm::UNKNOWN(algorithm.0), - }), + algorithm: t + .algorithm + .map(|algorithm| match algorithm { + parquet::EdgeInterpolationAlgorithm::SPHERICAL => { + EdgeInterpolationAlgorithm::SPHERICAL + } + parquet::EdgeInterpolationAlgorithm::VINCENTY => { + EdgeInterpolationAlgorithm::VINCENTY + } + parquet::EdgeInterpolationAlgorithm::ANDOYER => { + EdgeInterpolationAlgorithm::ANDOYER + } + parquet::EdgeInterpolationAlgorithm::THOMAS => { + EdgeInterpolationAlgorithm::THOMAS + } + parquet::EdgeInterpolationAlgorithm::KARNEY => { + EdgeInterpolationAlgorithm::KARNEY + } + _ => EdgeInterpolationAlgorithm::UNKNOWN(algorithm.0), + }) + .unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL), }, } } @@ -958,26 +967,24 @@ impl From for parquet::LogicalType { LogicalType::Geography { crs, algorithm } => { parquet::LogicalType::GEOGRAPHY(GeographyType { crs, - algorithm: algorithm.map(|algorithm| match algorithm { - EdgeInterpolationAlgorithm::SPHERICAL => { - parquet::EdgeInterpolationAlgorithm::SPHERICAL - } + algorithm: match algorithm { + EdgeInterpolationAlgorithm::SPHERICAL => None, EdgeInterpolationAlgorithm::VINCENTY => { - parquet::EdgeInterpolationAlgorithm::VINCENTY + Some(parquet::EdgeInterpolationAlgorithm::VINCENTY) } EdgeInterpolationAlgorithm::THOMAS => { - parquet::EdgeInterpolationAlgorithm::THOMAS + Some(parquet::EdgeInterpolationAlgorithm::THOMAS) } EdgeInterpolationAlgorithm::ANDOYER => { - parquet::EdgeInterpolationAlgorithm::ANDOYER + Some(parquet::EdgeInterpolationAlgorithm::ANDOYER) } EdgeInterpolationAlgorithm::KARNEY => { - parquet::EdgeInterpolationAlgorithm::KARNEY + Some(parquet::EdgeInterpolationAlgorithm::KARNEY) } EdgeInterpolationAlgorithm::UNKNOWN(code) => { - parquet::EdgeInterpolationAlgorithm(code) + Some(parquet::EdgeInterpolationAlgorithm(code)) } - }), + }, }) } } @@ -1288,7 +1295,7 @@ impl str::FromStr for LogicalType { "GEOMETRY" => Ok(LogicalType::Geometry { crs: None }), "GEOGRAPHY" => Ok(LogicalType::Geography { crs: None, - algorithm: None, + algorithm: EdgeInterpolationAlgorithm::SPHERICAL, }), other => Err(general_err!("Invalid parquet logical type {}", other)), } @@ -1946,7 +1953,7 @@ mod tests { assert_eq!( ConvertedType::from(Some(LogicalType::Geography { crs: None, - algorithm: None + algorithm: EdgeInterpolationAlgorithm::default() })), ConvertedType::NONE ); diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 36395b86003a..772abd44aefd 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -336,12 +336,15 @@ fn print_logical_and_converted( "GEOMETRY".to_string() } } - LogicalType::Geography { crs, algorithm } => match (crs, algorithm) { - (None, None) => "GEOGRAPHY".to_string(), - (Some(crs), None) => format!(r#"GEOGRAPHY("{crs}")"#), - (None, Some(algorithm)) => format!("GEOGRAPHY(None, {algorithm})"), - (Some(crs), Some(algorithm)) => format!(r#"GEOGRAPHY("{crs}", {algorithm})"#), - }, + LogicalType::Geography { crs, algorithm } => { + if let Some(crs) = crs { + // TODO: replace control characters quotes and backslashes + // with unicode escapes for CRS values + format!(r#"GEOGRAPHY({algorithm}, "{crs}")"#) + } else { + format!(r#"GEOGRAPHY({algorithm})"#) + } + } LogicalType::Unknown => "UNKNOWN".to_string(), }, None => { @@ -825,43 +828,13 @@ mod tests { PhysicalType::BYTE_ARRAY, Some(LogicalType::Geography { crs: None, - algorithm: None, - }), - ConvertedType::NONE, - Repetition::REQUIRED, - ) - .unwrap(), - "REQUIRED BYTE_ARRAY field (GEOGRAPHY);", - ), - ( - build_primitive_type( - "field", - None, - PhysicalType::BYTE_ARRAY, - Some(LogicalType::Geography { - crs: Some("non-missing CRS".to_string()), - algorithm: None, - }), - ConvertedType::NONE, - Repetition::REQUIRED, - ) - .unwrap(), - r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY("non-missing CRS"));"#, - ), - ( - build_primitive_type( - "field", - None, - PhysicalType::BYTE_ARRAY, - Some(LogicalType::Geography { - crs: None, - algorithm: Some(EdgeInterpolationAlgorithm::KARNEY), + algorithm: EdgeInterpolationAlgorithm::default(), }), ConvertedType::NONE, Repetition::REQUIRED, ) .unwrap(), - r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY(None, KARNEY));"#, + "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL));", ), ( build_primitive_type( @@ -870,13 +843,13 @@ mod tests { PhysicalType::BYTE_ARRAY, Some(LogicalType::Geography { crs: Some("non-missing CRS".to_string()), - algorithm: Some(EdgeInterpolationAlgorithm::KARNEY), + algorithm: EdgeInterpolationAlgorithm::default(), }), ConvertedType::NONE, Repetition::REQUIRED, ) .unwrap(), - r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY("non-missing CRS", KARNEY));"#, + r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, "non-missing CRS"));"#, ), ]; From ff35ece8ec151f71e418c404c1d4f3e07a839bce Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 14:59:38 -0500 Subject: [PATCH 4/8] move statistics test --- parquet/src/geospatial/statistics.rs | 46 +++++---------------- parquet/src/lib.rs | 3 +- parquet/tests/geospatial.rs | 60 ++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 37 deletions(-) create mode 100644 parquet/tests/geospatial.rs diff --git a/parquet/src/geospatial/statistics.rs b/parquet/src/geospatial/statistics.rs index 2a39c494bd0f..7eddae55610c 100644 --- a/parquet/src/geospatial/statistics.rs +++ b/parquet/src/geospatial/statistics.rs @@ -44,9 +44,7 @@ use crate::geospatial::bounding_box::BoundingBox; /// ``` #[derive(Clone, Debug, PartialEq, Default)] pub struct GeospatialStatistics { - /// Optional bounding defining the spatial extent, where None represents a lack of information. bbox: Option, - /// Optional list of geometry type identifiers, where None represents lack of information geospatial_types: Option>, } @@ -58,6 +56,16 @@ impl GeospatialStatistics { geospatial_types, } } + + /// Optional list of geometry type identifiers, where None represents lack of information + pub fn geospatial_types(&self) -> Option<&Vec> { + self.geospatial_types.as_ref() + } + + /// Optional bounding defining the spatial extent, where None represents a lack of information. + pub fn bounding_box(&self) -> Option<&BoundingBox> { + self.bbox.as_ref() + } } /// Converts a Thrift-generated geospatial statistics object to the internal representation. @@ -127,38 +135,4 @@ mod tests { assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0))); assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0))); } - - #[test] - fn test_read_geospatial_statistics_from_file() { - use crate::file::reader::{FileReader, SerializedFileReader}; - use std::fs::File; - - let path = format!( - "{}/geospatial/geospatial.parquet", - arrow::util::test_util::parquet_test_data(), - ); - let file = File::open(path).unwrap(); - let reader = SerializedFileReader::try_from(file).unwrap(); - let metadata = reader.metadata(); - - // geospatial.parquet schema: - // optional binary field_id=-1 group (String); - // optional binary field_id=-1 wkt (String); - // optional binary field_id=-1 geometry (Geometry(crs=)); - let geo_statistics = metadata.row_group(0).column(2).geo_statistics(); - assert!(geo_statistics.is_some()); - - let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0) - .with_zrange(30.0, 80.0) - .with_mrange(200.0, 1600.0); - let expected_geospatial_types = vec![ - 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007, - ]; - assert_eq!( - geo_statistics.unwrap().geospatial_types, - Some(expected_geospatial_types) - ); - assert_eq!(geo_statistics.unwrap().bbox, Some(expected_bbox)); - } } diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 19a69bce900b..16f07a0ad24e 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -206,4 +206,5 @@ pub enum DecodeResult { #[cfg(feature = "variant_experimental")] pub mod variant; -experimental!(pub mod geospatial); + +pub mod geospatial; diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs new file mode 100644 index 000000000000..11fc379f9fed --- /dev/null +++ b/parquet/tests/geospatial.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for Geometry and Geography logical types + +use parquet::{ + basic::LogicalType, + file::reader::{FileReader, SerializedFileReader}, + geospatial::bounding_box::BoundingBox, +}; +use std::fs::File; + +#[test] +fn test_read_geospatial_statistics_from_file() { + let path = format!( + "{}/geospatial/geospatial.parquet", + arrow::util::test_util::parquet_test_data(), + ); + let file = File::open(path).unwrap(); + let reader = SerializedFileReader::try_from(file).unwrap(); + let metadata = reader.metadata(); + + // geospatial.parquet schema: + // optional binary field_id=-1 group (String); + // optional binary field_id=-1 wkt (String); + // optional binary field_id=-1 geometry (Geometry(crs=)); + let fields = metadata.file_metadata().schema().get_fields(); + let logical_type = fields[2].get_basic_info().logical_type().unwrap(); + assert_eq!(logical_type, LogicalType::Geometry { crs: None }); + + let geo_statistics = metadata.row_group(0).column(2).geo_statistics(); + assert!(geo_statistics.is_some()); + + let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0) + .with_zrange(30.0, 80.0) + .with_mrange(200.0, 1600.0); + let expected_geospatial_types = vec![ + 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004, + 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007, + ]; + assert_eq!( + geo_statistics.unwrap().geospatial_types(), + Some(&expected_geospatial_types) + ); + assert_eq!(geo_statistics.unwrap().bounding_box(), Some(&expected_bbox)); +} From 16f5332db4703a2391597e595556ba3e88c0edcf Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 16:12:50 -0500 Subject: [PATCH 5/8] add logical type tests --- parquet/tests/geospatial.rs | 73 ++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs index 11fc379f9fed..ebaea5ece912 100644 --- a/parquet/tests/geospatial.rs +++ b/parquet/tests/geospatial.rs @@ -18,21 +18,82 @@ //! Tests for Geometry and Geography logical types use parquet::{ - basic::LogicalType, - file::reader::{FileReader, SerializedFileReader}, + basic::{EdgeInterpolationAlgorithm, LogicalType}, + file::{ + metadata::ParquetMetaData, + reader::{FileReader, SerializedFileReader}, + }, geospatial::bounding_box::BoundingBox, }; +use serde_json::Value; use std::fs::File; -#[test] -fn test_read_geospatial_statistics_from_file() { +fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData { let path = format!( - "{}/geospatial/geospatial.parquet", + "{}/geospatial/{geospatial_test_file}", arrow::util::test_util::parquet_test_data(), ); let file = File::open(path).unwrap(); let reader = SerializedFileReader::try_from(file).unwrap(); - let metadata = reader.metadata(); + reader.metadata().clone() +} + +#[test] +fn test_read_logical_type() { + let expected_logical_type = [ + ("crs-default.parquet", LogicalType::Geometry { crs: None }), + ( + "crs-srid.parquet", + LogicalType::Geometry { + crs: Some("srid:5070".to_string()), + }, + ), + ( + "crs-projjson.parquet", + LogicalType::Geometry { + crs: Some("projjson:projjson_epsg_5070".to_string()), + }, + ), + ( + "crs-geography.parquet", + LogicalType::Geography { + crs: None, + algorithm: EdgeInterpolationAlgorithm::SPHERICAL, + }, + ), + ]; + + for (geospatial_file, expected_type) in expected_logical_type { + let metadata = read_metadata(geospatial_file); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + assert_eq!(logical_type, expected_type); + } + + let metadata = read_metadata("crs-arbitrary-value.parquet"); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + if let LogicalType::Geometry { crs } = logical_type { + let crs_parsed: Value = serde_json::from_str(&crs.unwrap()).unwrap(); + assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070); + } else { + panic!("Expected geometry type but got {logical_type:?}"); + } +} + +#[test] +fn test_read_geospatial_statistics() { + let metadata = read_metadata("geospatial.parquet"); // geospatial.parquet schema: // optional binary field_id=-1 group (String); From 747e459d191ffb1e1127eaff69ab2709fc308d0d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 16:21:49 -0500 Subject: [PATCH 6/8] more --- parquet/src/basic.rs | 10 +++++++++- parquet/src/schema/printer.rs | 14 +++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 94054fae3704..fa183ebd58dd 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -2344,7 +2344,15 @@ mod tests { check_sort_order(signed, SortOrder::SIGNED); // Undefined comparison - let undefined = vec![LogicalType::List, LogicalType::Map]; + let undefined = vec![ + LogicalType::List, + LogicalType::Map, + LogicalType::Geometry { crs: None }, + LogicalType::Geography { + crs: None, + algorithm: EdgeInterpolationAlgorithm::default(), + }, + ]; check_sort_order(undefined, SortOrder::UNDEFINED); } diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 772abd44aefd..ee8e2a54de4e 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -329,20 +329,16 @@ fn print_logical_and_converted( LogicalType::Variant => "VARIANT".to_string(), LogicalType::Geometry { crs } => { if let Some(crs) = crs { - // TODO: replace control characters quotes and backslashes - // with unicode escapes for CRS values - format!(r#"GEOMETRY("{crs}")"#) + format!("GEOMETRY({crs})") } else { "GEOMETRY".to_string() } } LogicalType::Geography { crs, algorithm } => { if let Some(crs) = crs { - // TODO: replace control characters quotes and backslashes - // with unicode escapes for CRS values - format!(r#"GEOGRAPHY({algorithm}, "{crs}")"#) + format!("GEOGRAPHY({algorithm}, {crs})") } else { - format!(r#"GEOGRAPHY({algorithm})"#) + format!("GEOGRAPHY({algorithm})") } } LogicalType::Unknown => "UNKNOWN".to_string(), @@ -819,7 +815,7 @@ mod tests { Repetition::REQUIRED, ) .unwrap(), - r#"REQUIRED BYTE_ARRAY field (GEOMETRY("non-missing CRS"));"#, + "REQUIRED BYTE_ARRAY field (GEOMETRY(non-missing CRS));", ), ( build_primitive_type( @@ -849,7 +845,7 @@ mod tests { Repetition::REQUIRED, ) .unwrap(), - r#"REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, "non-missing CRS"));"#, + "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, non-missing CRS));", ), ]; From fd9f31428b300da2c60b2c444296c4901b5430fc Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 16:36:54 -0500 Subject: [PATCH 7/8] undo lib.rs change --- parquet/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 16f07a0ad24e..19a69bce900b 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -206,5 +206,4 @@ pub enum DecodeResult { #[cfg(feature = "variant_experimental")] pub mod variant; - -pub mod geospatial; +experimental!(pub mod geospatial); From f7e69fb013fe6dc05bbc7d464e9459933d45e8e5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Sep 2025 16:39:23 -0500 Subject: [PATCH 8/8] document tests --- parquet/tests/geospatial.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs index ebaea5ece912..733aa8864f3b 100644 --- a/parquet/tests/geospatial.rs +++ b/parquet/tests/geospatial.rs @@ -40,6 +40,7 @@ fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData { #[test] fn test_read_logical_type() { + // Some crs values are short strings let expected_logical_type = [ ("crs-default.parquet", LogicalType::Geometry { crs: None }), ( @@ -75,6 +76,8 @@ fn test_read_logical_type() { assert_eq!(logical_type, expected_type); } + // The crs value may also contain arbitrary values (in this case some JSON + // a bit too lengthy to type out) let metadata = read_metadata("crs-arbitrary-value.parquet"); let logical_type = metadata .file_metadata()