Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions parquet/src/arrow/schema/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,8 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result<D
(Some(LogicalType::Json), _) => Ok(DataType::Utf8),
(Some(LogicalType::Bson), _) => Ok(DataType::Binary),
(Some(LogicalType::Enum), _) => Ok(DataType::Binary),
(Some(LogicalType::Geometry), _) => Ok(DataType::Binary),
(Some(LogicalType::Geography), _) => Ok(DataType::Binary),
(Some(LogicalType::Geometry { .. }), _) => Ok(DataType::Binary),
(Some(LogicalType::Geography { .. }), _) => Ok(DataType::Binary),
(None, ConvertedType::NONE) => Ok(DataType::Binary),
(None, ConvertedType::JSON) => Ok(DataType::Utf8),
(None, ConvertedType::BSON) => Ok(DataType::Binary),
Expand Down
161 changes: 145 additions & 16 deletions parquet/src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ use crate::errors::{ParquetError, Result};

// Re-export crate::format types used in this module
pub use crate::format::{
BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, NullType,
StringType, TimeType, TimeUnit, TimestampType, UUIDType,
BsonType, DateType, DecimalType, EnumType, GeographyType, GeometryType, IntType, JsonType,
ListType, MapType, NullType, StringType, TimeType, TimeUnit, TimestampType, UUIDType,
};

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -231,9 +231,18 @@ pub enum LogicalType {
/// A Variant value.
Variant,
/// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation.
Geometry,
Geometry {
/// A custom CRS. If unset, it defaults to "OGC:CRS84", which means that the geometries
/// must be stored in longitude, latitude based on the WGS84 datum.
crs: Option<String>,
},
/// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation.
Geography,
Geography {
/// A custom CRS. If unset, the CRS defaults to "OGC:CRS84".
crs: Option<String>,
/// Edge interpolation method.
algorithm: EdgeInterpolationAlgorithm,
},
}

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -584,9 +593,9 @@ impl ColumnOrder {
LogicalType::Unknown => SortOrder::UNDEFINED,
LogicalType::Uuid => SortOrder::UNSIGNED,
LogicalType::Float16 => SortOrder::SIGNED,
LogicalType::Variant | LogicalType::Geometry | LogicalType::Geography => {
SortOrder::UNDEFINED
}
LogicalType::Variant
| LogicalType::Geometry { .. }
| LogicalType::Geography { .. } => SortOrder::UNDEFINED,
},
// Fall back to converted type
None => Self::get_converted_sort_order(converted_type, physical_type),
Expand Down Expand Up @@ -660,6 +669,37 @@ impl ColumnOrder {
}
}

// ----------------------------------------------------------------------
// Mirrors `parquet::EdgeInterpolationAlgorithm`

/// Edge interpolation algorithm for Geography logical type
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum EdgeInterpolationAlgorithm {
/// Edges are interpolated as geodesics on a sphere.
SPHERICAL,
Comment on lines +672 to +679
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that some enums are copied out here (e.g., Encoding) and some are just references to the Thrift enum (e.g., TimeUnit). I don't mind which of these we use. I did this version because it makes it harder for a user who is not geospatially aware to do the wrong thing (i.e., it applies the "default" of SPHERICAL, whose value would otherwise be squirrelled away in the Parquet specification where probably nobody will ever look for it).


/// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae>
VINCENTY,

/// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
THOMAS,

/// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
ANDOYER,

/// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
KARNEY,

/// An unknown/unrecognized algorithm
UNKNOWN(i32),
}

impl Default for EdgeInterpolationAlgorithm {
fn default() -> Self {
Self::SPHERICAL
}
}

impl fmt::Display for Type {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{self:?}")
Expand Down Expand Up @@ -708,6 +748,12 @@ impl fmt::Display for ColumnOrder {
}
}

/// Displays the algorithm using its `Debug` representation (e.g. `SPHERICAL`, `UNKNOWN(99)`).
impl fmt::Display for EdgeInterpolationAlgorithm {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Forward the derived `Debug` text unchanged to the output.
        f.write_fmt(format_args!("{:?}", self))
    }
}

// ----------------------------------------------------------------------
// parquet::Type <=> Type conversion

Expand Down Expand Up @@ -850,8 +896,31 @@ impl From<parquet::LogicalType> for LogicalType {
parquet::LogicalType::UUID(_) => LogicalType::Uuid,
parquet::LogicalType::FLOAT16(_) => LogicalType::Float16,
parquet::LogicalType::VARIANT(_) => LogicalType::Variant,
parquet::LogicalType::GEOMETRY(_) => LogicalType::Geometry,
parquet::LogicalType::GEOGRAPHY(_) => LogicalType::Geography,
parquet::LogicalType::GEOMETRY(t) => LogicalType::Geometry { crs: t.crs },
parquet::LogicalType::GEOGRAPHY(t) => LogicalType::Geography {
crs: t.crs,
algorithm: t
.algorithm
.map(|algorithm| match algorithm {
parquet::EdgeInterpolationAlgorithm::SPHERICAL => {
EdgeInterpolationAlgorithm::SPHERICAL
}
parquet::EdgeInterpolationAlgorithm::VINCENTY => {
EdgeInterpolationAlgorithm::VINCENTY
}
parquet::EdgeInterpolationAlgorithm::ANDOYER => {
EdgeInterpolationAlgorithm::ANDOYER
}
parquet::EdgeInterpolationAlgorithm::THOMAS => {
EdgeInterpolationAlgorithm::THOMAS
}
parquet::EdgeInterpolationAlgorithm::KARNEY => {
EdgeInterpolationAlgorithm::KARNEY
}
_ => EdgeInterpolationAlgorithm::UNKNOWN(algorithm.0),
})
.unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL),
},
}
}
}
Expand Down Expand Up @@ -894,8 +963,30 @@ impl From<LogicalType> for parquet::LogicalType {
LogicalType::Uuid => parquet::LogicalType::UUID(Default::default()),
LogicalType::Float16 => parquet::LogicalType::FLOAT16(Default::default()),
LogicalType::Variant => parquet::LogicalType::VARIANT(Default::default()),
LogicalType::Geometry => parquet::LogicalType::GEOMETRY(Default::default()),
LogicalType::Geography => parquet::LogicalType::GEOGRAPHY(Default::default()),
LogicalType::Geometry { crs } => parquet::LogicalType::GEOMETRY(GeometryType { crs }),
LogicalType::Geography { crs, algorithm } => {
parquet::LogicalType::GEOGRAPHY(GeographyType {
crs,
algorithm: match algorithm {
EdgeInterpolationAlgorithm::SPHERICAL => None,
EdgeInterpolationAlgorithm::VINCENTY => {
Some(parquet::EdgeInterpolationAlgorithm::VINCENTY)
}
EdgeInterpolationAlgorithm::THOMAS => {
Some(parquet::EdgeInterpolationAlgorithm::THOMAS)
}
EdgeInterpolationAlgorithm::ANDOYER => {
Some(parquet::EdgeInterpolationAlgorithm::ANDOYER)
}
EdgeInterpolationAlgorithm::KARNEY => {
Some(parquet::EdgeInterpolationAlgorithm::KARNEY)
}
EdgeInterpolationAlgorithm::UNKNOWN(code) => {
Some(parquet::EdgeInterpolationAlgorithm(code))
}
},
})
}
}
}
}
Expand Down Expand Up @@ -950,8 +1041,8 @@ impl From<Option<LogicalType>> for ConvertedType {
LogicalType::Uuid
| LogicalType::Float16
| LogicalType::Variant
| LogicalType::Geometry
| LogicalType::Geography
| LogicalType::Geometry { .. }
| LogicalType::Geography { .. }
| LogicalType::Unknown => ConvertedType::NONE,
},
None => ConvertedType::NONE,
Expand Down Expand Up @@ -1201,8 +1292,11 @@ impl str::FromStr for LogicalType {
"Interval parquet logical type not yet supported"
)),
"FLOAT16" => Ok(LogicalType::Float16),
"GEOMETRY" => Ok(LogicalType::Geometry),
"GEOGRAPHY" => Ok(LogicalType::Geography),
"GEOMETRY" => Ok(LogicalType::Geometry { crs: None }),
"GEOGRAPHY" => Ok(LogicalType::Geography {
crs: None,
algorithm: EdgeInterpolationAlgorithm::SPHERICAL,
}),
other => Err(general_err!("Invalid parquet logical type {}", other)),
}
}
Expand Down Expand Up @@ -1852,6 +1946,17 @@ mod tests {
ConvertedType::from(Some(LogicalType::Float16)),
ConvertedType::NONE
);
assert_eq!(
ConvertedType::from(Some(LogicalType::Geometry { crs: None })),
ConvertedType::NONE
);
assert_eq!(
ConvertedType::from(Some(LogicalType::Geography {
crs: None,
algorithm: EdgeInterpolationAlgorithm::default()
})),
ConvertedType::NONE
);
assert_eq!(
ConvertedType::from(Some(LogicalType::Unknown)),
ConvertedType::NONE
Expand Down Expand Up @@ -2239,7 +2344,15 @@ mod tests {
check_sort_order(signed, SortOrder::SIGNED);

// Undefined comparison
let undefined = vec![LogicalType::List, LogicalType::Map];
let undefined = vec![
LogicalType::List,
LogicalType::Map,
LogicalType::Geometry { crs: None },
LogicalType::Geography {
crs: None,
algorithm: EdgeInterpolationAlgorithm::default(),
},
];
check_sort_order(undefined, SortOrder::UNDEFINED);
}

Expand Down Expand Up @@ -2428,4 +2541,20 @@ mod tests {
"Parquet error: unknown encoding: gzip(-10)"
);
}

#[test]
fn test_display_edge_algorithm() {
    // Each variant paired with the text its `Display` impl is expected to produce.
    let cases = [
        (EdgeInterpolationAlgorithm::SPHERICAL, "SPHERICAL"),
        (EdgeInterpolationAlgorithm::VINCENTY, "VINCENTY"),
        (EdgeInterpolationAlgorithm::THOMAS, "THOMAS"),
        (EdgeInterpolationAlgorithm::ANDOYER, "ANDOYER"),
        (EdgeInterpolationAlgorithm::KARNEY, "KARNEY"),
        (EdgeInterpolationAlgorithm::UNKNOWN(99), "UNKNOWN(99)"),
    ];

    for (algorithm, expected) in cases {
        assert_eq!(algorithm.to_string(), expected);
    }
}
}
46 changes: 10 additions & 36 deletions parquet/src/geospatial/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,7 @@ use crate::geospatial::bounding_box::BoundingBox;
/// ```
#[derive(Clone, Debug, PartialEq, Default)]
pub struct GeospatialStatistics {
/// Optional bounding box defining the spatial extent, where None represents a lack of information.
bbox: Option<BoundingBox>,
/// Optional list of geometry type identifiers, where None represents lack of information
geospatial_types: Option<Vec<i32>>,
}

Expand All @@ -58,6 +56,16 @@ impl GeospatialStatistics {
geospatial_types,
}
}

/// Returns the list of geometry type identifiers held by these statistics, or `None` when
/// that information is not available.
pub fn geospatial_types(&self) -> Option<&Vec<i32>> {
    let types = &self.geospatial_types;
    types.as_ref()
}

/// Returns the bounding box defining the spatial extent, or `None` when that information
/// is not available.
pub fn bounding_box(&self) -> Option<&BoundingBox> {
    let bbox = &self.bbox;
    bbox.as_ref()
}
}

/// Converts a Thrift-generated geospatial statistics object to the internal representation.
Expand Down Expand Up @@ -127,38 +135,4 @@ mod tests {
assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0)));
assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0)));
}

#[test]
fn test_read_geospatial_statistics_from_file() {
    use crate::file::reader::{FileReader, SerializedFileReader};
    use std::fs::File;

    // Values the geometry column of the fixture file is expected to report.
    let expected_geospatial_types = vec![
        1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004,
        2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007,
    ];
    let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0)
        .with_zrange(30.0, 80.0)
        .with_mrange(200.0, 1600.0);

    let test_data_dir = arrow::util::test_util::parquet_test_data();
    let path = format!("{}/geospatial/geospatial.parquet", test_data_dir);
    let reader = SerializedFileReader::try_from(File::open(path).unwrap()).unwrap();
    let metadata = reader.metadata();

    // geospatial.parquet schema:
    // optional binary field_id=-1 group (String);
    // optional binary field_id=-1 wkt (String);
    // optional binary field_id=-1 geometry (Geometry(crs=));
    let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
    assert!(geo_statistics.is_some());
    let stats = geo_statistics.unwrap();

    assert_eq!(stats.geospatial_types, Some(expected_geospatial_types));
    assert_eq!(stats.bbox, Some(expected_bbox));
}
}
Loading
Loading