@@ -51,6 +51,7 @@ use datafusion_execution::parquet_encryption::EncryptionFactory;
 use futures::{ready, Stream, StreamExt, TryStreamExt};
 use itertools::Itertools;
 use log::debug;
+use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics;
 use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
 use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
@@ -105,6 +106,9 @@ pub(super) struct ParquetOpener {
     #[cfg(feature = "parquet_encryption")]
     pub encryption_factory:
         Option<(Arc<dyn EncryptionFactory>, EncryptionFactoryOptions)>,
+    /// Maximum size of the predicate cache, in bytes. If none, uses
+    /// the arrow-rs default.
+    pub max_predicate_cache_size: Option<usize>,
 }
 
 impl FileOpener for ParquetOpener {
@@ -152,6 +156,7 @@ impl FileOpener for ParquetOpener {
 
         let enable_page_index = self.enable_page_index;
         let encryption_context = self.get_encryption_context();
+        let max_predicate_cache_size = self.max_predicate_cache_size;
 
         Ok(Box::pin(async move {
             let file_decryption_properties = encryption_context
@@ -401,21 +406,42 @@ impl FileOpener for ParquetOpener {
                 builder = builder.with_limit(limit)
             }
 
+            if let Some(max_predicate_cache_size) = max_predicate_cache_size {
+                builder = builder.with_max_predicate_cache_size(max_predicate_cache_size);
+            }
+
+            // metrics from the arrow reader itself
+            let arrow_reader_metrics = ArrowReaderMetrics::enabled();
+
             let stream = builder
                 .with_projection(mask)
                 .with_batch_size(batch_size)
                 .with_row_groups(row_group_indexes)
+                .with_metrics(arrow_reader_metrics.clone())
                 .build()?;
 
-            let stream = stream
-                .map_err(DataFusionError::from)
-                .map(move |b| b.and_then(|b| schema_mapping.map_batch(b)));
+            let files_ranges_pruned_statistics =
+                file_metrics.files_ranges_pruned_statistics.clone();
+            let predicate_cache_inner_records =
+                file_metrics.predicate_cache_inner_records.clone();
+            let predicate_cache_records = file_metrics.predicate_cache_records.clone();
+
+            let stream = stream.map_err(DataFusionError::from).map(move |b| {
+                b.and_then(|b| {
+                    copy_arrow_reader_metrics(
+                        &arrow_reader_metrics,
+                        &predicate_cache_inner_records,
+                        &predicate_cache_records,
+                    );
+                    schema_mapping.map_batch(b)
+                })
+            });
 
             if let Some(file_pruner) = file_pruner {
                 Ok(EarlyStoppingStream::new(
                     stream,
                     file_pruner,
-                    file_metrics.files_ranges_pruned_statistics.clone(),
+                    files_ranges_pruned_statistics,
                 )
                 .boxed())
             } else {
@@ -425,6 +451,22 @@ impl FileOpener for ParquetOpener {
     }
 }
 
+/// Copies metrics from ArrowReaderMetrics (the metrics collected by the
+/// arrow-rs parquet reader) to the parquet file metrics for DataFusion
+fn copy_arrow_reader_metrics(
+    arrow_reader_metrics: &ArrowReaderMetrics,
+    predicate_cache_inner_records: &Count,
+    predicate_cache_records: &Count,
+) {
+    if let Some(v) = arrow_reader_metrics.records_read_from_inner() {
+        predicate_cache_inner_records.add(v);
+    }
+
+    if let Some(v) = arrow_reader_metrics.records_read_from_cache() {
+        predicate_cache_records.add(v);
+    }
+}
+
 /// Wraps an inner RecordBatchStream and a [`FilePruner`]
 ///
 /// This can terminate the scan early when some dynamic filters is updated after
@@ -823,6 +865,7 @@ mod test {
                 expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)),
                 #[cfg(feature = "parquet_encryption")]
                 encryption_factory: None,
+                max_predicate_cache_size: None,
             }
         };
 
@@ -911,6 +954,7 @@ mod test {
                 expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)),
                 #[cfg(feature = "parquet_encryption")]
                 encryption_factory: None,
+                max_predicate_cache_size: None,
             }
         };
 
@@ -1015,6 +1059,7 @@ mod test {
                 expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)),
                 #[cfg(feature = "parquet_encryption")]
                 encryption_factory: None,
+                max_predicate_cache_size: None,
             }
         };
         let make_meta = || FileMeta {
@@ -1129,6 +1174,7 @@ mod test {
                 expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)),
                 #[cfg(feature = "parquet_encryption")]
                 encryption_factory: None,
+                max_predicate_cache_size: None,
             }
         };
 
@@ -1244,6 +1290,7 @@ mod test {
                 expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)),
                 #[cfg(feature = "parquet_encryption")]
                 encryption_factory: None,
+                max_predicate_cache_size: None,
             }
         };
 
@@ -1426,6 +1473,7 @@ mod test {
             expr_adapter_factory: None,
             #[cfg(feature = "parquet_encryption")]
             encryption_factory: None,
+            max_predicate_cache_size: None,
         };
 
         let predicate = logical2physical(&col("a").eq(lit(1u64)), &table_schema);
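For context, here is a minimal standalone sketch (not part of the diff above) of how the two arrow-rs hooks this patch wires into ParquetOpener fit together: with_max_predicate_cache_size bounds the reader's predicate cache, and ArrowReaderMetrics::enabled() exposes the counters that copy_arrow_reader_metrics copies into DataFusion's file metrics. It uses only the builder methods shown in the diff plus the standard async parquet reader; the file name, batch size, cache cap, and the tokio runtime are placeholder assumptions.

// Sketch only: assumes the `parquet` crate with its async (tokio) feature
// and a local file named "data.parquet".
use futures::TryStreamExt;
use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics;
use parquet::arrow::ParquetRecordBatchStreamBuilder;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = tokio::fs::File::open("data.parquet").await?;

    // Ask the arrow-rs reader to collect metrics, as the opener does above.
    let metrics = ArrowReaderMetrics::enabled();

    let mut stream = ParquetRecordBatchStreamBuilder::new(file)
        .await?
        .with_batch_size(8192)
        // Cap the predicate cache at 1 MiB; when the new DataFusion option is
        // None this call is skipped and the arrow-rs default applies.
        .with_max_predicate_cache_size(1024 * 1024)
        .with_metrics(metrics.clone())
        .build()?;

    while let Some(batch) = stream.try_next().await? {
        println!("read {} rows", batch.num_rows());
    }

    // These counters are Some only when the reader populated them; the cache
    // is exercised when a row filter (predicate pushdown) is attached.
    println!(
        "records from inner reader: {:?}, from cache: {:?}",
        metrics.records_read_from_inner(),
        metrics.records_read_from_cache(),
    );
    Ok(())
}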