13
13
#include < Storages/ObjectStorage/StorageObjectStorageSource.h>
14
14
#include < Storages/ObjectStorage/StorageObjectStorageSettings.h>
15
15
#include < Interpreters/ExpressionActions.h>
16
+ #include < IO/CompressedReadBufferWrapper.h>
16
17
17
18
#include < Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h>
18
19
#include < Storages/ObjectStorage/DataLakes/Iceberg/Utils.h>
@@ -106,10 +107,16 @@ std::string normalizeUuid(const std::string & uuid)
106
107
}
107
108
108
109
Poco::JSON::Object::Ptr
109
- readJSON (const String & metadata_file_path, ObjectStoragePtr object_storage, const ContextPtr & local_context, LoggerPtr log)
110
+ readJSON (const String & metadata_file_path, ObjectStoragePtr object_storage, const ContextPtr & local_context, LoggerPtr log, CompressionMethod compression_method )
110
111
{
111
112
ObjectInfo object_info (metadata_file_path);
112
- auto buf = StorageObjectStorageSource::createReadBuffer (object_info, object_storage, local_context, log);
113
+ auto source_buf = StorageObjectStorageSource::createReadBuffer (object_info, object_storage, local_context, log);
114
+
115
+ std::unique_ptr<ReadBuffer> buf;
116
+ if (compression_method != CompressionMethod::None)
117
+ buf = wrapReadBufferWithCompressionMethod (std::move (source_buf), compression_method);
118
+ else
119
+ buf = std::move (source_buf);
113
120
114
121
String json_str;
115
122
readJSONObjectPossiblyInvalid (json_str, *buf);
@@ -263,7 +270,30 @@ Int32 IcebergMetadata::parseTableSchema(
263
270
}
264
271
}
265
272
266
- static std::pair<Int32, String> getMetadataFileAndVersion (const std::string & path)
273
+ struct MetadataFileWithInfo
274
+ {
275
+ Int32 version;
276
+ String path;
277
+ CompressionMethod compression_method;
278
+ };
279
+
280
+ static CompressionMethod getCompressionMethodFromMetadataFile (const String & path)
281
+ {
282
+ constexpr std::string_view metadata_suffix = " .metadata.json" ;
283
+
284
+ auto compression_method = chooseCompressionMethod (path, " auto" );
285
+
286
+ // / NOTE you will be surprised, but some metadata files store compression not in the end of the file name,
287
+ // / but somewhere in the middle of the file name, before metadata.json suffix.
288
+ // / Maybe history of Iceberg metadata files is not so long, but it is already full of surprises.
289
+ // / Example of weird engineering decisions: 00000-85befd5a-69c7-46d4-bca6-cfbd67f0f7e6.gz.metadata.json
290
+ if (compression_method == CompressionMethod::None && path.ends_with (metadata_suffix))
291
+ compression_method = chooseCompressionMethod (path.substr (0 , path.size () - metadata_suffix.size ()), " auto" );
292
+
293
+ return compression_method;
294
+ }
295
+
296
+ static MetadataFileWithInfo getMetadataFileAndVersion (const std::string & path)
267
297
{
268
298
String file_name (path.begin () + path.find_last_of (' /' ) + 1 , path.end ());
269
299
String version_str;
@@ -278,7 +308,10 @@ static std::pair<Int32, String> getMetadataFileAndVersion(const std::string & pa
278
308
throw Exception (
279
309
ErrorCodes::BAD_ARGUMENTS, " Bad metadata file name: {}. Expected vN.metadata.json where N is a number" , file_name);
280
310
281
- return std::make_pair (std::stoi (version_str), path);
311
+ return MetadataFileWithInfo{
312
+ .version = std::stoi (version_str),
313
+ .path = path,
314
+ .compression_method = getCompressionMethodFromMetadataFile (path)};
282
315
}
283
316
284
317
enum class MostRecentMetadataFileSelectionWay
@@ -289,7 +322,7 @@ enum class MostRecentMetadataFileSelectionWay
289
322
290
323
struct ShortMetadataFileInfo
291
324
{
292
- UInt32 version;
325
+ Int32 version;
293
326
UInt64 last_updated_ms;
294
327
String path;
295
328
};
@@ -301,7 +334,7 @@ struct ShortMetadataFileInfo
301
334
* 1) v<V>.metadata.json, where V - metadata version.
302
335
* 2) <V>-<random-uuid>.metadata.json, where V - metadata version
303
336
*/
304
- static std::pair<Int32, String> getLatestMetadataFileAndVersion (
337
+ static MetadataFileWithInfo getLatestMetadataFileAndVersion (
305
338
const ObjectStoragePtr & object_storage,
306
339
const StorageObjectStorage::Configuration & configuration,
307
340
const ContextPtr & local_context,
@@ -324,10 +357,10 @@ static std::pair<Int32, String> getLatestMetadataFileAndVersion(
324
357
metadata_files_with_versions.reserve (metadata_files.size ());
325
358
for (const auto & path : metadata_files)
326
359
{
327
- auto [version, metadata_file_path] = getMetadataFileAndVersion (path);
360
+ auto [version, metadata_file_path, compression_method ] = getMetadataFileAndVersion (path);
328
361
if (need_all_metadata_files_parsing)
329
362
{
330
- auto metadata_file_object = readJSON (metadata_file_path, object_storage, local_context, log);
363
+ auto metadata_file_object = readJSON (metadata_file_path, object_storage, local_context, log, compression_method );
331
364
if (table_uuid.has_value ())
332
365
{
333
366
if (metadata_file_object->has (" table-uuid" ))
@@ -377,10 +410,11 @@ static std::pair<Int32, String> getLatestMetadataFileAndVersion(
377
410
[](const ShortMetadataFileInfo & a, const ShortMetadataFileInfo & b) { return a.version < b.version ; });
378
411
}
379
412
}();
380
- return {latest_metadata_file_info.version , latest_metadata_file_info.path };
413
+
414
+ return {latest_metadata_file_info.version , latest_metadata_file_info.path , getCompressionMethodFromMetadataFile (latest_metadata_file_info.path )};
381
415
}
382
416
383
- static std::pair<Int32, String> getLatestOrExplicitMetadataFileAndVersion (
417
+ static MetadataFileWithInfo getLatestOrExplicitMetadataFileAndVersion (
384
418
const ObjectStoragePtr & object_storage,
385
419
const StorageObjectStorage::Configuration & configuration,
386
420
const ContextPtr & local_context,
@@ -425,14 +459,14 @@ bool IcebergMetadata::update(const ContextPtr & local_context)
425
459
{
426
460
auto configuration_ptr = configuration.lock ();
427
461
428
- const auto [metadata_version, metadata_file_path]
462
+ const auto [metadata_version, metadata_file_path, compression_method ]
429
463
= getLatestOrExplicitMetadataFileAndVersion (object_storage, *configuration_ptr, local_context, log.get ());
430
464
431
465
bool metadata_file_changed = false ;
432
466
if (last_metadata_version != metadata_version)
433
467
{
434
468
last_metadata_version = metadata_version;
435
- last_metadata_object = ::DB::readJSON (metadata_file_path, object_storage, local_context, log);
469
+ last_metadata_object = ::DB::readJSON (metadata_file_path, object_storage, local_context, log, compression_method );
436
470
metadata_file_changed = true ;
437
471
}
438
472
@@ -594,12 +628,18 @@ DataLakeMetadataPtr IcebergMetadata::create(
594
628
else
595
629
LOG_TRACE (log, " Not using in-memory cache for iceberg metadata files, because the setting use_iceberg_metadata_files_cache is false." );
596
630
597
- const auto [metadata_version, metadata_file_path] = getLatestOrExplicitMetadataFileAndVersion (object_storage, *configuration_ptr, local_context, log.get ());
631
+ const auto [metadata_version, metadata_file_path, compression_method ] = getLatestOrExplicitMetadataFileAndVersion (object_storage, *configuration_ptr, local_context, log.get ());
598
632
599
633
auto create_fn = [&]()
600
634
{
601
635
ObjectInfo object_info (metadata_file_path); // NOLINT
602
- auto buf = StorageObjectStorageSource::createReadBuffer (object_info, object_storage, local_context, log);
636
+ auto source_buf = StorageObjectStorageSource::createReadBuffer (object_info, object_storage, local_context, log);
637
+
638
+ std::unique_ptr<ReadBuffer> buf;
639
+ if (compression_method != CompressionMethod::None)
640
+ buf = wrapReadBufferWithCompressionMethod (std::move (source_buf), compression_method);
641
+ else
642
+ buf = std::move (source_buf);
603
643
604
644
String json_str;
605
645
readJSONObjectPossiblyInvalid (json_str, *buf);
0 commit comments