Skip to content

Commit ecb24ce

Browse files
authored
Merge branch 'customizations/24.8.14' into backports/24.8/71539_enable_merge_filters_optimization
2 parents bd4d269 + 64505a8 commit ecb24ce

File tree

12 files changed

+381
-217
lines changed

12 files changed

+381
-217
lines changed

.github/create_combined_ci_report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def get_checks_known_fails(client: Client, job_url: str, known_fails: dict):
5454
len(df.columns) - 1,
5555
"reason",
5656
df["test_name"]
57-
.cat.remove_unused_categories()
57+
.astype(str)
5858
.apply(
5959
lambda test_name: known_fails[test_name].get("reason", "No reason given")
6060
),

.github/workflows/release_branches.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ jobs:
539539
secrets: inherit
540540
with:
541541
runner_type: altinity-on-demand, altinity-type-cpx51, altinity-image-x86-app-docker-ce, altinity-setup-regression
542-
commit: 53d73ed32155a8a17ee0d0cdb15aee96c98010a2
542+
commit: 11dcb1ad771e6afebcb06bcc7bf1c1d8b184d838
543543
arch: release
544544
build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
545545
timeout_minutes: 300
@@ -550,7 +550,7 @@ jobs:
550550
secrets: inherit
551551
with:
552552
runner_type: altinity-on-demand, altinity-type-cax41, altinity-image-arm-app-docker-ce, altinity-setup-regression
553-
commit: 53d73ed32155a8a17ee0d0cdb15aee96c98010a2
553+
commit: 11dcb1ad771e6afebcb06bcc7bf1c1d8b184d838
554554
arch: aarch64
555555
build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
556556
timeout_minutes: 300

src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,22 @@ class ParquetDataBuffer
4848
consume(bytes);
4949
}
5050

51+
template <typename TValue, typename ParquetType>
52+
void ALWAYS_INLINE readValuesOfDifferentSize(TValue * dst, size_t count)
53+
{
54+
auto necessary_bytes = count * sizeof(ParquetType);
55+
checkAvaible(necessary_bytes);
56+
57+
const ParquetType* src = reinterpret_cast<const ParquetType*>(data);
58+
59+
for (std::size_t i = 0; i < count; i++)
60+
{
61+
dst[i] = static_cast<TValue>(src[i]);
62+
}
63+
64+
consume(necessary_bytes);
65+
}
66+
5167
void ALWAYS_INLINE readDateTime64FromInt96(DateTime64 & dst)
5268
{
5369
static const int max_scale_num = 9;

src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,8 @@ TValue * getResizedPrimitiveData(TColumn & column, size_t size)
240240
} // anonymous namespace
241241

242242

243-
template <>
244-
void ParquetPlainValuesReader<ColumnString>::readBatch(
243+
template <typename TColumn>
244+
void ParquetPlainByteArrayValuesReader<TColumn>::readBatch(
245245
MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values)
246246
{
247247
auto & column = *assert_cast<ColumnString *>(col_ptr.get());
@@ -322,8 +322,8 @@ void ParquetBitPlainReader<TColumn>::readBatch(
322322
}
323323

324324

325-
template <>
326-
void ParquetPlainValuesReader<ColumnDecimal<DateTime64>, ParquetReaderTypes::TimestampInt96>::readBatch(
325+
template <typename TColumn>
326+
void ParquetPlainInt96ValuesReader<TColumn>::readBatch(
327327
MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values)
328328
{
329329
auto cursor = col_ptr->size();
@@ -350,8 +350,8 @@ void ParquetPlainValuesReader<ColumnDecimal<DateTime64>, ParquetReaderTypes::Tim
350350
);
351351
}
352352

353-
template <typename TColumn, ParquetReaderTypes reader_type>
354-
void ParquetPlainValuesReader<TColumn, reader_type>::readBatch(
353+
template <typename TColumn, typename ParquetType>
354+
void ParquetPlainValuesReader<TColumn, ParquetType>::readBatch(
355355
MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values)
356356
{
357357
auto cursor = col_ptr->size();
@@ -365,11 +365,11 @@ void ParquetPlainValuesReader<TColumn, reader_type>::readBatch(
365365
null_map,
366366
/* individual_visitor */ [&](size_t nest_cursor)
367367
{
368-
plain_data_buffer.readValue(column_data[nest_cursor]);
368+
plain_data_buffer.readValuesOfDifferentSize<TValue, ParquetType>(column_data + nest_cursor, 1);
369369
},
370370
/* repeated_visitor */ [&](size_t nest_cursor, UInt32 count)
371371
{
372-
plain_data_buffer.readBytes(column_data + nest_cursor, count * sizeof(TValue));
372+
plain_data_buffer.readValuesOfDifferentSize<TValue, ParquetType>(column_data + nest_cursor, count);
373373
}
374374
);
375375
}
@@ -576,17 +576,19 @@ void ParquetRleDictReader<TColumnVector>::readBatch(
576576
}
577577

578578

579-
template class ParquetPlainValuesReader<ColumnInt32>;
580-
template class ParquetPlainValuesReader<ColumnUInt32>;
581-
template class ParquetPlainValuesReader<ColumnInt64>;
582-
template class ParquetPlainValuesReader<ColumnUInt64>;
583-
template class ParquetPlainValuesReader<ColumnFloat32>;
584-
template class ParquetPlainValuesReader<ColumnFloat64>;
585-
template class ParquetPlainValuesReader<ColumnDecimal<Decimal32>>;
586-
template class ParquetPlainValuesReader<ColumnDecimal<Decimal64>>;
587-
template class ParquetPlainValuesReader<ColumnDecimal<DateTime64>>;
588-
template class ParquetPlainValuesReader<ColumnString>;
589-
template class ParquetPlainValuesReader<ColumnUInt8>;
579+
template class ParquetPlainValuesReader<ColumnUInt8, int32_t>;
580+
template class ParquetPlainValuesReader<ColumnInt8, int32_t>;
581+
template class ParquetPlainValuesReader<ColumnUInt16, int32_t>;
582+
template class ParquetPlainValuesReader<ColumnInt16, int32_t>;
583+
template class ParquetPlainValuesReader<ColumnUInt32, int32_t>;
584+
template class ParquetPlainValuesReader<ColumnInt32, int32_t>;
585+
template class ParquetPlainValuesReader<ColumnUInt64, int64_t>;
586+
template class ParquetPlainValuesReader<ColumnInt64, int64_t>;
587+
template class ParquetPlainValuesReader<ColumnFloat32, float>;
588+
template class ParquetPlainValuesReader<ColumnFloat64, double>;
589+
template class ParquetPlainValuesReader<ColumnDecimal<Decimal32>, int32_t>;
590+
template class ParquetPlainValuesReader<ColumnDecimal<Decimal64>, int64_t>;
591+
template class ParquetPlainValuesReader<ColumnDecimal<DateTime64>, int64_t>;
590592

591593
template class ParquetBitPlainReader<ColumnUInt8>;
592594

@@ -597,7 +599,6 @@ template class ParquetRleLCReader<ColumnUInt8>;
597599
template class ParquetRleLCReader<ColumnUInt16>;
598600
template class ParquetRleLCReader<ColumnUInt32>;
599601

600-
template class ParquetRleDictReader<ColumnUInt8>;
601602
template class ParquetRleDictReader<ColumnInt32>;
602603
template class ParquetRleDictReader<ColumnUInt32>;
603604
template class ParquetRleDictReader<ColumnInt64>;
@@ -611,4 +612,8 @@ template class ParquetRleDictReader<ColumnDecimal<Decimal256>>;
611612
template class ParquetRleDictReader<ColumnDecimal<DateTime64>>;
612613
template class ParquetRleDictReader<ColumnString>;
613614

615+
template class ParquetPlainByteArrayValuesReader<ColumnString>;
616+
617+
template class ParquetPlainInt96ValuesReader<ColumnDecimal<DateTime64>>;
618+
614619
}

src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ enum class ParquetReaderTypes
150150
/**
151151
* The definition level is RLE or BitPacked encoding, while data is read directly
152152
*/
153-
template <typename TColumn, ParquetReaderTypes reader_type = ParquetReaderTypes::Normal>
153+
template <typename TColumn, typename ParquetType>
154154
class ParquetPlainValuesReader : public ParquetDataValuesReader
155155
{
156156
public:
@@ -172,6 +172,50 @@ class ParquetPlainValuesReader : public ParquetDataValuesReader
172172
ParquetDataBuffer plain_data_buffer;
173173
};
174174

175+
template <typename TColumn>
176+
class ParquetPlainInt96ValuesReader : public ParquetDataValuesReader
177+
{
178+
public:
179+
180+
ParquetPlainInt96ValuesReader(
181+
Int32 max_def_level_,
182+
std::unique_ptr<RleValuesReader> def_level_reader_,
183+
ParquetDataBuffer data_buffer_)
184+
: max_def_level(max_def_level_)
185+
, def_level_reader(std::move(def_level_reader_))
186+
, plain_data_buffer(std::move(data_buffer_))
187+
{}
188+
189+
void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override;
190+
191+
private:
192+
Int32 max_def_level;
193+
std::unique_ptr<RleValuesReader> def_level_reader;
194+
ParquetDataBuffer plain_data_buffer;
195+
};
196+
197+
template <typename TColumn>
198+
class ParquetPlainByteArrayValuesReader : public ParquetDataValuesReader
199+
{
200+
public:
201+
202+
ParquetPlainByteArrayValuesReader(
203+
Int32 max_def_level_,
204+
std::unique_ptr<RleValuesReader> def_level_reader_,
205+
ParquetDataBuffer data_buffer_)
206+
: max_def_level(max_def_level_)
207+
, def_level_reader(std::move(def_level_reader_))
208+
, plain_data_buffer(std::move(data_buffer_))
209+
{}
210+
211+
void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override;
212+
213+
private:
214+
Int32 max_def_level;
215+
std::unique_ptr<RleValuesReader> def_level_reader;
216+
ParquetDataBuffer plain_data_buffer;
217+
};
218+
175219
template <typename TColumn>
176220
class ParquetBitPlainReader : public ParquetDataValuesReader
177221
{

0 commit comments

Comments
 (0)