Skip to content

Commit dc9c28a

Browse files
authored
Merge pull request #681 from Altinity/backports/24.8/merge_parquet_bf_minmax_eval
24.8 Backport of ClickHouse#71383 - Merge parquet bloom filter and min/max evaluation
2 parents 64505a8 + 1a7dddd commit dc9c28a

10 files changed

+570
-633
lines changed

src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp

Lines changed: 0 additions & 525 deletions
This file was deleted.

src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h

Lines changed: 0 additions & 73 deletions
This file was deleted.
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#include <Processors/Formats/Impl/Parquet/parquetBloomFilterHash.h>
2+
3+
#if USE_PARQUET
4+
5+
#include <parquet/metadata.h>
6+
#include <parquet/xxhasher.h>
7+
8+
namespace DB
9+
{
10+
11+
bool isParquetStringTypeSupportedForBloomFilters(
12+
const std::shared_ptr<const parquet::LogicalType> & logical_type,
13+
parquet::ConvertedType::type converted_type)
14+
{
15+
if (logical_type &&
16+
!logical_type->is_none()
17+
&& !(logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON()))
18+
{
19+
return false;
20+
}
21+
22+
if (parquet::ConvertedType::type::NONE != converted_type &&
23+
!(converted_type == parquet::ConvertedType::JSON || converted_type == parquet::ConvertedType::UTF8
24+
|| converted_type == parquet::ConvertedType::BSON))
25+
{
26+
return false;
27+
}
28+
29+
return true;
30+
}
31+
32+
/// Checks whether the type annotations of an integer Parquet column allow bloom filter hashing.
///
/// The column is rejected when a logical type is present, is not "none" and is not an integer type,
/// or when the converted type is neither NONE nor one of the signed / unsigned 8, 16, 32 and 64 bit
/// integer annotations. Otherwise it is accepted.
bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
{
    const bool rejected_by_logical_type = logical_type && !logical_type->is_none() && !logical_type->is_int();
    if (rejected_by_logical_type)
        return false;

    switch (converted_type)
    {
        case parquet::ConvertedType::NONE:
        case parquet::ConvertedType::INT_8:
        case parquet::ConvertedType::INT_16:
        case parquet::ConvertedType::INT_32:
        case parquet::ConvertedType::INT_64:
        case parquet::ConvertedType::UINT_8:
        case parquet::ConvertedType::UINT_16:
        case parquet::ConvertedType::UINT_32:
        case parquet::ConvertedType::UINT_64:
            return true;
        default:
            return false;
    }
}
49+
50+
template <typename T>
51+
uint64_t hashSpecialFLBATypes(const Field & field)
52+
{
53+
const T & value = field.safeGet<T>();
54+
55+
parquet::FLBA flba(reinterpret_cast<const uint8_t*>(&value));
56+
57+
parquet::XxHasher hasher;
58+
59+
return hasher.Hash(&flba, sizeof(T));
60+
};
61+
62+
std::optional<uint64_t> tryHashStringWithoutCompatibilityCheck(const Field & field)
63+
{
64+
const auto field_type = field.getType();
65+
66+
if (field_type != Field::Types::Which::String)
67+
{
68+
return std::nullopt;
69+
}
70+
71+
parquet::XxHasher hasher;
72+
parquet::ByteArray ba { field.safeGet<std::string>() };
73+
74+
return hasher.Hash(&ba);
75+
}
76+
77+
std::optional<uint64_t> tryHashString(
78+
const Field & field,
79+
const std::shared_ptr<const parquet::LogicalType> & logical_type,
80+
parquet::ConvertedType::type converted_type)
81+
{
82+
if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type))
83+
{
84+
return std::nullopt;
85+
}
86+
87+
return tryHashStringWithoutCompatibilityCheck(field);
88+
}
89+
90+
std::optional<uint64_t> tryHashFLBA(
91+
const Field & field,
92+
const std::shared_ptr<const parquet::LogicalType> & logical_type,
93+
parquet::ConvertedType::type converted_type,
94+
std::size_t parquet_column_length)
95+
{
96+
if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type))
97+
{
98+
return std::nullopt;
99+
}
100+
101+
const auto field_type = field.getType();
102+
103+
if (field_type == Field::Types::Which::IPv6 && parquet_column_length == sizeof(IPv6))
104+
{
105+
return hashSpecialFLBATypes<IPv6>(field);
106+
}
107+
108+
return tryHashStringWithoutCompatibilityCheck(field);
109+
}
110+
111+
template <typename ParquetPhysicalType>
112+
std::optional<uint64_t> tryHashInt(const Field & field, const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
113+
{
114+
if (!isParquetIntegerTypeSupportedForBloomFilters(logical_type, converted_type))
115+
{
116+
return std::nullopt;
117+
}
118+
119+
parquet::XxHasher hasher;
120+
121+
if (field.getType() == Field::Types::Which::Int64)
122+
{
123+
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<int64_t>()));
124+
}
125+
else if (field.getType() == Field::Types::Which::UInt64)
126+
{
127+
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<uint64_t>()));
128+
}
129+
else if (field.getType() == Field::Types::IPv4)
130+
{
131+
/*
132+
* In theory, we could accept IPv4 over 64 bits variables. It would only be a problem in case it was hashed using the byte array api
133+
* with a zero-ed buffer that had a 32 bits variable copied into it.
134+
*
135+
* To be on the safe side, accept only in case physical type is 32 bits.
136+
* */
137+
if constexpr (std::is_same_v<int32_t, ParquetPhysicalType>)
138+
{
139+
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<IPv4>()));
140+
}
141+
}
142+
143+
return std::nullopt;
144+
}
145+
146+
/// Hashes a single ClickHouse field according to the Parquet column it is compared against.
///
/// The column's physical type picks the routine: INT32 and INT64 use the integer path, BYTE_ARRAY
/// the string path and FIXED_LEN_BYTE_ARRAY the fixed-length path. Any other physical type, and any
/// field the chosen routine cannot hash, yields std::nullopt.
///
/// `parquet_column_descriptor` is dereferenced unconditionally.
std::optional<uint64_t> parquetTryHashField(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
    const auto physical = parquet_column_descriptor->physical_type();
    const auto & logical = parquet_column_descriptor->logical_type();
    const auto converted = parquet_column_descriptor->converted_type();

    if (physical == parquet::Type::type::INT32)
        return tryHashInt<int32_t>(field, logical, converted);

    if (physical == parquet::Type::type::INT64)
        return tryHashInt<int64_t>(field, logical, converted);

    if (physical == parquet::Type::type::BYTE_ARRAY)
        return tryHashString(field, logical, converted);

    if (physical == parquet::Type::type::FIXED_LEN_BYTE_ARRAY)
        return tryHashFLBA(field, logical, converted, parquet_column_descriptor->type_length());

    return std::nullopt;
}
166+
167+
/// Hashes every element of a ClickHouse column for the given Parquet column.
///
/// Returns the hashes in row order, or std::nullopt as soon as one element cannot be hashed
/// by parquetTryHashField. An empty column yields an empty vector.
///
/// `data_column` and `parquet_column_descriptor` are dereferenced unconditionally.
std::optional<std::vector<uint64_t>> parquetTryHashColumn(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
    /// The row count does not change while hashing, so read it once instead of on every iteration.
    const size_t rows = data_column->size();

    /// The final size is known up front: allocate once rather than growing step by step.
    std::vector<uint64_t> hashes;
    hashes.reserve(rows);

    for (size_t i = 0; i < rows; ++i)
    {
        Field f;
        data_column->get(i, f);

        const auto hashed_value = parquetTryHashField(f, parquet_column_descriptor);

        if (!hashed_value)
            return std::nullopt;

        hashes.push_back(*hashed_value);
    }

    return hashes;
}
188+
189+
}
190+
191+
#endif
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once
2+
3+
#include <config.h>
4+
5+
#if USE_PARQUET
6+
7+
#include <Processors/Formats/Impl/ArrowFieldIndexUtil.h>
8+
9+
namespace DB
10+
{
11+
12+
/*
13+
* Try to hash a ClickHouse field; returns std::nullopt if it cannot be hashed
14+
* */
15+
/// `parquet_column_descriptor` supplies the physical, logical and converted types that decide how `field` is hashed.
std::optional<uint64_t> parquetTryHashField(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor);
16+
17+
18+
/*
19+
* Try to hash every element of a ClickHouse column; returns std::nullopt if any of them cannot be hashed
20+
* */
21+
/// Hashes are returned in row order; each element of `data_column` is hashed against `parquet_column_descriptor`.
std::optional<std::vector<uint64_t>> parquetTryHashColumn(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor);
22+
23+
}
24+
25+
#endif

0 commit comments

Comments
 (0)