Skip to content

Commit 8c06abe

Browse files
Merge pull request ClickHouse#25902 from Avogar/arrow-nested
Refactor ArrowColumnToCHColumn, support inserting Nested as Array(Struct) in Arrow/ORC/Parquet
2 parents fbd04a5 + 188c737 commit 8c06abe

30 files changed

+396
-350
lines changed

src/Columns/ColumnLowCardinality.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ class ColumnLowCardinality final : public COWHelper<IColumn, ColumnLowCardinalit
194194
const IColumnUnique & getDictionary() const { return dictionary.getColumnUnique(); }
195195
IColumnUnique & getDictionary() { return dictionary.getColumnUnique(); }
196196
const ColumnPtr & getDictionaryPtr() const { return dictionary.getColumnUniquePtr(); }
197+
ColumnPtr & getDictionaryPtr() { return dictionary.getColumnUniquePtr(); }
197198
/// IColumnUnique & getUnique() { return static_cast<IColumnUnique &>(*column_unique); }
198199
/// ColumnPtr getUniquePtr() const { return column_unique; }
199200

src/Core/Settings.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,9 @@ class IColumn;
528528
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
529529
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
530530
M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \
531+
M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \
532+
M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \
533+
M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \
531534
\
532535
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
533536
M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \

src/DataTypes/NestedUtils.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,18 @@ void validateArraySizes(const Block & block)
208208
}
209209
}
210210

211+
/// Collect the distinct nested table names found among the column names of `block`.
/// Every column name is passed to Nested::extractTableName; a name for which it returns
/// an empty string is skipped (presumably such a column does not belong to a nested
/// table - confirm against extractTableName). The result is a set, so each table name
/// appears once no matter how many columns share it.
std::unordered_set<String> getAllTableNames(const Block & block)
212+
{
213+
std::unordered_set<String> nested_table_names;
214+
for (auto & name : block.getNames())
215+
{
216+
auto nested_table_name = Nested::extractTableName(name);
217+
if (!nested_table_name.empty())
218+
nested_table_names.insert(nested_table_name);
219+
}
220+
return nested_table_names;
221+
}
222+
211223
}
212224

213225
}

src/DataTypes/NestedUtils.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ namespace Nested
2828

2929
/// Check that sizes of arrays - elements of nested data structures - are equal.
3030
void validateArraySizes(const Block & block);
31+
32+
/// Get the names of all nested tables in a block.
33+
std::unordered_set<String> getAllTableNames(const Block & block);
3134
}
3235

3336
}

src/Formats/FormatFactory.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
8888
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
8989
format_settings.null_as_default = settings.input_format_null_as_default;
9090
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
91+
format_settings.parquet.import_nested = settings.input_format_parquet_import_nested;
9192
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
9293
format_settings.pretty.color = settings.output_format_pretty_color;
9394
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
@@ -114,6 +115,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
114115
format_settings.with_names_use_header = settings.input_format_with_names_use_header;
115116
format_settings.write_statistics = settings.output_format_write_statistics;
116117
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
118+
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
119+
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
117120

118121
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
119122
if (format_settings.schema.is_server)

src/Formats/FormatSettings.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ struct FormatSettings
5353
{
5454
UInt64 row_group_size = 1000000;
5555
bool low_cardinality_as_dictionary = false;
56+
bool import_nested = false;
5657
} arrow;
5758

5859
struct
@@ -100,6 +101,7 @@ struct FormatSettings
100101
/// Options for the Parquet format.
struct
101102
{
102103
/// Filled from the output_format_parquet_row_group_size setting.
UInt64 row_group_size = 1000000;
104+
/// Filled from the input_format_parquet_import_nested setting:
/// allow inserting an array of structs into a Nested table.
bool import_nested = false;
103105
} parquet;
104106

105107
struct Pretty
@@ -174,6 +176,11 @@ struct FormatSettings
174176
bool deduce_templates_of_expressions = true;
175177
bool accurate_types_of_literals = true;
176178
} values;
179+
180+
/// Options for the ORC format.
struct
181+
{
182+
/// Filled from the input_format_orc_import_nested setting:
/// allow inserting an array of structs into a Nested table.
bool import_nested = false;
183+
} orc;
177184
};
178185

179186
}

src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ namespace ErrorCodes
2222
extern const int CANNOT_READ_ALL_DATA;
2323
}
2424

25-
ArrowBlockInputFormat::ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_)
26-
: IInputFormat(header_, in_), stream{stream_}
25+
ArrowBlockInputFormat::ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_, const FormatSettings & format_settings_)
26+
: IInputFormat(header_, in_), stream{stream_}, format_settings(format_settings_)
2727
{
2828
}
2929

@@ -102,7 +102,7 @@ void ArrowBlockInputFormat::prepareReader()
102102
schema = file_reader->schema();
103103
}
104104

105-
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), std::move(schema), "Arrow");
105+
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Arrow", format_settings.arrow.import_nested);
106106

107107
if (stream)
108108
record_batch_total = -1;
@@ -119,19 +119,19 @@ void registerInputFormatProcessorArrow(FormatFactory & factory)
119119
[](ReadBuffer & buf,
120120
const Block & sample,
121121
const RowInputFormatParams & /* params */,
122-
const FormatSettings & /* format_settings */)
122+
const FormatSettings & format_settings)
123123
{
124-
return std::make_shared<ArrowBlockInputFormat>(buf, sample, false);
124+
return std::make_shared<ArrowBlockInputFormat>(buf, sample, false, format_settings);
125125
});
126126
factory.markFormatAsColumnOriented("Arrow");
127127
factory.registerInputFormatProcessor(
128128
"ArrowStream",
129129
[](ReadBuffer & buf,
130130
const Block & sample,
131131
const RowInputFormatParams & /* params */,
132-
const FormatSettings & /* format_settings */)
132+
const FormatSettings & format_settings)
133133
{
134-
return std::make_shared<ArrowBlockInputFormat>(buf, sample, true);
134+
return std::make_shared<ArrowBlockInputFormat>(buf, sample, true, format_settings);
135135
});
136136
}
137137

src/Processors/Formats/Impl/ArrowBlockInputFormat.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#if USE_ARROW
77

88
#include <Processors/Formats/IInputFormat.h>
9+
#include <Formats/FormatSettings.h>
910

1011
namespace arrow { class RecordBatchReader; }
1112
namespace arrow::ipc { class RecordBatchFileReader; }
@@ -19,7 +20,7 @@ class ArrowColumnToCHColumn;
1920
class ArrowBlockInputFormat : public IInputFormat
2021
{
2122
public:
22-
ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_);
23+
ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_, const FormatSettings & format_settings_);
2324

2425
void resetParser() override;
2526

@@ -41,6 +42,8 @@ class ArrowBlockInputFormat : public IInputFormat
4142
int record_batch_total = 0;
4243
int record_batch_current = 0;
4344

45+
const FormatSettings format_settings;
46+
4447
void prepareReader();
4548
};
4649

0 commit comments

Comments
 (0)