mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
Support inserting nested as Array of structs, add some refactoring
This commit is contained in:
parent
7fdf3cc263
commit
e4c5d7e3b1
@ -193,6 +193,7 @@ public:
|
||||
const IColumnUnique & getDictionary() const { return dictionary.getColumnUnique(); }
|
||||
IColumnUnique & getDictionary() { return dictionary.getColumnUnique(); }
|
||||
const ColumnPtr & getDictionaryPtr() const { return dictionary.getColumnUniquePtr(); }
|
||||
ColumnPtr & getDictionaryPtr() { return dictionary.getColumnUniquePtr(); }
|
||||
/// IColumnUnique & getUnique() { return static_cast<IColumnUnique &>(*column_unique); }
|
||||
/// ColumnPtr getUniquePtr() const { return column_unique; }
|
||||
|
||||
|
@ -522,6 +522,9 @@ class IColumn;
|
||||
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
|
||||
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
|
||||
M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \
|
||||
M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \
|
||||
M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \
|
||||
M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \
|
||||
\
|
||||
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
|
||||
M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
|
||||
|
@ -208,6 +208,18 @@ void validateArraySizes(const Block & block)
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_set<String> getAllTableNames(const Block & block)
|
||||
{
|
||||
std::unordered_set<String> nested_table_names;
|
||||
for (auto & name : block.getNames())
|
||||
{
|
||||
auto nested_table_name = Nested::extractTableName(name);
|
||||
if (!nested_table_name.empty())
|
||||
nested_table_names.insert(nested_table_name);
|
||||
}
|
||||
return nested_table_names;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -28,6 +28,9 @@ namespace Nested
|
||||
|
||||
/// Check that sizes of arrays - elements of nested data structures - are equal.
|
||||
void validateArraySizes(const Block & block);
|
||||
|
||||
/// Get the names of all nested tables in a block.
|
||||
std::unordered_set<String> getAllTableNames(const Block & block);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -88,6 +88,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
|
||||
format_settings.null_as_default = settings.input_format_null_as_default;
|
||||
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
|
||||
format_settings.parquet.import_nested = settings.input_format_parquet_import_nested;
|
||||
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
||||
format_settings.pretty.color = settings.output_format_pretty_color;
|
||||
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
||||
@ -114,6 +115,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.with_names_use_header = settings.input_format_with_names_use_header;
|
||||
format_settings.write_statistics = settings.output_format_write_statistics;
|
||||
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
|
||||
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
|
||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||
|
||||
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
|
||||
if (format_settings.schema.is_server)
|
||||
|
@ -53,6 +53,7 @@ struct FormatSettings
|
||||
{
|
||||
UInt64 row_group_size = 1000000;
|
||||
bool low_cardinality_as_dictionary = false;
|
||||
bool import_nested = false;
|
||||
} arrow;
|
||||
|
||||
struct
|
||||
@ -100,6 +101,7 @@ struct FormatSettings
|
||||
struct
|
||||
{
|
||||
UInt64 row_group_size = 1000000;
|
||||
bool import_nested = false;
|
||||
} parquet;
|
||||
|
||||
struct Pretty
|
||||
@ -174,6 +176,11 @@ struct FormatSettings
|
||||
bool deduce_templates_of_expressions = true;
|
||||
bool accurate_types_of_literals = true;
|
||||
} values;
|
||||
|
||||
struct
|
||||
{
|
||||
bool import_nested = false;
|
||||
} orc;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -22,8 +22,8 @@ namespace ErrorCodes
|
||||
extern const int CANNOT_READ_ALL_DATA;
|
||||
}
|
||||
|
||||
ArrowBlockInputFormat::ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_)
|
||||
: IInputFormat(header_, in_), stream{stream_}
|
||||
ArrowBlockInputFormat::ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_, const FormatSettings & format_settings_)
|
||||
: IInputFormat(header_, in_), stream{stream_}, format_settings(format_settings_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -102,7 +102,7 @@ void ArrowBlockInputFormat::prepareReader()
|
||||
schema = file_reader->schema();
|
||||
}
|
||||
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), std::move(schema), "Arrow");
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Arrow", format_settings.arrow.import_nested);
|
||||
|
||||
if (stream)
|
||||
record_batch_total = -1;
|
||||
@ -119,9 +119,9 @@ void registerInputFormatProcessorArrow(FormatFactory & factory)
|
||||
[](ReadBuffer & buf,
|
||||
const Block & sample,
|
||||
const RowInputFormatParams & /* params */,
|
||||
const FormatSettings & /* format_settings */)
|
||||
const FormatSettings & format_settings)
|
||||
{
|
||||
return std::make_shared<ArrowBlockInputFormat>(buf, sample, false);
|
||||
return std::make_shared<ArrowBlockInputFormat>(buf, sample, false, format_settings);
|
||||
});
|
||||
factory.markFormatAsColumnOriented("Arrow");
|
||||
factory.registerInputFormatProcessor(
|
||||
@ -129,9 +129,9 @@ void registerInputFormatProcessorArrow(FormatFactory & factory)
|
||||
[](ReadBuffer & buf,
|
||||
const Block & sample,
|
||||
const RowInputFormatParams & /* params */,
|
||||
const FormatSettings & /* format_settings */)
|
||||
const FormatSettings & format_settings)
|
||||
{
|
||||
return std::make_shared<ArrowBlockInputFormat>(buf, sample, true);
|
||||
return std::make_shared<ArrowBlockInputFormat>(buf, sample, true, format_settings);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -6,6 +6,7 @@
|
||||
#if USE_ARROW
|
||||
|
||||
#include <Processors/Formats/IInputFormat.h>
|
||||
#include <Formats/FormatSettings.h>
|
||||
|
||||
namespace arrow { class RecordBatchReader; }
|
||||
namespace arrow::ipc { class RecordBatchFileReader; }
|
||||
@ -19,7 +20,7 @@ class ArrowColumnToCHColumn;
|
||||
class ArrowBlockInputFormat : public IInputFormat
|
||||
{
|
||||
public:
|
||||
ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_);
|
||||
ArrowBlockInputFormat(ReadBuffer & in_, const Block & header_, bool stream_, const FormatSettings & format_settings_);
|
||||
|
||||
void resetParser() override;
|
||||
|
||||
@ -41,6 +42,8 @@ private:
|
||||
int record_batch_total = 0;
|
||||
int record_batch_current = 0;
|
||||
|
||||
const FormatSettings format_settings;
|
||||
|
||||
void prepareReader();
|
||||
};
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -7,6 +7,7 @@
|
||||
#if USE_ARROW || USE_ORC || USE_PARQUET
|
||||
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Core/ColumnWithTypeAndName.h>
|
||||
#include <arrow/table.h>
|
||||
|
||||
|
||||
@ -19,19 +20,22 @@ class Chunk;
|
||||
class ArrowColumnToCHColumn
|
||||
{
|
||||
public:
|
||||
ArrowColumnToCHColumn(const Block & header_, std::shared_ptr<arrow::Schema> schema_, const std::string & format_name_);
|
||||
ArrowColumnToCHColumn(const Block & header_, const std::string & format_name_, bool import_nested_);
|
||||
|
||||
/// Create header by arrow schema. It will be useful for inserting
|
||||
/// data from file without knowing table structure.
|
||||
ArrowColumnToCHColumn(const arrow::Schema & schema, const std::string & format_name, bool import_nested_);
|
||||
|
||||
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table);
|
||||
|
||||
private:
|
||||
const Block & header;
|
||||
std::unordered_map<std::string, DataTypePtr> name_to_internal_type;
|
||||
const std::string format_name;
|
||||
|
||||
bool import_nested;
|
||||
/// Map {column name : dictionary column}.
|
||||
/// To avoid converting dictionary from Arrow Dictionary
|
||||
/// to LowCardinality every chunk we save it and reuse.
|
||||
std::unordered_map<std::string, ColumnPtr> dictionary_values;
|
||||
std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dictionary_values;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -295,6 +295,7 @@ namespace DB
|
||||
FOR_ARROW_TYPES(DISPATCH)
|
||||
#undef DISPATCH
|
||||
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName());
|
||||
}
|
||||
|
||||
template <typename ColumnType>
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <arrow/io/memory.h>
|
||||
#include "ArrowBufferedStreams.h"
|
||||
#include "ArrowColumnToCHColumn.h"
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -26,7 +27,8 @@ namespace ErrorCodes
|
||||
throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \
|
||||
} while (false)
|
||||
|
||||
ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_) : IInputFormat(std::move(header_), in_)
|
||||
ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_)
|
||||
: IInputFormat(std::move(header_), in_), format_settings(format_settings_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -98,7 +100,11 @@ void ORCBlockInputFormat::prepareReader()
|
||||
std::shared_ptr<arrow::Schema> schema;
|
||||
THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema));
|
||||
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), schema, "ORC");
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "ORC", format_settings.orc.import_nested);
|
||||
|
||||
std::unordered_set<String> nested_table_names;
|
||||
if (format_settings.orc.import_nested)
|
||||
nested_table_names = Nested::getAllTableNames(getPort().getHeader());
|
||||
|
||||
/// In ReadStripe, column indices must start from 1,
|
||||
/// because 0 indicates to select all columns.
|
||||
@ -108,7 +114,8 @@ void ORCBlockInputFormat::prepareReader()
|
||||
/// A LIST type requires 2 indices, a STRUCT requires the number of its elements + 1,
|
||||
/// so we should recursively count the number of indices we need for this type.
|
||||
int indexes_count = countIndicesForType(schema->field(i)->type());
|
||||
if (getPort().getHeader().has(schema->field(i)->name()))
|
||||
const auto & name = schema->field(i)->name();
|
||||
if (getPort().getHeader().has(name) || nested_table_names.contains(name))
|
||||
{
|
||||
for (int j = 0; j != indexes_count; ++j)
|
||||
include_indices.push_back(index + j);
|
||||
@ -124,9 +131,9 @@ void registerInputFormatProcessorORC(FormatFactory &factory)
|
||||
[](ReadBuffer &buf,
|
||||
const Block &sample,
|
||||
const RowInputFormatParams &,
|
||||
const FormatSettings & /* settings */)
|
||||
const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ORCBlockInputFormat>(buf, sample);
|
||||
return std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
|
||||
});
|
||||
factory.markFormatAsColumnOriented("ORC");
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#if USE_ORC
|
||||
|
||||
#include <Processors/Formats/IInputFormat.h>
|
||||
#include <Formats/FormatSettings.h>
|
||||
|
||||
namespace arrow::adapters::orc { class ORCFileReader; }
|
||||
|
||||
@ -14,7 +15,7 @@ class ArrowColumnToCHColumn;
|
||||
class ORCBlockInputFormat : public IInputFormat
|
||||
{
|
||||
public:
|
||||
ORCBlockInputFormat(ReadBuffer & in_, Block header_);
|
||||
ORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_);
|
||||
|
||||
String getName() const override { return "ORCBlockInputFormat"; }
|
||||
|
||||
@ -38,6 +39,8 @@ private:
|
||||
// indices of columns to read from ORC file
|
||||
std::vector<int> include_indices;
|
||||
|
||||
const FormatSettings format_settings;
|
||||
|
||||
void prepareReader();
|
||||
};
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <parquet/file_reader.h>
|
||||
#include "ArrowBufferedStreams.h"
|
||||
#include "ArrowColumnToCHColumn.h"
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
|
||||
#include <common/logger_useful.h>
|
||||
|
||||
@ -30,8 +31,8 @@ namespace ErrorCodes
|
||||
throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \
|
||||
} while (false)
|
||||
|
||||
ParquetBlockInputFormat::ParquetBlockInputFormat(ReadBuffer & in_, Block header_)
|
||||
: IInputFormat(std::move(header_), in_)
|
||||
ParquetBlockInputFormat::ParquetBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_)
|
||||
: IInputFormat(std::move(header_), in_), format_settings(format_settings_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -98,7 +99,11 @@ void ParquetBlockInputFormat::prepareReader()
|
||||
std::shared_ptr<arrow::Schema> schema;
|
||||
THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema));
|
||||
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), schema, "Parquet");
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested);
|
||||
|
||||
std::unordered_set<String> nested_table_names;
|
||||
if (format_settings.parquet.import_nested)
|
||||
nested_table_names = Nested::getAllTableNames(getPort().getHeader());
|
||||
|
||||
int index = 0;
|
||||
for (int i = 0; i < schema->num_fields(); ++i)
|
||||
@ -107,7 +112,8 @@ void ParquetBlockInputFormat::prepareReader()
|
||||
/// nested elements, so we should recursively
|
||||
/// count the number of indices we need for this type.
|
||||
int indexes_count = countIndicesForType(schema->field(i)->type());
|
||||
if (getPort().getHeader().has(schema->field(i)->name()))
|
||||
const auto & name = schema->field(i)->name();
|
||||
if (getPort().getHeader().has(name) || nested_table_names.contains(name))
|
||||
{
|
||||
for (int j = 0; j != indexes_count; ++j)
|
||||
column_indices.push_back(index + j);
|
||||
@ -123,9 +129,9 @@ void registerInputFormatProcessorParquet(FormatFactory &factory)
|
||||
[](ReadBuffer &buf,
|
||||
const Block &sample,
|
||||
const RowInputFormatParams &,
|
||||
const FormatSettings & /* settings */)
|
||||
const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ParquetBlockInputFormat>(buf, sample);
|
||||
return std::make_shared<ParquetBlockInputFormat>(buf, sample, settings);
|
||||
});
|
||||
factory.markFormatAsColumnOriented("Parquet");
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
#if USE_PARQUET
|
||||
|
||||
#include <Processors/Formats/IInputFormat.h>
|
||||
#include <Formats/FormatSettings.h>
|
||||
|
||||
namespace parquet::arrow { class FileReader; }
|
||||
|
||||
@ -17,7 +18,7 @@ class ArrowColumnToCHColumn;
|
||||
class ParquetBlockInputFormat : public IInputFormat
|
||||
{
|
||||
public:
|
||||
ParquetBlockInputFormat(ReadBuffer & in_, Block header_);
|
||||
ParquetBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_);
|
||||
|
||||
void resetParser() override;
|
||||
|
||||
@ -36,6 +37,7 @@ private:
|
||||
std::vector<int> column_indices;
|
||||
std::unique_ptr<ArrowColumnToCHColumn> arrow_column_to_ch_column;
|
||||
int row_group_current = 0;
|
||||
const FormatSettings format_settings;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,21 @@
|
||||
Arrow
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
Parquet
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
ORC
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414] [[(1,'123',9.8),(2,'456',10.12),(3,'789',11.14)],[(4,'101112',123.8),(5,'131415',10.2),(6,'161718',11.414)],[(7,'101',13.08),(8,'415',1.12),(9,'118',0.414)]]
|
36
tests/queries/0_stateless/00900_orc_arrow_parquet_nested.sh
Executable file
36
tests/queries/0_stateless/00900_orc_arrow_parquet_nested.sh
Executable file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash

# Inserts pre-generated Arrow/Parquet/ORC files that contain an array of structs
# into tables with Nested columns (with input_format_<fmt>_import_nested = 1)
# and prints the table contents after each insert.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_table"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_nested_table"

${CLICKHOUSE_CLIENT} --query="CREATE TABLE nested_table (table Nested(elem1 Int32, elem2 String, elem3 Float32)) engine=Memory"

${CLICKHOUSE_CLIENT} --query="CREATE TABLE nested_nested_table (table Nested(elem1 Int32, elem2 String, elem3 Float32, nested Nested(elem1 Int32, elem2 String, elem3 Float32))) engine=Memory"

# formats[i] is the format name used in the query; format_files[i] is both the
# data file extension and the infix of the corresponding import_nested setting.
formats=('Arrow' 'Parquet' 'ORC')
format_files=('arrow' 'parquet' 'orc')

# Iterate over the array length instead of a hard-coded count so that adding
# a format only requires extending the two arrays above.
for ((i = 0; i < ${#formats[@]}; i++)) do
    echo "${formats[i]}"

    ${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE nested_table"
    # The data path is quoted so that a checkout directory containing spaces works.
    cat "$CUR_DIR/data_orc_arrow_parquet_nested/nested_table.${format_files[i]}" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nested_table FORMAT ${formats[i]} SETTINGS input_format_${format_files[i]}_import_nested = 1"

    ${CLICKHOUSE_CLIENT} --query="SELECT * FROM nested_table"


    ${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE nested_nested_table"
    cat "$CUR_DIR/data_orc_arrow_parquet_nested/nested_nested_table.${format_files[i]}" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nested_nested_table FORMAT ${formats[i]} SETTINGS input_format_${format_files[i]}_import_nested = 1"

    ${CLICKHOUSE_CLIENT} --query="SELECT * FROM nested_nested_table"


done

${CLICKHOUSE_CLIENT} --query="DROP TABLE nested_table"
${CLICKHOUSE_CLIENT} --query="DROP TABLE nested_nested_table"
|
@ -1,2 +1,5 @@
|
||||
1 ['a','b','c'] ('z','6')
|
||||
2 ['d','e'] ('x','9')
|
||||
1 ['a','b','c'] ('z','6')
|
||||
2 ['d','e'] ('x','9')
|
||||
20000000
|
||||
|
@ -20,5 +20,7 @@ ${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_dicts FORMAT Arrow SETTINGS ou
|
||||
|
||||
cat "${CLICKHOUSE_TMP}"/dicts.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_dicts FORMAT Arrow"
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM arrow_dicts"
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_dicts"
|
||||
|
||||
|
@ -0,0 +1,178 @@
|
||||
#include <arrow/io/memory.h>
|
||||
#include <arrow/io/file.h>
|
||||
#include <arrow/table.h>
|
||||
#include <arrow/api.h>
|
||||
#include <arrow/util/memory.h>
|
||||
#include <arrow/ipc/writer.h>
|
||||
#include <arrow/adapters/orc/adapter.h>
|
||||
#include <parquet/arrow/writer.h>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
/// Writes `table` to "nested_nested_table.arrow" using the Arrow IPC file writer.
/// Throws std::runtime_error carrying the Arrow status text if the output file
/// cannot be opened, the writer cannot be created, or writing/closing fails.
void write_arrow(const arrow::Table & table)
{
    auto file = arrow::io::FileOutputStream::Open("nested_nested_table.arrow");

    /// Inspect the Result before dereferencing it, so that a failure is reported
    /// through an exception like every other error in this function.
    if (!file.ok())
        throw std::runtime_error(file.status().ToString());

    auto writer_result = arrow::ipc::MakeFileWriter(file->get(), table.schema());

    if (!writer_result.ok())
        throw std::runtime_error(writer_result.status().ToString());

    auto & writer = *writer_result;

    auto status = writer->WriteTable(table, 100000);

    if (!status.ok())
        throw std::runtime_error(status.ToString());

    status = writer->Close();

    if (!status.ok())
        throw std::runtime_error(status.ToString());
}
|
||||
|
||||
/// Writes `table` to "nested_nested_table.parquet" with default writer properties.
/// Throws std::runtime_error carrying the Arrow status text if the output file
/// cannot be opened, the Parquet writer cannot be created, or writing/closing fails.
void write_parquet(const arrow::Table & table)
{
    auto file = arrow::io::FileOutputStream::Open("nested_nested_table.parquet");

    /// Inspect the Result before dereferencing it below.
    if (!file.ok())
        throw std::runtime_error(file.status().ToString());

    std::unique_ptr<parquet::arrow::FileWriter> writer;
    parquet::WriterProperties::Builder prop_builder;
    auto props = prop_builder.build();
    auto status = parquet::arrow::FileWriter::Open(
        *table.schema(),
        arrow::default_memory_pool(),
        *file,
        props,
        &writer);

    /// The status must be checked before `writer` is used: this status was
    /// previously overwritten unchecked and `writer` dereferenced regardless.
    if (!status.ok())
        throw std::runtime_error(status.ToString());

    status = writer->WriteTable(table, 100000);

    if (!status.ok())
        throw std::runtime_error(status.ToString());

    status = writer->Close();

    if (!status.ok())
        throw std::runtime_error(status.ToString());
}
|
||||
|
||||
/// Writes `table` to "nested_nested_table.orc" using the Arrow ORC adapter.
/// Throws std::runtime_error carrying the Arrow status text if the output file
/// cannot be opened, the writer cannot be created, or writing/closing fails.
void write_orc(const arrow::Table & table)
{
    auto file = arrow::io::FileOutputStream::Open("nested_nested_table.orc");

    /// Inspect the Result before dereferencing it, so that a failure is reported
    /// through an exception like every other error in this function.
    if (!file.ok())
        throw std::runtime_error(file.status().ToString());

    auto writer_result = arrow::adapters::orc::ORCFileWriter::Open(file->get());

    if (!writer_result.ok())
        throw std::runtime_error(writer_result.status().ToString());

    /// Bind by reference: the writer stays owned by `writer_result`.
    auto & writer = *writer_result;

    auto status = writer->Write(table);

    if (!status.ok())
        throw std::runtime_error(status.ToString());

    status = writer->Close();

    if (!status.ok())
        throw std::runtime_error(status.ToString());
}
|
||||
|
||||
|
||||
/// Appends three fixed rows to `builder`, which is treated as a ListBuilder whose
/// values are structs with fields (Int32, Binary, Float) at positions 0..2.
/// Each row is a list of three structs. When `nested` is true, the struct is
/// expected to have a fourth field (position 3) that is itself such a list
/// builder; it is filled for every row by a recursive call with `nested` = false.
/// Throws std::runtime_error with the Arrow status text if any append fails.
void fillNested(arrow::ArrayBuilder * builder, bool nested)
{
    auto * list_builder = static_cast<arrow::ListBuilder *>(builder);
    auto * struct_builder = static_cast<arrow::StructBuilder *>(list_builder->value_builder());
    auto * elem1_builder = static_cast<arrow::Int32Builder *>(struct_builder->field_builder(0));
    auto * elem2_builder = static_cast<arrow::BinaryBuilder *>(struct_builder->field_builder(1));
    auto * elem3_builder = static_cast<arrow::FloatBuilder *>(struct_builder->field_builder(2));

    arrow::ListBuilder * nested_list_builder = nullptr;
    if (nested)
        nested_list_builder = static_cast<arrow::ListBuilder *>(struct_builder->field_builder(3));

    /// Statuses returned by the builders used to be stored and never inspected.
    const auto check = [](const arrow::Status & status)
    {
        if (!status.ok())
            throw std::runtime_error(status.ToString());
    };

    /// Appends one list row; the order of builder calls matches the original code:
    /// open the list, append the field values, fill the nested list (if any),
    /// then append one struct slot per element.
    const auto append_row = [&](
        const std::vector<int> & elem1,
        const std::vector<std::string> & elem2,
        const std::vector<float> & elem3)
    {
        check(list_builder->Append());

        check(elem1_builder->AppendValues(elem1));
        check(elem2_builder->AppendValues(elem2));
        check(elem3_builder->AppendValues(elem3));
        if (nested)
            fillNested(nested_list_builder, false);

        for (size_t i = 0; i != elem1.size(); ++i)
            check(struct_builder->Append());
    };

    append_row({1, 2, 3}, {"123", "456", "789"}, {9.8, 10.12, 11.14});
    append_row({4, 5, 6}, {"101112", "131415", "161718"}, {123.8, 10.2, 11.414});
    append_row({7, 8, 9}, {"101", "415", "118"}, {13.08, 1.12, 0.414});
}
|
||||
|
||||
int main()
|
||||
{
|
||||
std::vector<std::shared_ptr<arrow::Field>> nested_struct_fields;
|
||||
nested_struct_fields.push_back(std::make_shared<arrow::Field>("elem1", arrow::int32()));
|
||||
nested_struct_fields.push_back(std::make_shared<arrow::Field>("elem2", arrow::binary()));
|
||||
nested_struct_fields.push_back(std::make_shared<arrow::Field>("elem3", arrow::float32()));
|
||||
auto nested_struct_type = arrow::struct_(nested_struct_fields);
|
||||
auto nested_field = std::make_shared<arrow::Field>("nested", nested_struct_type);
|
||||
auto nested_list_type = arrow::list(nested_field);
|
||||
auto nested_list_field = std::make_shared<arrow::Field>("nested", nested_list_type);
|
||||
|
||||
std::vector<std::shared_ptr<arrow::Field>> struct_fields;
|
||||
struct_fields.push_back(std::make_shared<arrow::Field>("elem1", arrow::int32()));
|
||||
struct_fields.push_back(std::make_shared<arrow::Field>("elem2", arrow::binary()));
|
||||
struct_fields.push_back(std::make_shared<arrow::Field>("elem3", arrow::float32()));
|
||||
struct_fields.push_back(std::make_shared<arrow::Field>("nested", nested_list_type));
|
||||
|
||||
|
||||
auto struct_type = arrow::struct_(struct_fields);
|
||||
auto field = std::make_shared<arrow::Field>("table", struct_type);
|
||||
auto list_type = arrow::list(field);
|
||||
|
||||
arrow::MemoryPool* pool = arrow::default_memory_pool();
|
||||
std::unique_ptr<arrow::ArrayBuilder> tmp;
|
||||
auto status = MakeBuilder(pool, list_type, &tmp);
|
||||
|
||||
if (!status.ok())
|
||||
throw std::runtime_error(status.ToString());
|
||||
|
||||
fillNested(tmp.get(), true);
|
||||
|
||||
std::shared_ptr<arrow::Array> array;
|
||||
status = tmp->Finish(&array);
|
||||
|
||||
if (!status.ok())
|
||||
throw std::runtime_error(status.ToString());
|
||||
|
||||
std::vector<std::shared_ptr<arrow::Field>> fields_for_schema = {std::make_shared<arrow::Field>("table", list_type)};
|
||||
auto schema = std::make_shared<arrow::Schema>(std::move(fields_for_schema));
|
||||
auto table = arrow::Table::Make(schema, {array});
|
||||
|
||||
if (!table)
|
||||
throw std::runtime_error("WTF");
|
||||
|
||||
write_orc(*table);
|
||||
write_arrow(*table);
|
||||
write_parquet(*table);
|
||||
|
||||
return 0;
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
tests/queries/0_stateless/dicts.arrow
Normal file
BIN
tests/queries/0_stateless/dicts.arrow
Normal file
Binary file not shown.
0
tests/queries/0_stateless/maps
Normal file
0
tests/queries/0_stateless/maps
Normal file
Loading…
Reference in New Issue
Block a user