2020-05-03 00:54:39 +00:00
|
|
|
#pragma once
|
2021-10-27 23:10:39 +00:00
|
|
|
#include "config_formats.h"
|
2020-05-03 00:54:39 +00:00
|
|
|
|
|
|
|
#if USE_ARROW || USE_PARQUET
|
|
|
|
|
|
|
|
#include <Core/Block.h>
|
|
|
|
#include <Processors/Chunk.h>
|
|
|
|
#include <arrow/table.h>
|
|
|
|
|
2021-07-24 11:46:00 +00:00
|
|
|
|
2020-05-03 00:54:39 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
class CHColumnToArrowColumn
|
|
|
|
{
|
2021-06-07 15:15:58 +00:00
|
|
|
public:
|
2022-05-18 14:51:21 +00:00
|
|
|
CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_);
|
2021-06-07 15:15:58 +00:00
|
|
|
|
|
|
|
void chChunkToArrowTable(std::shared_ptr<arrow::Table> & res, const Chunk & chunk, size_t columns_num);
|
2021-05-25 12:01:28 +00:00
|
|
|
|
2021-07-24 11:46:00 +00:00
|
|
|
private:
|
2021-06-07 15:15:58 +00:00
|
|
|
ColumnsWithTypeAndName header_columns;
|
|
|
|
std::vector<std::shared_ptr<arrow::Field>> arrow_fields;
|
|
|
|
const std::string format_name;
|
|
|
|
bool low_cardinality_as_dictionary;
|
2021-05-25 12:01:28 +00:00
|
|
|
/// Map {column name : arrow dictionary}.
|
|
|
|
/// To avoid converting dictionary from LowCardinality to Arrow
|
|
|
|
/// Dictionary every chunk we save it and reuse.
|
|
|
|
std::unordered_map<std::string, std::shared_ptr<arrow::Array>> dictionary_values;
|
2022-05-09 13:29:42 +00:00
|
|
|
|
|
|
|
/// We should initialize arrow fields on first call of chChunkToArrowTable, not in constructor
|
|
|
|
/// because LowCardinality column from header always has indexes type UInt8, so, we should get
|
|
|
|
/// proper indexes type from first chunk of data.
|
|
|
|
bool is_arrow_fields_initialized = false;
|
2022-05-18 14:51:21 +00:00
|
|
|
|
|
|
|
/// Output columns with String data type as Arrow::String type.
|
|
|
|
/// By default Arrow::Binary is used.
|
|
|
|
bool output_string_as_string = false;
|
2020-05-03 00:54:39 +00:00
|
|
|
};
|
2021-07-24 11:46:00 +00:00
|
|
|
|
2020-05-03 00:54:39 +00:00
|
|
|
}
|
2021-07-24 11:46:00 +00:00
|
|
|
|
2020-05-03 00:54:39 +00:00
|
|
|
#endif
|