mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Merge pull request #53324 from bigo-sg/ch_gluten_2583
Implement native orc input format without arrow to improve performance
This commit is contained in:
commit
6009e1b293
2
contrib/orc
vendored
2
contrib/orc
vendored
@ -1 +1 @@
|
||||
Subproject commit 568d1d60c250af1890f226c182bc15bd8cc94cf1
|
||||
Subproject commit a20d1d9d7ad4a4be7b7ba97588e16ca8b9abb2b6
|
@ -878,6 +878,7 @@ class IColumn;
|
||||
M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \
|
||||
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
|
||||
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
|
||||
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
|
||||
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
|
||||
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
|
||||
M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \
|
||||
|
@ -189,6 +189,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
|
||||
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
|
||||
format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
|
||||
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
|
||||
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
||||
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
||||
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;
|
||||
|
@ -347,6 +347,7 @@ struct FormatSettings
|
||||
std::unordered_set<int> skip_stripes = {};
|
||||
bool output_string_as_string = false;
|
||||
ORCCompression output_compression_method = ORCCompression::NONE;
|
||||
bool use_fast_decoder = true;
|
||||
} orc;
|
||||
|
||||
/// For capnProto format we should determine how to
|
||||
|
@ -19,7 +19,10 @@ public:
|
||||
class ReadBufferFromOwnString : public String, public ReadBufferFromString
|
||||
{
|
||||
public:
|
||||
explicit ReadBufferFromOwnString(const String & s_): String(s_), ReadBufferFromString(*this) {}
|
||||
template <typename S>
|
||||
explicit ReadBufferFromOwnString(S && s_) : String(std::forward<S>(s_)), ReadBufferFromString(*this)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
1019
src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
Normal file
1019
src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
Normal file
File diff suppressed because it is too large
Load Diff
129
src/Processors/Formats/Impl/NativeORCBlockInputFormat.h
Normal file
129
src/Processors/Formats/Impl/NativeORCBlockInputFormat.h
Normal file
@ -0,0 +1,129 @@
|
||||
#pragma once
|
||||
#include "config.h"
|
||||
|
||||
#if USE_ORC
|
||||
# include <Formats/FormatSettings.h>
|
||||
# include <IO/ReadBufferFromString.h>
|
||||
# include <Processors/Formats/IInputFormat.h>
|
||||
# include <Processors/Formats/ISchemaReader.h>
|
||||
# include <orc/OrcFile.hh>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ORCInputStream : public orc::InputStream
|
||||
{
|
||||
public:
|
||||
ORCInputStream(SeekableReadBuffer & in_, size_t file_size_);
|
||||
|
||||
uint64_t getLength() const override;
|
||||
uint64_t getNaturalReadSize() const override;
|
||||
void read(void * buf, uint64_t length, uint64_t offset) override;
|
||||
const std::string & getName() const override { return name; }
|
||||
|
||||
protected:
|
||||
SeekableReadBuffer & in;
|
||||
size_t file_size;
|
||||
std::string name = "ORCInputStream";
|
||||
};
|
||||
|
||||
class ORCInputStreamFromString : public ReadBufferFromOwnString, public ORCInputStream
|
||||
{
|
||||
public:
|
||||
template <typename S>
|
||||
ORCInputStreamFromString(S && s_, size_t file_size_)
|
||||
: ReadBufferFromOwnString(std::forward<S>(s_)), ORCInputStream(dynamic_cast<SeekableReadBuffer &>(*this), file_size_)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<orc::InputStream> asORCInputStream(ReadBuffer & in, const FormatSettings & settings, std::atomic<int> & is_cancelled);
|
||||
|
||||
// Reads the whole file into a memory buffer, owned by the returned RandomAccessFile.
|
||||
std::unique_ptr<orc::InputStream> asORCInputStreamLoadIntoMemory(ReadBuffer & in, std::atomic<int> & is_cancelled);
|
||||
|
||||
|
||||
class ORCColumnToCHColumn;
|
||||
class NativeORCBlockInputFormat : public IInputFormat
|
||||
{
|
||||
public:
|
||||
NativeORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_);
|
||||
|
||||
String getName() const override { return "ORCBlockInputFormat"; }
|
||||
|
||||
void resetParser() override;
|
||||
|
||||
const BlockMissingValues & getMissingValues() const override;
|
||||
|
||||
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
||||
|
||||
protected:
|
||||
Chunk generate() override;
|
||||
|
||||
void onCancel() override { is_stopped = 1; }
|
||||
|
||||
private:
|
||||
void prepareFileReader();
|
||||
bool prepareStripeReader();
|
||||
|
||||
std::unique_ptr<orc::Reader> file_reader;
|
||||
std::unique_ptr<orc::RowReader> stripe_reader;
|
||||
std::unique_ptr<ORCColumnToCHColumn> orc_column_to_ch_column;
|
||||
std::unique_ptr<orc::ColumnVectorBatch> batch;
|
||||
|
||||
// indices of columns to read from ORC file
|
||||
std::list<UInt64> include_indices;
|
||||
|
||||
BlockMissingValues block_missing_values;
|
||||
size_t approx_bytes_read_for_chunk;
|
||||
|
||||
const FormatSettings format_settings;
|
||||
const std::unordered_set<int> & skip_stripes;
|
||||
|
||||
int total_stripes = 0;
|
||||
int current_stripe = -1;
|
||||
std::unique_ptr<orc::StripeInformation> current_stripe_info;
|
||||
|
||||
std::atomic<int> is_stopped{0};
|
||||
};
|
||||
|
||||
class NativeORCSchemaReader : public ISchemaReader
|
||||
{
|
||||
public:
|
||||
NativeORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
|
||||
|
||||
NamesAndTypesList readSchema() override;
|
||||
|
||||
private:
|
||||
const FormatSettings format_settings;
|
||||
};
|
||||
|
||||
class ORCColumnToCHColumn
|
||||
{
|
||||
public:
|
||||
using ORCColumnPtr = const orc::ColumnVectorBatch *;
|
||||
using ORCTypePtr = const orc::Type *;
|
||||
using ORCColumnWithType = std::pair<ORCColumnPtr, ORCTypePtr>;
|
||||
using NameToColumnPtr = std::unordered_map<std::string, ORCColumnWithType>;
|
||||
|
||||
ORCColumnToCHColumn(const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_ = false);
|
||||
|
||||
void orcTableToCHChunk(
|
||||
Chunk & res,
|
||||
const orc::Type * schema,
|
||||
const orc::ColumnVectorBatch * table,
|
||||
size_t num_rows,
|
||||
BlockMissingValues * block_missing_values = nullptr);
|
||||
|
||||
void orcColumnsToCHChunk(
|
||||
Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr);
|
||||
|
||||
private:
|
||||
const Block & header;
|
||||
/// If false, throw exception if some columns in header not exists in arrow table.
|
||||
bool allow_missing_columns;
|
||||
bool null_as_default;
|
||||
bool case_insensitive_matching;
|
||||
};
|
||||
}
|
||||
#endif
|
@ -1,16 +1,17 @@
|
||||
#include "ORCBlockInputFormat.h"
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
#if USE_ORC
|
||||
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Formats/SchemaInferenceUtils.h>
|
||||
#include <IO/ReadBufferFromMemory.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/copyData.h>
|
||||
#include "ArrowBufferedStreams.h"
|
||||
#include "ArrowColumnToCHColumn.h"
|
||||
#include "ArrowFieldIndexUtil.h"
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#if USE_ORC
|
||||
# include <DataTypes/NestedUtils.h>
|
||||
# include <Formats/FormatFactory.h>
|
||||
# include <Formats/SchemaInferenceUtils.h>
|
||||
# include <IO/ReadBufferFromMemory.h>
|
||||
# include <IO/WriteHelpers.h>
|
||||
# include <IO/copyData.h>
|
||||
# include <boost/algorithm/string/case_conv.hpp>
|
||||
# include "ArrowBufferedStreams.h"
|
||||
# include "ArrowColumnToCHColumn.h"
|
||||
# include "ArrowFieldIndexUtil.h"
|
||||
# include "NativeORCBlockInputFormat.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -154,19 +155,24 @@ NamesAndTypesList ORCSchemaReader::readSchema()
|
||||
*schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference);
|
||||
if (format_settings.schema_inference_make_columns_nullable)
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
return header.getNamesAndTypesList();}
|
||||
return header.getNamesAndTypesList();
|
||||
}
|
||||
|
||||
|
||||
void registerInputFormatORC(FormatFactory & factory)
|
||||
{
|
||||
factory.registerInputFormat(
|
||||
"ORC",
|
||||
[](ReadBuffer &buf,
|
||||
const Block &sample,
|
||||
const RowInputFormatParams &,
|
||||
const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
|
||||
});
|
||||
"ORC",
|
||||
[](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings)
|
||||
{
|
||||
InputFormatPtr res;
|
||||
if (settings.orc.use_fast_decoder)
|
||||
res = std::make_shared<NativeORCBlockInputFormat>(buf, sample, settings);
|
||||
else
|
||||
res = std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
|
||||
|
||||
return res;
|
||||
});
|
||||
factory.markFormatSupportsSubsetOfColumns("ORC");
|
||||
}
|
||||
|
||||
@ -176,7 +182,13 @@ void registerORCSchemaReader(FormatFactory & factory)
|
||||
"ORC",
|
||||
[](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ORCSchemaReader>(buf, settings);
|
||||
SchemaReaderPtr res;
|
||||
if (settings.orc.use_fast_decoder)
|
||||
res = std::make_shared<NativeORCSchemaReader>(buf, settings);
|
||||
else
|
||||
res = std::make_shared<ORCSchemaReader>(buf, settings);
|
||||
|
||||
return res;
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -2,3 +2,9 @@ if (TARGET ch_contrib::hivemetastore)
|
||||
clickhouse_add_executable (comma_separated_streams comma_separated_streams.cpp)
|
||||
target_link_libraries (comma_separated_streams PRIVATE dbms)
|
||||
endif()
|
||||
|
||||
if (USE_ORC)
|
||||
clickhouse_add_executable (native_orc native_orc.cpp)
|
||||
target_link_libraries (native_orc PRIVATE dbms)
|
||||
target_include_directories (native_orc PRIVATE ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include)
|
||||
endif ()
|
||||
|
36
src/Processors/examples/native_orc.cpp
Normal file
36
src/Processors/examples/native_orc.cpp
Normal file
@ -0,0 +1,36 @@
|
||||
#include <string>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <Processors/Formats/Impl/NativeORCBlockInputFormat.h>
|
||||
#include <IO/copyData.h>
|
||||
|
||||
using namespace DB;
|
||||
|
||||
int main()
|
||||
{
|
||||
/// Read schema from orc file
|
||||
String path = "/path/to/orc/file";
|
||||
// String path = "/data1/clickhouse_official/data/user_files/bigolive_audience_stats_orc.orc";
|
||||
{
|
||||
ReadBufferFromFile in(path);
|
||||
NativeORCSchemaReader schema_reader(in, {});
|
||||
auto schema = schema_reader.readSchema();
|
||||
std::cout << "schema:" << schema.toString() << std::endl;
|
||||
}
|
||||
|
||||
/// Read schema from string with orc data
|
||||
{
|
||||
ReadBufferFromFile in(path);
|
||||
|
||||
String content;
|
||||
WriteBufferFromString out(content);
|
||||
|
||||
copyData(in, out);
|
||||
|
||||
content.resize(out.count());
|
||||
ReadBufferFromString in2(content);
|
||||
NativeORCSchemaReader schema_reader(in2, {});
|
||||
auto schema = schema_reader.readSchema();
|
||||
std::cout << "schema:" << schema.toString() << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user