Merge pull request #53324 from bigo-sg/ch_gluten_2583

Implement native orc input format without arrow to improve performance
This commit is contained in:
Michael Kolupaev 2023-08-21 13:44:57 -07:00 committed by GitHub
commit 6009e1b293
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 1231 additions and 23 deletions

2
contrib/orc vendored

@ -1 +1 @@
Subproject commit 568d1d60c250af1890f226c182bc15bd8cc94cf1
Subproject commit a20d1d9d7ad4a4be7b7ba97588e16ca8b9abb2b6

View File

@ -878,6 +878,7 @@ class IColumn;
M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \

View File

@ -189,6 +189,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;

View File

@ -347,6 +347,7 @@ struct FormatSettings
std::unordered_set<int> skip_stripes = {};
bool output_string_as_string = false;
ORCCompression output_compression_method = ORCCompression::NONE;
bool use_fast_decoder = true;
} orc;
/// For capnProto format we should determine how to

View File

@ -19,7 +19,10 @@ public:
class ReadBufferFromOwnString : public String, public ReadBufferFromString
{
public:
explicit ReadBufferFromOwnString(const String & s_): String(s_), ReadBufferFromString(*this) {}
template <typename S>
explicit ReadBufferFromOwnString(S && s_) : String(std::forward<S>(s_)), ReadBufferFromString(*this)
{
}
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,129 @@
#pragma once
#include "config.h"
#if USE_ORC
# include <Formats/FormatSettings.h>
# include <IO/ReadBufferFromString.h>
# include <Processors/Formats/IInputFormat.h>
# include <Processors/Formats/ISchemaReader.h>
# include <orc/OrcFile.hh>
namespace DB
{
class ORCInputStream : public orc::InputStream
{
public:
ORCInputStream(SeekableReadBuffer & in_, size_t file_size_);
uint64_t getLength() const override;
uint64_t getNaturalReadSize() const override;
void read(void * buf, uint64_t length, uint64_t offset) override;
const std::string & getName() const override { return name; }
protected:
SeekableReadBuffer & in;
size_t file_size;
std::string name = "ORCInputStream";
};
class ORCInputStreamFromString : public ReadBufferFromOwnString, public ORCInputStream
{
public:
template <typename S>
ORCInputStreamFromString(S && s_, size_t file_size_)
: ReadBufferFromOwnString(std::forward<S>(s_)), ORCInputStream(dynamic_cast<SeekableReadBuffer &>(*this), file_size_)
{
}
};
std::unique_ptr<orc::InputStream> asORCInputStream(ReadBuffer & in, const FormatSettings & settings, std::atomic<int> & is_cancelled);
// Reads the whole file into a memory buffer, owned by the returned RandomAccessFile.
std::unique_ptr<orc::InputStream> asORCInputStreamLoadIntoMemory(ReadBuffer & in, std::atomic<int> & is_cancelled);
class ORCColumnToCHColumn;
class NativeORCBlockInputFormat : public IInputFormat
{
public:
NativeORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_);
String getName() const override { return "ORCBlockInputFormat"; }
void resetParser() override;
const BlockMissingValues & getMissingValues() const override;
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
protected:
Chunk generate() override;
void onCancel() override { is_stopped = 1; }
private:
void prepareFileReader();
bool prepareStripeReader();
std::unique_ptr<orc::Reader> file_reader;
std::unique_ptr<orc::RowReader> stripe_reader;
std::unique_ptr<ORCColumnToCHColumn> orc_column_to_ch_column;
std::unique_ptr<orc::ColumnVectorBatch> batch;
// indices of columns to read from ORC file
std::list<UInt64> include_indices;
BlockMissingValues block_missing_values;
size_t approx_bytes_read_for_chunk;
const FormatSettings format_settings;
const std::unordered_set<int> & skip_stripes;
int total_stripes = 0;
int current_stripe = -1;
std::unique_ptr<orc::StripeInformation> current_stripe_info;
std::atomic<int> is_stopped{0};
};
class NativeORCSchemaReader : public ISchemaReader
{
public:
NativeORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
NamesAndTypesList readSchema() override;
private:
const FormatSettings format_settings;
};
class ORCColumnToCHColumn
{
public:
using ORCColumnPtr = const orc::ColumnVectorBatch *;
using ORCTypePtr = const orc::Type *;
using ORCColumnWithType = std::pair<ORCColumnPtr, ORCTypePtr>;
using NameToColumnPtr = std::unordered_map<std::string, ORCColumnWithType>;
ORCColumnToCHColumn(const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_ = false);
void orcTableToCHChunk(
Chunk & res,
const orc::Type * schema,
const orc::ColumnVectorBatch * table,
size_t num_rows,
BlockMissingValues * block_missing_values = nullptr);
void orcColumnsToCHChunk(
Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr);
private:
const Block & header;
/// If false, throw exception if some columns in header not exists in arrow table.
bool allow_missing_columns;
bool null_as_default;
bool case_insensitive_matching;
};
}
#endif

View File

@ -1,16 +1,17 @@
#include "ORCBlockInputFormat.h"
#include <boost/algorithm/string/case_conv.hpp>
#if USE_ORC
#if USE_ORC
# include <DataTypes/NestedUtils.h>
# include <Formats/FormatFactory.h>
# include <Formats/SchemaInferenceUtils.h>
# include <IO/ReadBufferFromMemory.h>
# include <IO/WriteHelpers.h>
# include <IO/copyData.h>
# include <boost/algorithm/string/case_conv.hpp>
# include "ArrowBufferedStreams.h"
# include "ArrowColumnToCHColumn.h"
# include "ArrowFieldIndexUtil.h"
#include <DataTypes/NestedUtils.h>
# include "NativeORCBlockInputFormat.h"
namespace DB
{
@ -154,18 +155,23 @@ NamesAndTypesList ORCSchemaReader::readSchema()
*schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference);
if (format_settings.schema_inference_make_columns_nullable)
return getNamesAndRecursivelyNullableTypes(header);
return header.getNamesAndTypesList();}
return header.getNamesAndTypesList();
}
void registerInputFormatORC(FormatFactory & factory)
{
factory.registerInputFormat(
"ORC",
[](ReadBuffer &buf,
const Block &sample,
const RowInputFormatParams &,
const FormatSettings & settings)
[](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings)
{
return std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
InputFormatPtr res;
if (settings.orc.use_fast_decoder)
res = std::make_shared<NativeORCBlockInputFormat>(buf, sample, settings);
else
res = std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
return res;
});
factory.markFormatSupportsSubsetOfColumns("ORC");
}
@ -176,7 +182,13 @@ void registerORCSchemaReader(FormatFactory & factory)
"ORC",
[](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<ORCSchemaReader>(buf, settings);
SchemaReaderPtr res;
if (settings.orc.use_fast_decoder)
res = std::make_shared<NativeORCSchemaReader>(buf, settings);
else
res = std::make_shared<ORCSchemaReader>(buf, settings);
return res;
}
);

View File

@ -2,3 +2,9 @@ if (TARGET ch_contrib::hivemetastore)
clickhouse_add_executable (comma_separated_streams comma_separated_streams.cpp)
target_link_libraries (comma_separated_streams PRIVATE dbms)
endif()
if (USE_ORC)
clickhouse_add_executable (native_orc native_orc.cpp)
target_link_libraries (native_orc PRIVATE dbms)
target_include_directories (native_orc PRIVATE ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include)
endif ()

View File

@ -0,0 +1,36 @@
#include <string>
#include <IO/ReadBufferFromFile.h>
#include <Processors/Formats/Impl/NativeORCBlockInputFormat.h>
#include <IO/copyData.h>
using namespace DB;
int main()
{
/// Read schema from orc file
String path = "/path/to/orc/file";
// String path = "/data1/clickhouse_official/data/user_files/bigolive_audience_stats_orc.orc";
{
ReadBufferFromFile in(path);
NativeORCSchemaReader schema_reader(in, {});
auto schema = schema_reader.readSchema();
std::cout << "schema:" << schema.toString() << std::endl;
}
/// Read schema from string with orc data
{
ReadBufferFromFile in(path);
String content;
WriteBufferFromString out(content);
copyData(in, out);
content.resize(out.count());
ReadBufferFromString in2(content);
NativeORCSchemaReader schema_reader(in2, {});
auto schema = schema_reader.readSchema();
std::cout << "schema:" << schema.toString() << std::endl;
}
return 0;
}