Merge pull request #53324 from bigo-sg/ch_gluten_2583

Implement native orc input format without arrow to improve performance
2024-09-20 00:30:49 +00:00 · 2023-08-21 13:44:57 -07:00 · 2023-08-21 13:44:57 -07:00 · 6009e1b293
commit 6009e1b293
parent a5fbac9e6b e5a399d0e9
10 changed files with 1231 additions and 23 deletions
--- a/contrib/orc
+++ b/contrib/orc
@ -1 +1 @@
-Subproject commit 568d1d60c250af1890f226c182bc15bd8cc94cf1
+Subproject commit a20d1d9d7ad4a4be7b7ba97588e16ca8b9abb2b6
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -878,6 +878,7 @@ class IColumn;
    M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \
    M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
    M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
+    M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
    M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
    M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
    M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -189,6 +189,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
    format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
    format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
+    format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
    format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
    format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
    format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -347,6 +347,7 @@ struct FormatSettings
        std::unordered_set<int> skip_stripes = {};
        bool output_string_as_string = false;
        ORCCompression output_compression_method = ORCCompression::NONE;
+        bool use_fast_decoder = true;
    } orc;

    /// For capnProto format we should determine how to
--- a/src/IO/ReadBufferFromString.h
+++ b/src/IO/ReadBufferFromString.h
@ -19,7 +19,10 @@ public:
 class ReadBufferFromOwnString : public String, public ReadBufferFromString
 {
 public:
-    explicit ReadBufferFromOwnString(const String & s_): String(s_), ReadBufferFromString(*this) {}
+    template <typename S>
+    explicit ReadBufferFromOwnString(S && s_) : String(std::forward<S>(s_)), ReadBufferFromString(*this)
+    {
+    }
 };

 }
--- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
--- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h
+++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h
@ -0,0 +1,129 @@
+#pragma once
+#include "config.h"
+
+#if USE_ORC
+#    include <Formats/FormatSettings.h>
+#    include <IO/ReadBufferFromString.h>
+#    include <Processors/Formats/IInputFormat.h>
+#    include <Processors/Formats/ISchemaReader.h>
+#    include <orc/OrcFile.hh>
+
+namespace DB
+{
+
+class ORCInputStream : public orc::InputStream
+{
+public:
+    ORCInputStream(SeekableReadBuffer & in_, size_t file_size_);
+
+    uint64_t getLength() const override;
+    uint64_t getNaturalReadSize() const override;
+    void read(void * buf, uint64_t length, uint64_t offset) override;
+    const std::string & getName() const override { return name; }
+
+protected:
+    SeekableReadBuffer & in;
+    size_t file_size;
+    std::string name = "ORCInputStream";
+};
+
+class ORCInputStreamFromString : public ReadBufferFromOwnString, public ORCInputStream
+{
+public:
+    template <typename S>
+    ORCInputStreamFromString(S && s_, size_t file_size_)
+        : ReadBufferFromOwnString(std::forward<S>(s_)), ORCInputStream(dynamic_cast<SeekableReadBuffer &>(*this), file_size_)
+    {
+    }
+};
+
+std::unique_ptr<orc::InputStream> asORCInputStream(ReadBuffer & in, const FormatSettings & settings, std::atomic<int> & is_cancelled);
+
+// Reads the whole file into a memory buffer, owned by the returned RandomAccessFile.
+std::unique_ptr<orc::InputStream> asORCInputStreamLoadIntoMemory(ReadBuffer & in, std::atomic<int> & is_cancelled);
+
+
+class ORCColumnToCHColumn;
+class NativeORCBlockInputFormat : public IInputFormat
+{
+public:
+    NativeORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_);
+
+    String getName() const override { return "ORCBlockInputFormat"; }
+
+    void resetParser() override;
+
+    const BlockMissingValues & getMissingValues() const override;
+
+    size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
+
+protected:
+    Chunk generate() override;
+
+    void onCancel() override { is_stopped = 1; }
+
+private:
+    void prepareFileReader();
+    bool prepareStripeReader();
+
+    std::unique_ptr<orc::Reader> file_reader;
+    std::unique_ptr<orc::RowReader> stripe_reader;
+    std::unique_ptr<ORCColumnToCHColumn> orc_column_to_ch_column;
+    std::unique_ptr<orc::ColumnVectorBatch> batch;
+
+    // indices of columns to read from ORC file
+    std::list<UInt64> include_indices;
+
+    BlockMissingValues block_missing_values;
+    size_t approx_bytes_read_for_chunk;
+
+    const FormatSettings format_settings;
+    const std::unordered_set<int> & skip_stripes;
+
+    int total_stripes = 0;
+    int current_stripe = -1;
+    std::unique_ptr<orc::StripeInformation> current_stripe_info;
+
+    std::atomic<int> is_stopped{0};
+};
+
+class NativeORCSchemaReader : public ISchemaReader
+{
+public:
+    NativeORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
+
+    NamesAndTypesList readSchema() override;
+
+private:
+    const FormatSettings format_settings;
+};
+
+class ORCColumnToCHColumn
+{
+public:
+    using ORCColumnPtr = const orc::ColumnVectorBatch *;
+    using ORCTypePtr = const orc::Type *;
+    using ORCColumnWithType = std::pair<ORCColumnPtr, ORCTypePtr>;
+    using NameToColumnPtr = std::unordered_map<std::string, ORCColumnWithType>;
+
+    ORCColumnToCHColumn(const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_ = false);
+
+    void orcTableToCHChunk(
+        Chunk & res,
+        const orc::Type * schema,
+        const orc::ColumnVectorBatch * table,
+        size_t num_rows,
+        BlockMissingValues * block_missing_values = nullptr);
+
+    void orcColumnsToCHChunk(
+        Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr);
+
+private:
+    const Block & header;
+    /// If false, throw exception if some columns in header not exists in arrow table.
+    bool allow_missing_columns;
+    bool null_as_default;
+    bool case_insensitive_matching;
+};
+}
+#endif
--- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
@ -1,16 +1,17 @@
 #include "ORCBlockInputFormat.h"
-#include <boost/algorithm/string/case_conv.hpp>
-#if USE_ORC

+#if USE_ORC
+#    include <DataTypes/NestedUtils.h>
 #    include <Formats/FormatFactory.h>
 #    include <Formats/SchemaInferenceUtils.h>
 #    include <IO/ReadBufferFromMemory.h>
 #    include <IO/WriteHelpers.h>
 #    include <IO/copyData.h>
+#    include <boost/algorithm/string/case_conv.hpp>
 #    include "ArrowBufferedStreams.h"
 #    include "ArrowColumnToCHColumn.h"
 #    include "ArrowFieldIndexUtil.h"
-#include <DataTypes/NestedUtils.h>
+#    include "NativeORCBlockInputFormat.h"

 namespace DB
 {
@ -154,18 +155,23 @@ NamesAndTypesList ORCSchemaReader::readSchema()
        *schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference);
    if (format_settings.schema_inference_make_columns_nullable)
        return getNamesAndRecursivelyNullableTypes(header);
-    return header.getNamesAndTypesList();}
+    return header.getNamesAndTypesList();
+}
+

 void registerInputFormatORC(FormatFactory & factory)
 {
    factory.registerInputFormat(
        "ORC",
-            [](ReadBuffer &buf,
-                const Block &sample,
-                const RowInputFormatParams &,
-                const FormatSettings & settings)
+        [](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings)
        {
-                return std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
+            InputFormatPtr res;
+            if (settings.orc.use_fast_decoder)
+                res = std::make_shared<NativeORCBlockInputFormat>(buf, sample, settings);
+            else
+                res = std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
+
+            return res;
        });
    factory.markFormatSupportsSubsetOfColumns("ORC");
 }
@ -176,7 +182,13 @@ void registerORCSchemaReader(FormatFactory & factory)
        "ORC",
        [](ReadBuffer & buf, const FormatSettings & settings)
        {
-            return std::make_shared<ORCSchemaReader>(buf, settings);
+            SchemaReaderPtr res;
+            if (settings.orc.use_fast_decoder)
+                res = std::make_shared<NativeORCSchemaReader>(buf, settings);
+            else
+                res = std::make_shared<ORCSchemaReader>(buf, settings);
+
+            return res;
        }
        );

--- a/src/Processors/examples/CMakeLists.txt
+++ b/src/Processors/examples/CMakeLists.txt
@ -2,3 +2,9 @@ if (TARGET ch_contrib::hivemetastore)
    clickhouse_add_executable (comma_separated_streams comma_separated_streams.cpp)
    target_link_libraries (comma_separated_streams PRIVATE dbms)
 endif()
+
+if (USE_ORC)
+    clickhouse_add_executable (native_orc native_orc.cpp)
+    target_link_libraries (native_orc PRIVATE dbms)
+    target_include_directories (native_orc PRIVATE ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include)
+endif ()
--- a/src/Processors/examples/native_orc.cpp
+++ b/src/Processors/examples/native_orc.cpp
@ -0,0 +1,36 @@
+#include <string>
+#include <IO/ReadBufferFromFile.h>
+#include <Processors/Formats/Impl/NativeORCBlockInputFormat.h>
+#include <IO/copyData.h>
+
+using namespace DB;
+
+int main()
+{
+    /// Read schema from orc file
+    String path = "/path/to/orc/file";
+    // String path = "/data1/clickhouse_official/data/user_files/bigolive_audience_stats_orc.orc";
+    {
+        ReadBufferFromFile in(path);
+        NativeORCSchemaReader schema_reader(in, {});
+        auto schema = schema_reader.readSchema();
+        std::cout << "schema:" << schema.toString() << std::endl;
+    }
+
+    /// Read schema from string with orc data
+    {
+        ReadBufferFromFile in(path);
+
+        String content;
+        WriteBufferFromString out(content);
+
+        copyData(in, out);
+
+        content.resize(out.count());
+        ReadBufferFromString in2(content);
+        NativeORCSchemaReader schema_reader(in2, {});
+        auto schema = schema_reader.readSchema();
+        std::cout << "schema:" << schema.toString() << std::endl;
+    }
+    return 0;
+}