diff --git a/contrib/orc b/contrib/orc index 568d1d60c25..a20d1d9d7ad 160000 --- a/contrib/orc +++ b/contrib/orc @@ -1 +1 @@ -Subproject commit 568d1d60c250af1890f226c182bc15bd8cc94cf1 +Subproject commit a20d1d9d7ad4a4be7b7ba97588e16ca8b9abb2b6 diff --git a/src/Core/Settings.h b/src/Core/Settings.h index bfdb5a997bf..28027711d16 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -878,6 +878,7 @@ class IColumn; M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \ + M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \ M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \ M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \ M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 8487883730c..ae6ec462057 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -189,6 +189,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; format_settings.orc.output_compression_method = settings.output_format_orc_compression_method; + format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index ebecf107007..29ec7c69ed9 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -347,6 +347,7 @@ struct FormatSettings std::unordered_set skip_stripes = {}; bool output_string_as_string = false; ORCCompression output_compression_method = ORCCompression::NONE; + bool use_fast_decoder = true; } orc; /// For capnProto format we should determine how to diff --git a/src/IO/ReadBufferFromString.h b/src/IO/ReadBufferFromString.h index 7ea6afc3543..f20e319b931 100644 --- a/src/IO/ReadBufferFromString.h +++ b/src/IO/ReadBufferFromString.h @@ -19,7 +19,10 @@ public: class ReadBufferFromOwnString : public String, public ReadBufferFromString { public: - explicit ReadBufferFromOwnString(const String & s_): String(s_), ReadBufferFromString(*this) {} + template + explicit ReadBufferFromOwnString(S && s_) : String(std::forward(s_)), ReadBufferFromString(*this) + { + } }; } diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp new file mode 100644 index 00000000000..6948bb31e75 --- /dev/null +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp @@ -0,0 +1,1019 @@ +#include "NativeORCBlockInputFormat.h" + +#if USE_ORC +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include "ArrowBufferedStreams.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TYPE; + extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE; + extern const int THERE_IS_NO_COLUMN; + extern const int INCORRECT_DATA; + extern const int ARGUMENT_OUT_OF_BOUND; +} + +ORCInputStream::ORCInputStream(SeekableReadBuffer & in_, size_t file_size_) : in(in_), file_size(file_size_) +{ +} + +uint64_t ORCInputStream::getLength() const +{ + return file_size; +} + +uint64_t ORCInputStream::getNaturalReadSize() const +{ + return 128 * 1024; +} + +void ORCInputStream::read(void * buf, uint64_t length, uint64_t offset) +{ + if (offset != static_cast(in.getPosition())) + in.seek(offset, SEEK_SET); + + in.readStrict(reinterpret_cast(buf), length); +} + +std::unique_ptr asORCInputStream(ReadBuffer & in, const FormatSettings & settings, std::atomic & is_cancelled) +{ + bool has_file_size = isBufferWithFileSize(in); + auto * seekable_in = dynamic_cast(&in); + + if (has_file_size && seekable_in && settings.seekable_read && seekable_in->checkIfActuallySeekable()) + return std::make_unique(*seekable_in, getFileSizeFromReadBuffer(in)); + + /// Fallback to loading the entire file in memory + return asORCInputStreamLoadIntoMemory(in, is_cancelled); +} + +std::unique_ptr asORCInputStreamLoadIntoMemory(ReadBuffer & in, std::atomic & is_cancelled) +{ + size_t magic_size = strlen(ORC_MAGIC_BYTES); + std::string file_data(magic_size, '\0'); + + /// Avoid loading the whole file if it doesn't seem to even be in the correct format. + size_t bytes_read = in.read(file_data.data(), magic_size); + if (bytes_read < magic_size || file_data != ORC_MAGIC_BYTES) + throw Exception(ErrorCodes::INCORRECT_DATA, "Not an ORC file"); + + WriteBufferFromString file_buffer(file_data, AppendModeTag{}); + copyData(in, file_buffer, is_cancelled); + file_buffer.finalize(); + + size_t file_size = file_data.size(); + return std::make_unique(std::move(file_data), file_size); +} + +static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_with_unsupported_types, bool & skipped) +{ + assert(orc_type != nullptr); + + const int subtype_count = static_cast(orc_type->getSubtypeCount()); + switch (orc_type->getKind()) + { + case orc::TypeKind::BOOLEAN: + return DataTypeFactory::instance().get("Bool"); + case orc::TypeKind::BYTE: + return std::make_shared(); + case orc::TypeKind::SHORT: + return std::make_shared(); + case orc::TypeKind::INT: + return std::make_shared(); + case orc::TypeKind::LONG: + return std::make_shared(); + case orc::TypeKind::FLOAT: + return std::make_shared(); + case orc::TypeKind::DOUBLE: + return std::make_shared(); + case orc::TypeKind::DATE: + return std::make_shared(); + case orc::TypeKind::TIMESTAMP: + return std::make_shared(9); + case orc::TypeKind::VARCHAR: + case orc::TypeKind::BINARY: + case orc::TypeKind::STRING: + return std::make_shared(); + case orc::TypeKind::CHAR: + return std::make_shared(orc_type->getMaximumLength()); + case orc::TypeKind::DECIMAL: { + UInt64 precision = orc_type->getPrecision(); + UInt64 scale = orc_type->getScale(); + if (precision == 0) + { + // In HIVE 0.11/0.12 precision is set as 0, but means max precision + return createDecimal(38, 6); + } + else + return createDecimal(precision, scale); + } + case orc::TypeKind::LIST: { + if (subtype_count != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc List type {}", orc_type->toString()); + + DataTypePtr nested_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; + + return std::make_shared(nested_type); + } + case orc::TypeKind::MAP: { + if (subtype_count != 2) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc Map type {}", orc_type->toString()); + + DataTypePtr key_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; + + DataTypePtr value_type = parseORCType(orc_type->getSubtype(1), skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; + + return std::make_shared(key_type, value_type); + } + case orc::TypeKind::STRUCT: { + DataTypes nested_types; + Strings nested_names; + nested_types.reserve(subtype_count); + nested_names.reserve(subtype_count); + + for (size_t i = 0; i < orc_type->getSubtypeCount(); ++i) + { + auto parsed_type = parseORCType(orc_type->getSubtype(i), skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; + + nested_types.push_back(parsed_type); + nested_names.push_back(orc_type->getFieldName(i)); + } + return std::make_shared(nested_types, nested_names); + } + default: { + if (skip_columns_with_unsupported_types) + { + skipped = true; + return {}; + } + + throw Exception( + ErrorCodes::UNKNOWN_TYPE, + "Unsupported ORC type '{}'." + "If you want to skip columns with unsupported types, " + "you can enable setting input_format_orc_skip_columns_with_unsupported_types_in_schema_inference", + orc_type->toString()); + } + } +} + + +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + Block & header, + const FormatSettings & format_settings, + std::atomic & is_stopped) +{ + if (is_stopped) + return; + + orc::ReaderOptions options; + auto input_stream = asORCInputStream(in, format_settings, is_stopped); + file_reader = orc::createReader(std::move(input_stream), options); + const auto & schema = file_reader->getType(); + + for (size_t i = 0; i < schema.getSubtypeCount(); ++i) + { + const std::string & name = schema.getFieldName(i); + const orc::Type * orc_type = schema.getSubtype(i); + + bool skipped = false; + DataTypePtr type = parseORCType(orc_type, format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference, skipped); + if (!skipped) + header.insert(ColumnWithTypeAndName{type, name}); + } +} + +NativeORCBlockInputFormat::NativeORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) + : IInputFormat(std::move(header_), &in_), format_settings(format_settings_), skip_stripes(format_settings.orc.skip_stripes) +{ +} + +void NativeORCBlockInputFormat::prepareFileReader() +{ + Block schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); + if (is_stopped) + return; + + total_stripes = static_cast(file_reader->getNumberOfStripes()); + current_stripe = -1; + + orc_column_to_ch_column = std::make_unique( + getPort().getHeader(), + format_settings.orc.allow_missing_columns, + format_settings.null_as_default, + format_settings.orc.case_insensitive_column_matching); + + const bool ignore_case = format_settings.orc.case_insensitive_column_matching; + std::unordered_set nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case); + + for (size_t i = 0; i < schema.columns(); ++i) + { + const auto & name = schema.getByPosition(i).name; + if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name)) + include_indices.push_back(static_cast(i)); + } +} + +bool NativeORCBlockInputFormat::prepareStripeReader() +{ + assert(file_reader); + + ++current_stripe; + for (; current_stripe < total_stripes && skip_stripes.contains(current_stripe); ++current_stripe) + ; + + /// No more stripes to read + if (current_stripe >= total_stripes) + return false; + + current_stripe_info = file_reader->getStripe(current_stripe); + if (!current_stripe_info->getNumberOfRows()) + throw Exception(ErrorCodes::INCORRECT_DATA, "ORC stripe {} has no rows", current_stripe); + + orc::RowReaderOptions row_reader_options; + row_reader_options.include(include_indices); + row_reader_options.range(current_stripe_info->getOffset(), current_stripe_info->getLength()); + stripe_reader = file_reader->createRowReader(row_reader_options); + + if (!batch) + batch = stripe_reader->createRowBatch(format_settings.orc.row_batch_size); + + return true; +} + +Chunk NativeORCBlockInputFormat::generate() +{ + block_missing_values.clear(); + + if (!file_reader) + prepareFileReader(); + + if (!stripe_reader) + { + if (!prepareStripeReader()) + return {}; + } + + if (is_stopped) + return {}; + + while (true) + { + bool ok = stripe_reader->next(*batch); + if (ok) + break; + + /// No more rows to read in current stripe, continue to prepare reading next stripe + if (!prepareStripeReader()) + return {}; + } + + Chunk res; + size_t num_rows = batch->numElements; + const auto & schema = stripe_reader->getSelectedType(); + orc_column_to_ch_column->orcTableToCHChunk(res, &schema, batch.get(), num_rows, &block_missing_values); + + approx_bytes_read_for_chunk = num_rows * current_stripe_info->getLength() / current_stripe_info->getNumberOfRows(); + return res; +} + +void NativeORCBlockInputFormat::resetParser() +{ + IInputFormat::resetParser(); + + file_reader.reset(); + stripe_reader.reset(); + include_indices.clear(); + batch.reset(); + block_missing_values.clear(); +} + +const BlockMissingValues & NativeORCBlockInputFormat::getMissingValues() const +{ + return block_missing_values; +} + +NativeORCSchemaReader::NativeORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList NativeORCSchemaReader::readSchema() +{ + Block header; + std::unique_ptr file_reader; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, header, format_settings, is_stopped); + + if (format_settings.schema_inference_make_columns_nullable) + return getNamesAndRecursivelyNullableTypes(header); + return header.getNamesAndTypesList(); +} + + +ORCColumnToCHColumn::ORCColumnToCHColumn( + const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_) + : header(header_) + , allow_missing_columns(allow_missing_columns_) + , null_as_default(null_as_default_) + , case_insensitive_matching(case_insensitive_matching_) +{ +} + +void ORCColumnToCHColumn::orcTableToCHChunk( + Chunk & res, const orc::Type * schema, const orc::ColumnVectorBatch * table, size_t num_rows, BlockMissingValues * block_missing_values) +{ + const auto * struct_batch = dynamic_cast(table); + if (!struct_batch) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ORC table must be StructVectorBatch but is {}", struct_batch->toString()); + + if (schema->getSubtypeCount() != struct_batch->fields.size()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "ORC table has {} fields but schema has {}", struct_batch->fields.size(), schema->getSubtypeCount()); + + size_t field_num = struct_batch->fields.size(); + NameToColumnPtr name_to_column_ptr; + for (size_t i = 0; i < field_num; ++i) + { + auto name = schema->getFieldName(i); + const auto * field = struct_batch->fields[i]; + if (!field) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ORC table field {} is null", name); + + if (case_insensitive_matching) + boost::to_lower(name); + + name_to_column_ptr[std::move(name)] = {field, schema->getSubtype(i)}; + } + + orcColumnsToCHChunk(res, name_to_column_ptr, num_rows, block_missing_values); +} + +/// Creates a null bytemap from ORC's not-null bytemap +static ColumnPtr readByteMapFromORCColumn(const orc::ColumnVectorBatch * orc_column) +{ + if (!orc_column->hasNulls) + return ColumnUInt8::create(orc_column->numElements, 0); + + auto nullmap_column = ColumnUInt8::create(); + PaddedPODArray & bytemap_data = assert_cast &>(*nullmap_column).getData(); + bytemap_data.resize(orc_column->numElements); + + for (size_t i = 0; i < orc_column->numElements; ++i) + bytemap_data[i] = 1 - orc_column->notNull[i]; + return nullmap_column; +} + + +static const orc::ColumnVectorBatch * getNestedORCColumn(const orc::ListVectorBatch * orc_column) +{ + return orc_column->elements.get(); +} + +template +static ColumnPtr readOffsetsFromORCListColumn(const BatchType * orc_column) +{ + auto offsets_column = ColumnUInt64::create(); + ColumnArray::Offsets & offsets_data = assert_cast &>(*offsets_column).getData(); + offsets_data.reserve(orc_column->numElements); + + for (size_t i = 0; i < orc_column->numElements; ++i) + offsets_data.push_back(orc_column->offsets[i + 1]); + + return offsets_column; +} + +static ColumnWithTypeAndName +readColumnWithBooleanData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name) +{ + const auto * orc_bool_column = dynamic_cast(orc_column); + auto internal_type = DataTypeFactory::instance().get("Bool"); + auto internal_column = internal_type->createColumn(); + auto & column_data = assert_cast &>(*internal_column).getData(); + column_data.reserve(orc_bool_column->numElements); + + for (size_t i = 0; i < orc_bool_column->numElements; ++i) + column_data.push_back(static_cast(orc_bool_column->data[i])); + + return {std::move(internal_column), internal_type, column_name}; +} + +/// Inserts numeric data right into internal column data to reduce an overhead +template > +static ColumnWithTypeAndName +readColumnWithNumericData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name) +{ + auto internal_type = std::make_shared>(); + auto internal_column = internal_type->createColumn(); + auto & column_data = static_cast(*internal_column).getData(); + column_data.reserve(orc_column->numElements); + + const auto * orc_int_column = dynamic_cast(orc_column); + column_data.insert_assume_reserved(orc_int_column->data.data(), orc_int_column->data.data() + orc_int_column->numElements); + + return {std::move(internal_column), std::move(internal_type), column_name}; +} + +template > +static ColumnWithTypeAndName +readColumnWithNumericDataCast(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name) +{ + auto internal_type = std::make_shared>(); + auto internal_column = internal_type->createColumn(); + auto & column_data = static_cast(*internal_column).getData(); + column_data.reserve(orc_column->numElements); + + const auto * orc_int_column = dynamic_cast(orc_column); + for (size_t i = 0; i < orc_int_column->numElements; ++i) + column_data.push_back(static_cast(orc_int_column->data[i])); + + return {std::move(internal_column), std::move(internal_type), column_name}; +} + +static ColumnWithTypeAndName +readColumnWithStringData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name) +{ + auto internal_type = std::make_shared(); + auto internal_column = internal_type->createColumn(); + PaddedPODArray & column_chars_t = assert_cast(*internal_column).getChars(); + PaddedPODArray & column_offsets = assert_cast(*internal_column).getOffsets(); + + const auto * orc_str_column = dynamic_cast(orc_column); + size_t reserver_size = 0; + for (size_t i = 0; i < orc_str_column->numElements; ++i) + reserver_size += orc_str_column->length[i] + 1; + column_chars_t.reserve(reserver_size); + column_offsets.reserve(orc_str_column->numElements); + + size_t curr_offset = 0; + for (size_t i = 0; i < orc_str_column->numElements; ++i) + { + const auto * buf = orc_str_column->data[i]; + if (buf) + { + size_t buf_size = orc_str_column->length[i]; + column_chars_t.insert_assume_reserved(buf, buf + buf_size); + curr_offset += buf_size; + } + + column_chars_t.push_back(0); + ++curr_offset; + + column_offsets.push_back(curr_offset); + } + return {std::move(internal_column), std::move(internal_type), column_name}; +} + +static ColumnWithTypeAndName +readColumnWithFixedStringData(const orc::ColumnVectorBatch * orc_column, const orc::Type * orc_type, const String & column_name) +{ + size_t fixed_len = orc_type->getMaximumLength(); + auto internal_type = std::make_shared(fixed_len); + auto internal_column = internal_type->createColumn(); + PaddedPODArray & column_chars_t = assert_cast(*internal_column).getChars(); + column_chars_t.reserve(orc_column->numElements * fixed_len); + + const auto * orc_str_column = dynamic_cast(orc_column); + for (size_t i = 0; i < orc_str_column->numElements; ++i) + { + if (orc_str_column->data[i]) + column_chars_t.insert_assume_reserved(orc_str_column->data[i], orc_str_column->data[i] + orc_str_column->length[i]); + else + column_chars_t.resize_fill(column_chars_t.size() + fixed_len); + } + + return {std::move(internal_column), std::move(internal_type), column_name}; +} + + +template > +static ColumnWithTypeAndName readColumnWithDecimalDataCast( + const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name, DataTypePtr internal_type) +{ + using NativeType = typename DecimalType::NativeType; + static_assert(std::is_same_v || std::is_same_v); + + auto internal_column = internal_type->createColumn(); + auto & column_data = static_cast(*internal_column).getData(); + column_data.reserve(orc_column->numElements); + + const auto * orc_decimal_column = dynamic_cast(orc_column); + for (size_t i = 0; i < orc_decimal_column->numElements; ++i) + { + DecimalType decimal_value; + if constexpr (std::is_same_v) + { + Int128 int128_value; + int128_value.items[0] = orc_decimal_column->values[i].getLowBits(); + int128_value.items[1] = orc_decimal_column->values[i].getHighBits(); + decimal_value.value = static_cast(int128_value); + } + else + decimal_value.value = static_cast(orc_decimal_column->values[i]); + + column_data.push_back(std::move(decimal_value)); + } + + return {std::move(internal_column), internal_type, column_name}; +} + +static ColumnWithTypeAndName +readIPv6ColumnFromBinaryData(const orc::ColumnVectorBatch * orc_column, const orc::Type * orc_type, const String & column_name) +{ + const auto * orc_str_column = dynamic_cast(orc_column); + + for (size_t i = 0; i < orc_str_column->numElements; ++i) + { + /// If at least one value size is not 16 bytes, fallback to reading String column and further cast to IPv6. + if (orc_str_column->data[i] && orc_str_column->length[i] != sizeof(IPv6)) + return readColumnWithStringData(orc_column, orc_type, column_name); + } + + auto internal_type = std::make_shared(); + auto internal_column = internal_type->createColumn(); + auto & ipv6_column = assert_cast(*internal_column); + ipv6_column.reserve(orc_str_column->numElements); + + for (size_t i = 0; i < orc_str_column->numElements; ++i) + { + if (!orc_str_column->data[i]) [[unlikely]] + ipv6_column.insertDefault(); + else + ipv6_column.insertData(orc_str_column->data[i], orc_str_column->length[i]); + } + + return {std::move(internal_column), std::move(internal_type), column_name}; +} + +static ColumnWithTypeAndName +readIPv4ColumnWithInt32Data(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name) +{ + const auto * orc_int_column = dynamic_cast(orc_column); + + auto internal_type = std::make_shared(); + auto internal_column = internal_type->createColumn(); + auto & column_data = assert_cast(*internal_column).getData(); + column_data.reserve(orc_int_column->numElements); + + for (size_t i = 0; i < orc_int_column->numElements; ++i) + column_data.push_back(static_cast(orc_int_column->data[i])); + + return {std::move(internal_column), std::move(internal_type), column_name}; +} + +template +static ColumnWithTypeAndName readColumnWithBigNumberFromBinaryData( + const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name, const DataTypePtr & column_type) +{ + const auto * orc_str_column = dynamic_cast(orc_column); + + auto internal_column = column_type->createColumn(); + auto & integer_column = assert_cast(*internal_column); + integer_column.reserve(orc_str_column->numElements); + + for (size_t i = 0; i < orc_str_column->numElements; ++i) + { + if (!orc_str_column->data[i]) [[unlikely]] + integer_column.insertDefault(); + else + { + if (sizeof(typename ColumnType::ValueType) != orc_str_column->length[i]) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "ValueType size {} of column {} is not equal to size of binary data {}", + sizeof(typename ColumnType::ValueType), + integer_column.getName(), + orc_str_column->length[i]); + + integer_column.insertData(orc_str_column->data[i], orc_str_column->length[i]); + } + } + return {std::move(internal_column), column_type, column_name}; +} + +static ColumnWithTypeAndName readColumnWithDateData( + const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name, const DataTypePtr & type_hint) +{ + DataTypePtr internal_type; + bool check_date_range = false; + /// Make result type Date32 when requested type is actually Date32 or when we use schema inference + if (!type_hint || (type_hint && isDate32(*type_hint))) + { + internal_type = std::make_shared(); + check_date_range = true; + } + else + { + internal_type = std::make_shared(); + } + + const auto * orc_int_column = dynamic_cast(orc_column); + auto internal_column = internal_type->createColumn(); + PaddedPODArray & column_data = assert_cast &>(*internal_column).getData(); + column_data.reserve(orc_int_column->numElements); + + for (size_t i = 0; i < orc_int_column->numElements; ++i) + { + Int32 days_num = static_cast(orc_int_column->data[i]); + if (check_date_range && (days_num > DATE_LUT_MAX_EXTEND_DAY_NUM || days_num < -DAYNUM_OFFSET_EPOCH)) + throw Exception( + ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, + "Input value {} of a column \"{}\" exceeds the range of type Date32", + days_num, + column_name); + + column_data.push_back(days_num); + } + + return {std::move(internal_column), internal_type, column_name}; +} + +static ColumnWithTypeAndName +readColumnWithTimestampData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name) +{ + const auto * orc_ts_column = dynamic_cast(orc_column); + + auto internal_type = std::make_shared(9); + auto internal_column = internal_type->createColumn(); + auto & column_data = assert_cast(*internal_column).getData(); + column_data.reserve(orc_ts_column->numElements); + + constexpr Int64 multiplier = 1e9L; + for (size_t i = 0; i < orc_ts_column->numElements; ++i) + { + Decimal64 decimal64; + decimal64.value = orc_ts_column->data[i] * multiplier + orc_ts_column->nanoseconds[i]; + column_data.emplace_back(std::move(decimal64)); + } + return {std::move(internal_column), std::move(internal_type), column_name}; +} + +static ColumnWithTypeAndName readColumnFromORCColumn( + const orc::ColumnVectorBatch * orc_column, + const orc::Type * orc_type, + const std::string & column_name, + bool inside_nullable, + DataTypePtr type_hint = nullptr) +{ + bool skipped = false; + + if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable())) + && (orc_type->getKind() != orc::LIST && orc_type->getKind() != orc::MAP && orc_type->getKind() != orc::STRUCT)) + { + DataTypePtr nested_type_hint; + if (type_hint) + nested_type_hint = removeNullable(type_hint); + + auto nested_column = readColumnFromORCColumn(orc_column, orc_type, column_name, true, nested_type_hint); + + auto nullmap_column = readByteMapFromORCColumn(orc_column); + auto nullable_type = std::make_shared(std::move(nested_column.type)); + auto nullable_column = ColumnNullable::create(nested_column.column, nullmap_column); + return {std::move(nullable_column), std::move(nullable_type), column_name}; + } + + switch (orc_type->getKind()) + { + case orc::STRING: + case orc::BINARY: + case orc::VARCHAR: { + if (type_hint) + { + switch (type_hint->getTypeId()) + { + case TypeIndex::IPv6: + return readIPv6ColumnFromBinaryData(orc_column, orc_type, column_name); + /// ORC format outputs big integers as binary column, because there is no fixed binary in ORC. + case TypeIndex::Int128: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + case TypeIndex::UInt128: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + case TypeIndex::Int256: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + case TypeIndex::UInt256: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + /// ORC doesn't support Decimal256 as separate type. We read and write it as binary data. + case TypeIndex::Decimal256: + return readColumnWithBigNumberFromBinaryData>( + orc_column, orc_type, column_name, type_hint); + default:; + } + } + return readColumnWithStringData(orc_column, orc_type, column_name); + } + case orc::CHAR: { + if (type_hint) + { + switch (type_hint->getTypeId()) + { + case TypeIndex::Int128: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + case TypeIndex::UInt128: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + case TypeIndex::Int256: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + case TypeIndex::UInt256: + return readColumnWithBigNumberFromBinaryData(orc_column, orc_type, column_name, type_hint); + default:; + } + } + return readColumnWithFixedStringData(orc_column, orc_type, column_name); + } + case orc::BOOLEAN: + return readColumnWithBooleanData(orc_column, orc_type, column_name); + case orc::BYTE: + return readColumnWithNumericDataCast(orc_column, orc_type, column_name); + case orc::SHORT: + return readColumnWithNumericDataCast(orc_column, orc_type, column_name); + case orc::INT: { + /// ORC format doesn't have unsigned integers and we output IPv4 as Int32. + /// We should allow to read it back from Int32. + if (type_hint && isIPv4(type_hint)) + return readIPv4ColumnWithInt32Data(orc_column, orc_type, column_name); + return readColumnWithNumericDataCast(orc_column, orc_type, column_name); + } + case orc::LONG: + return readColumnWithNumericData(orc_column, orc_type, column_name); + case orc::FLOAT: + return readColumnWithNumericDataCast(orc_column, orc_type, column_name); + case orc::DOUBLE: + return readColumnWithNumericData(orc_column, orc_type, column_name); + case orc::DATE: + return readColumnWithDateData(orc_column, orc_type, column_name, type_hint); + case orc::TIMESTAMP: + return readColumnWithTimestampData(orc_column, orc_type, column_name); + case orc::DECIMAL: { + auto interal_type = parseORCType(orc_type, false, skipped); + + auto precision = orc_type->getPrecision(); + if (precision == 0) + precision = 38; + + if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataCast(orc_column, orc_type, column_name, interal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataCast(orc_column, orc_type, column_name, interal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataCast( + orc_column, orc_type, column_name, interal_type); + else + throw Exception( + ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "Decimal precision {} in ORC type {} is out of bound", + precision, + orc_type->toString()); + } + case orc::MAP: { + DataTypePtr key_type_hint; + DataTypePtr value_type_hint; + if (type_hint) + { + const auto * map_type_hint = typeid_cast(type_hint.get()); + if (map_type_hint) + { + key_type_hint = map_type_hint->getKeyType(); + value_type_hint = map_type_hint->getValueType(); + } + } + + const auto * orc_map_column = dynamic_cast(orc_column); + const auto * orc_key_column = orc_map_column->keys.get(); + const auto * orc_value_column = orc_map_column->elements.get(); + const auto * orc_key_type = orc_type->getSubtype(0); + const auto * orc_value_type = orc_type->getSubtype(1); + + auto key_column = readColumnFromORCColumn(orc_key_column, orc_key_type, "key", false, key_type_hint); + if (key_type_hint && !key_type_hint->equals(*key_column.type)) + { + /// Cast key column to target type, because it can happen + /// that parsed type cannot be ClickHouse Map key type. + key_column.column = castColumn(key_column, key_type_hint); + key_column.type = key_type_hint; + } + + auto value_column = readColumnFromORCColumn(orc_value_column, orc_value_type, "value", false, value_type_hint); + if (skipped) + return {}; + + if (value_type_hint && !value_type_hint->equals(*value_column.type)) + { + /// Cast value column to target type, because it can happen + /// that parsed type cannot be ClickHouse Map value type. + value_column.column = castColumn(value_column, value_type_hint); + value_column.type = value_type_hint; + } + + auto offsets_column = readOffsetsFromORCListColumn(orc_map_column); + auto map_column = ColumnMap::create(key_column.column, value_column.column, offsets_column); + auto map_type = std::make_shared(key_column.type, value_column.type); + return {std::move(map_column), std::move(map_type), column_name}; + } + case orc::LIST: { + DataTypePtr nested_type_hint; + if (type_hint) + { + const auto * array_type_hint = typeid_cast(type_hint.get()); + if (array_type_hint) + nested_type_hint = array_type_hint->getNestedType(); + } + + const auto * orc_list_column = dynamic_cast(orc_column); + const auto * orc_nested_column = getNestedORCColumn(orc_list_column); + const auto * orc_nested_type = orc_type->getSubtype(0); + auto nested_column = readColumnFromORCColumn(orc_nested_column, orc_nested_type, column_name, false, nested_type_hint); + + auto offsets_column = readOffsetsFromORCListColumn(orc_list_column); + auto array_column = ColumnArray::create(nested_column.column, offsets_column); + auto array_type = std::make_shared(nested_column.type); + return {std::move(array_column), std::move(array_type), column_name}; + } + case orc::STRUCT: { + Columns tuple_elements; + DataTypes tuple_types; + std::vector tuple_names; + const auto * tuple_type_hint = type_hint ? typeid_cast(type_hint.get()) : nullptr; + + const auto * orc_struct_column = dynamic_cast(orc_column); + for (size_t i = 0; i < orc_type->getSubtypeCount(); ++i) + { + const auto & field_name = orc_type->getFieldName(i); + + DataTypePtr nested_type_hint; + if (tuple_type_hint) + { + if (tuple_type_hint->haveExplicitNames()) + { + auto pos = tuple_type_hint->tryGetPositionByName(field_name); + if (pos) + nested_type_hint = tuple_type_hint->getElement(*pos); + } + else if (size_t(i) < tuple_type_hint->getElements().size()) + nested_type_hint = tuple_type_hint->getElement(i); + } + + const auto * nested_orc_column = orc_struct_column->fields[i]; + const auto * nested_orc_type = orc_type->getSubtype(i); + auto element = readColumnFromORCColumn(nested_orc_column, nested_orc_type, field_name, false, nested_type_hint); + + tuple_elements.emplace_back(std::move(element.column)); + tuple_types.emplace_back(std::move(element.type)); + tuple_names.emplace_back(std::move(element.name)); + } + + auto tuple_column = ColumnTuple::create(std::move(tuple_elements)); + auto tuple_type = std::make_shared(std::move(tuple_types), std::move(tuple_names)); + return {std::move(tuple_column), std::move(tuple_type), column_name}; + } + default: + throw Exception( + ErrorCodes::UNKNOWN_TYPE, "Unsupported ORC type {} while reading column {}.", orc_type->toString(), column_name); + } +} + +void ORCColumnToCHColumn::orcColumnsToCHChunk( + Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values) +{ + Columns columns_list; + columns_list.reserve(header.columns()); + std::unordered_map>> nested_tables; + for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i) + { + const ColumnWithTypeAndName & header_column = header.getByPosition(column_i); + + auto search_column_name = header_column.name; + if (case_insensitive_matching) + boost::to_lower(search_column_name); + + ColumnWithTypeAndName column; + if (!name_to_column_ptr.contains(search_column_name)) + { + bool read_from_nested = false; + + /// Check if it's a column from nested table. + String nested_table_name = Nested::extractTableName(header_column.name); + String search_nested_table_name = nested_table_name; + if (case_insensitive_matching) + boost::to_lower(search_nested_table_name); + if (name_to_column_ptr.contains(search_nested_table_name)) + { + if (!nested_tables.contains(search_nested_table_name)) + { + NamesAndTypesList nested_columns; + for (const auto & name_and_type : header.getNamesAndTypesList()) + { + if (name_and_type.name.starts_with(nested_table_name + ".")) + nested_columns.push_back(name_and_type); + } + auto nested_table_type = Nested::collect(nested_columns).front().type; + + auto orc_column_with_type = name_to_column_ptr[search_nested_table_name]; + ColumnsWithTypeAndName cols = {readColumnFromORCColumn( + orc_column_with_type.first, orc_column_with_type.second, nested_table_name, false, nested_table_type)}; + BlockPtr block_ptr = std::make_shared(cols); + auto column_extractor = std::make_shared(*block_ptr, case_insensitive_matching); + nested_tables[search_nested_table_name] = {block_ptr, column_extractor}; + } + + auto nested_column = nested_tables[search_nested_table_name].second->extractColumn(search_column_name); + if (nested_column) + { + column = *nested_column; + if (case_insensitive_matching) + column.name = header_column.name; + read_from_nested = true; + } + } + + if (!read_from_nested) + { + if (!allow_missing_columns) + throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", header_column.name}; + else + { + column.name = header_column.name; + column.type = header_column.type; + column.column = header_column.column->cloneResized(num_rows); + columns_list.push_back(std::move(column.column)); + if (block_missing_values) + block_missing_values->setBits(column_i, num_rows); + continue; + } + } + } + else + { + auto orc_column_with_type = name_to_column_ptr[search_column_name]; + column = readColumnFromORCColumn( + orc_column_with_type.first, orc_column_with_type.second, header_column.name, false, header_column.type); + } + + if (null_as_default) + insertNullAsDefaultIfNeeded(column, header_column, column_i, block_missing_values); + + try + { + column.column = castColumn(column, header_column.type); + } + catch (Exception & e) + { + e.addMessage(fmt::format( + "while converting column {} from type {} to type {}", + backQuote(header_column.name), + column.type->getName(), + header_column.type->getName())); + throw; + } + + column.type = header_column.type; + columns_list.push_back(std::move(column.column)); + } + + res.setColumns(columns_list, num_rows); +} + +} + +#endif diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h new file mode 100644 index 00000000000..c54eb0520bc --- /dev/null +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h @@ -0,0 +1,129 @@ +#pragma once +#include "config.h" + +#if USE_ORC +# include +# include +# include +# include +# include + +namespace DB +{ + +class ORCInputStream : public orc::InputStream +{ +public: + ORCInputStream(SeekableReadBuffer & in_, size_t file_size_); + + uint64_t getLength() const override; + uint64_t getNaturalReadSize() const override; + void read(void * buf, uint64_t length, uint64_t offset) override; + const std::string & getName() const override { return name; } + +protected: + SeekableReadBuffer & in; + size_t file_size; + std::string name = "ORCInputStream"; +}; + +class ORCInputStreamFromString : public ReadBufferFromOwnString, public ORCInputStream +{ +public: + template + ORCInputStreamFromString(S && s_, size_t file_size_) + : ReadBufferFromOwnString(std::forward(s_)), ORCInputStream(dynamic_cast(*this), file_size_) + { + } +}; + +std::unique_ptr asORCInputStream(ReadBuffer & in, const FormatSettings & settings, std::atomic & is_cancelled); + +// Reads the whole file into a memory buffer, owned by the returned RandomAccessFile. +std::unique_ptr asORCInputStreamLoadIntoMemory(ReadBuffer & in, std::atomic & is_cancelled); + + +class ORCColumnToCHColumn; +class NativeORCBlockInputFormat : public IInputFormat +{ +public: + NativeORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_); + + String getName() const override { return "ORCBlockInputFormat"; } + + void resetParser() override; + + const BlockMissingValues & getMissingValues() const override; + + size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; } + +protected: + Chunk generate() override; + + void onCancel() override { is_stopped = 1; } + +private: + void prepareFileReader(); + bool prepareStripeReader(); + + std::unique_ptr file_reader; + std::unique_ptr stripe_reader; + std::unique_ptr orc_column_to_ch_column; + std::unique_ptr batch; + + // indices of columns to read from ORC file + std::list include_indices; + + BlockMissingValues block_missing_values; + size_t approx_bytes_read_for_chunk; + + const FormatSettings format_settings; + const std::unordered_set & skip_stripes; + + int total_stripes = 0; + int current_stripe = -1; + std::unique_ptr current_stripe_info; + + std::atomic is_stopped{0}; +}; + +class NativeORCSchemaReader : public ISchemaReader +{ +public: + NativeORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + +class ORCColumnToCHColumn +{ +public: + using ORCColumnPtr = const orc::ColumnVectorBatch *; + using ORCTypePtr = const orc::Type *; + using ORCColumnWithType = std::pair; + using NameToColumnPtr = std::unordered_map; + + ORCColumnToCHColumn(const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_ = false); + + void orcTableToCHChunk( + Chunk & res, + const orc::Type * schema, + const orc::ColumnVectorBatch * table, + size_t num_rows, + BlockMissingValues * block_missing_values = nullptr); + + void orcColumnsToCHChunk( + Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr); + +private: + const Block & header; + /// If false, throw exception if some columns in header not exists in arrow table. + bool allow_missing_columns; + bool null_as_default; + bool case_insensitive_matching; +}; +} +#endif diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index ab4e07376f3..37b660bc8e0 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -1,16 +1,17 @@ #include "ORCBlockInputFormat.h" -#include -#if USE_ORC -#include -#include -#include -#include -#include -#include "ArrowBufferedStreams.h" -#include "ArrowColumnToCHColumn.h" -#include "ArrowFieldIndexUtil.h" -#include +#if USE_ORC +# include +# include +# include +# include +# include +# include +# include +# include "ArrowBufferedStreams.h" +# include "ArrowColumnToCHColumn.h" +# include "ArrowFieldIndexUtil.h" +# include "NativeORCBlockInputFormat.h" namespace DB { @@ -154,19 +155,24 @@ NamesAndTypesList ORCSchemaReader::readSchema() *schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference); if (format_settings.schema_inference_make_columns_nullable) return getNamesAndRecursivelyNullableTypes(header); - return header.getNamesAndTypesList();} + return header.getNamesAndTypesList(); +} + void registerInputFormatORC(FormatFactory & factory) { factory.registerInputFormat( - "ORC", - [](ReadBuffer &buf, - const Block &sample, - const RowInputFormatParams &, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, settings); - }); + "ORC", + [](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings) + { + InputFormatPtr res; + if (settings.orc.use_fast_decoder) + res = std::make_shared(buf, sample, settings); + else + res = std::make_shared(buf, sample, settings); + + return res; + }); factory.markFormatSupportsSubsetOfColumns("ORC"); } @@ -176,7 +182,13 @@ void registerORCSchemaReader(FormatFactory & factory) "ORC", [](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, settings); + SchemaReaderPtr res; + if (settings.orc.use_fast_decoder) + res = std::make_shared(buf, settings); + else + res = std::make_shared(buf, settings); + + return res; } ); diff --git a/src/Processors/examples/CMakeLists.txt b/src/Processors/examples/CMakeLists.txt index 5d43a0d7d08..0c8734aee3c 100644 --- a/src/Processors/examples/CMakeLists.txt +++ b/src/Processors/examples/CMakeLists.txt @@ -2,3 +2,9 @@ if (TARGET ch_contrib::hivemetastore) clickhouse_add_executable (comma_separated_streams comma_separated_streams.cpp) target_link_libraries (comma_separated_streams PRIVATE dbms) endif() + +if (USE_ORC) + clickhouse_add_executable (native_orc native_orc.cpp) + target_link_libraries (native_orc PRIVATE dbms) + target_include_directories (native_orc PRIVATE ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include) +endif () diff --git a/src/Processors/examples/native_orc.cpp b/src/Processors/examples/native_orc.cpp new file mode 100644 index 00000000000..201e87b1f56 --- /dev/null +++ b/src/Processors/examples/native_orc.cpp @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +using namespace DB; + +int main() +{ + /// Read schema from orc file + String path = "/path/to/orc/file"; + // String path = "/data1/clickhouse_official/data/user_files/bigolive_audience_stats_orc.orc"; + { + ReadBufferFromFile in(path); + NativeORCSchemaReader schema_reader(in, {}); + auto schema = schema_reader.readSchema(); + std::cout << "schema:" << schema.toString() << std::endl; + } + + /// Read schema from string with orc data + { + ReadBufferFromFile in(path); + + String content; + WriteBufferFromString out(content); + + copyData(in, out); + + content.resize(out.count()); + ReadBufferFromString in2(content); + NativeORCSchemaReader schema_reader(in2, {}); + auto schema = schema_reader.readSchema(); + std::cout << "schema:" << schema.toString() << std::endl; + } + return 0; +}