mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Merge 4e155fae74
into b94a7167a8
This commit is contained in:
commit
23fb4794f1
@ -1089,6 +1089,7 @@ class IColumn;
|
||||
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
|
||||
M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \
|
||||
M(String, input_format_orc_reader_time_zone_name, "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT.", 0) \
|
||||
M(Bool, input_format_orc_dictionary_as_low_cardinality, true, "Treat ORC dictionary encoded columns as LowCardinality columns while reading ORC files", 0) \
|
||||
M(Bool, input_format_parquet_allow_missing_columns, true, "Allow missing columns while reading Parquet input formats", 0) \
|
||||
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
|
||||
M(Bool, input_format_arrow_allow_missing_columns, true, "Allow missing columns while reading Arrow input formats", 0) \
|
||||
|
@ -77,6 +77,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
|
||||
{"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
|
||||
{"create_if_not_exists", false, false, "New setting."},
|
||||
{"allow_materialized_view_with_bad_select", true, true, "Support (but not enable yet) stricter validation in CREATE MATERIALIZED VIEW"},
|
||||
{"input_format_orc_dictionary_as_low_cardinality", true, true, "Treat ORC dictionary encoded columns as LowCardinality columns while reading ORC files"},
|
||||
{"output_format_always_quote_identifiers", false, false, "New setting."},
|
||||
{"output_format_identifier_quoting_style", "Backticks", "Backticks", "New setting."},
|
||||
{"parallel_replicas_mark_segment_size", 128, 0, "Value for this setting now determined automatically"},
|
||||
|
@ -237,9 +237,6 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
|
||||
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
|
||||
format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
|
||||
@ -248,6 +245,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
|
||||
format_settings.orc.filter_push_down = settings.input_format_orc_filter_push_down;
|
||||
format_settings.orc.reader_time_zone_name = settings.input_format_orc_reader_time_zone_name;
|
||||
format_settings.orc.dictionary_as_low_cardinality = settings.input_format_orc_dictionary_as_low_cardinality;
|
||||
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
||||
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
||||
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;
|
||||
|
@ -415,6 +415,7 @@ struct FormatSettings
|
||||
bool filter_push_down = true;
|
||||
UInt64 output_row_index_stride = 10'000;
|
||||
String reader_time_zone_name = "GMT";
|
||||
bool dictionary_as_low_cardinality = true;
|
||||
double output_dictionary_key_size_threshold = 0.0;
|
||||
} orc{};
|
||||
|
||||
|
@ -6,6 +6,7 @@
|
||||
# include <Columns/ColumnMap.h>
|
||||
# include <Columns/ColumnNullable.h>
|
||||
# include <Columns/ColumnString.h>
|
||||
# include <Columns/ColumnsCommon.h>
|
||||
# include <Columns/ColumnsDateTime.h>
|
||||
# include <Columns/ColumnsNumber.h>
|
||||
# include <DataTypes/DataTypeArray.h>
|
||||
@ -16,13 +17,13 @@
|
||||
# include <DataTypes/DataTypeIPv4andIPv6.h>
|
||||
# include <DataTypes/DataTypeLowCardinality.h>
|
||||
# include <DataTypes/DataTypeMap.h>
|
||||
# include <DataTypes/DataTypeNested.h>
|
||||
# include <DataTypes/DataTypeNullable.h>
|
||||
# include <DataTypes/DataTypeString.h>
|
||||
# include <DataTypes/DataTypeTuple.h>
|
||||
# include <DataTypes/DataTypesDecimal.h>
|
||||
# include <DataTypes/DataTypesNumber.h>
|
||||
# include <DataTypes/NestedUtils.h>
|
||||
# include <DataTypes/DataTypeNested.h>
|
||||
# include <Formats/FormatFactory.h>
|
||||
# include <Formats/SchemaInferenceUtils.h>
|
||||
# include <Formats/insertNullAsDefaultIfNeeded.h>
|
||||
@ -35,6 +36,8 @@
|
||||
# include <Common/FieldVisitorsAccurateComparison.h>
|
||||
# include "ArrowBufferedStreams.h"
|
||||
|
||||
# include <orc/Vector.hh>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -110,7 +113,21 @@ static const orc::Type * getORCTypeByName(const orc::Type & schema, const String
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_with_unsupported_types, bool & skipped)
|
||||
/// Reports whether the column identified by `orc_type` uses DICTIONARY or DICTIONARY_V2 encoding
/// in the stripe described by `stripe_info`.
/// When no stripe information is supplied (nullptr) the column is reported as not dictionary encoded.
static bool isDictionaryEncoded(const orc::StripeInformation * stripe_info, const orc::Type * orc_type)
{
    if (stripe_info == nullptr)
        return false;

    switch (stripe_info->getColumnEncoding(orc_type->getColumnId()))
    {
        case orc::ColumnEncodingKind_DICTIONARY:
        case orc::ColumnEncodingKind_DICTIONARY_V2:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
static DataTypePtr parseORCType(
|
||||
const orc::Type * orc_type,
|
||||
bool skip_columns_with_unsupported_types,
|
||||
bool dictionary_as_low_cardinality,
|
||||
const orc::StripeInformation * stripe_info,
|
||||
bool & skipped)
|
||||
{
|
||||
assert(orc_type != nullptr);
|
||||
|
||||
@ -137,12 +154,25 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
|
||||
return std::make_shared<DataTypeDateTime64>(9);
|
||||
case orc::TypeKind::TIMESTAMP_INSTANT:
|
||||
return std::make_shared<DataTypeDateTime64>(9, "UTC");
|
||||
case orc::TypeKind::CHAR:
|
||||
case orc::TypeKind::VARCHAR:
|
||||
case orc::TypeKind::BINARY:
|
||||
case orc::TypeKind::STRING:
|
||||
return std::make_shared<DataTypeString>();
|
||||
case orc::TypeKind::CHAR:
|
||||
return std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
|
||||
case orc::TypeKind::STRING: {
|
||||
DataTypePtr type;
|
||||
if (orc_type->getKind() == orc::TypeKind::CHAR)
|
||||
type = std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
|
||||
else
|
||||
type = std::make_shared<DataTypeString>();
|
||||
|
||||
// std::cout << "type:" << type->getName() << std::endl;
|
||||
// std::cout << "dictionary_as_low_cardinality:" << dictionary_as_low_cardinality << std::endl;
|
||||
|
||||
/// Wrap type in LowCardinality if orc column is dictionary encoded and dictionary_as_low_cardinality is true
|
||||
if (dictionary_as_low_cardinality && isDictionaryEncoded(stripe_info, orc_type))
|
||||
type = std::make_shared<DataTypeLowCardinality>(type);
|
||||
|
||||
return type;
|
||||
}
|
||||
case orc::TypeKind::DECIMAL: {
|
||||
UInt64 precision = orc_type->getPrecision();
|
||||
UInt64 scale = orc_type->getScale();
|
||||
@ -158,7 +188,8 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
|
||||
if (subtype_count != 1)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc List type {}", orc_type->toString());
|
||||
|
||||
DataTypePtr nested_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped);
|
||||
DataTypePtr nested_type = parseORCType(
|
||||
orc_type->getSubtype(0), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
|
||||
@ -168,11 +199,12 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
|
||||
if (subtype_count != 2)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc Map type {}", orc_type->toString());
|
||||
|
||||
DataTypePtr key_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped);
|
||||
DataTypePtr key_type = parseORCType(
|
||||
orc_type->getSubtype(0), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
|
||||
DataTypePtr value_type = parseORCType(orc_type->getSubtype(1), skip_columns_with_unsupported_types, skipped);
|
||||
DataTypePtr value_type = parseORCType(orc_type->getSubtype(1), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
|
||||
@ -186,7 +218,8 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
|
||||
|
||||
for (size_t i = 0; i < orc_type->getSubtypeCount(); ++i)
|
||||
{
|
||||
auto parsed_type = parseORCType(orc_type->getSubtype(i), skip_columns_with_unsupported_types, skipped);
|
||||
auto parsed_type
|
||||
= parseORCType(orc_type->getSubtype(i), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
|
||||
@ -489,7 +522,7 @@ static void buildORCSearchArgumentImpl(
|
||||
/// For queries with where condition like "a > 10", if a column contains negative values such as "-1", pushing or not pushing
|
||||
/// down filters would result in different outputs.
|
||||
bool skipped = false;
|
||||
auto expect_type = makeNullableRecursively(parseORCType(orc_type, true, skipped));
|
||||
auto expect_type = makeNullableRecursively(parseORCType(orc_type, true, false, nullptr, skipped));
|
||||
const ColumnWithTypeAndName * column = header.findByName(column_name, format_settings.orc.case_insensitive_column_matching);
|
||||
if (!expect_type || !column)
|
||||
{
|
||||
@ -857,11 +890,18 @@ void NativeORCBlockInputFormat::prepareFileReader()
|
||||
total_stripes = static_cast<int>(file_reader->getNumberOfStripes());
|
||||
current_stripe = -1;
|
||||
|
||||
|
||||
std::unique_ptr<orc::StripeInformation> stripe_info;
|
||||
if (file_reader->getNumberOfStripes())
|
||||
stripe_info = file_reader->getStripe(0);
|
||||
|
||||
orc_column_to_ch_column = std::make_unique<ORCColumnToCHColumn>(
|
||||
getPort().getHeader(),
|
||||
format_settings.orc.allow_missing_columns,
|
||||
format_settings.null_as_default,
|
||||
format_settings.orc.case_insensitive_column_matching);
|
||||
format_settings.orc.case_insensitive_column_matching,
|
||||
format_settings.orc.dictionary_as_low_cardinality,
|
||||
std::move(stripe_info));
|
||||
|
||||
const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
|
||||
const auto & header = getPort().getHeader();
|
||||
@ -899,6 +939,7 @@ bool NativeORCBlockInputFormat::prepareStripeReader()
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "ORC stripe {} has no rows", current_stripe);
|
||||
|
||||
orc::RowReaderOptions row_reader_options;
|
||||
row_reader_options.setEnableLazyDecoding(format_settings.orc.dictionary_as_low_cardinality);
|
||||
row_reader_options.includeTypes(include_indices);
|
||||
row_reader_options.setTimezoneName(format_settings.orc.reader_time_zone_name);
|
||||
row_reader_options.range(current_stripe_info->getOffset(), current_stripe_info->getLength());
|
||||
@ -989,15 +1030,25 @@ NamesAndTypesList NativeORCSchemaReader::readSchema()
|
||||
std::atomic<int> is_stopped = 0;
|
||||
getFileReader(in, file_reader, format_settings, is_stopped);
|
||||
|
||||
|
||||
const auto & schema = file_reader->getType();
|
||||
Block header;
|
||||
std::unique_ptr<orc::StripeInformation> stripe_info;
|
||||
if (file_reader->getNumberOfStripes())
|
||||
stripe_info = file_reader->getStripe(0);
|
||||
|
||||
for (size_t i = 0; i < schema.getSubtypeCount(); ++i)
|
||||
{
|
||||
const std::string & name = schema.getFieldName(i);
|
||||
const orc::Type * orc_type = schema.getSubtype(i);
|
||||
|
||||
bool skipped = false;
|
||||
DataTypePtr type = parseORCType(orc_type, format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference, skipped);
|
||||
DataTypePtr type = parseORCType(
|
||||
orc_type,
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference,
|
||||
format_settings.orc.dictionary_as_low_cardinality,
|
||||
stripe_info.get(),
|
||||
skipped);
|
||||
if (!skipped)
|
||||
header.insert(ColumnWithTypeAndName{type, name});
|
||||
}
|
||||
@ -1008,11 +1059,18 @@ NamesAndTypesList NativeORCSchemaReader::readSchema()
|
||||
}
|
||||
|
||||
ORCColumnToCHColumn::ORCColumnToCHColumn(
|
||||
const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_)
|
||||
const Block & header_,
|
||||
bool allow_missing_columns_,
|
||||
bool null_as_default_,
|
||||
bool case_insensitive_matching_,
|
||||
bool dictionary_as_low_cardinality_,
|
||||
std::unique_ptr<orc::StripeInformation> stripe_info_)
|
||||
: header(header_)
|
||||
, allow_missing_columns(allow_missing_columns_)
|
||||
, null_as_default(null_as_default_)
|
||||
, case_insensitive_matching(case_insensitive_matching_)
|
||||
, dictionary_as_low_cardinality(dictionary_as_low_cardinality_)
|
||||
, stripe_info(std::move(stripe_info_))
|
||||
{
|
||||
}
|
||||
|
||||
@ -1126,6 +1184,117 @@ readColumnWithNumericDataCast(const orc::ColumnVectorBatch * orc_column, const o
|
||||
return {std::move(internal_column), std::move(internal_type), column_name};
|
||||
}
|
||||
|
||||
template <bool fixed_string>
|
||||
static ColumnWithTypeAndName readColumnWithEncodedStringOrFixedStringData(
|
||||
const orc::ColumnVectorBatch * orc_column, const orc::Type * orc_type, const String & column_name, bool nullable)
|
||||
{
|
||||
const auto * orc_str_column = dynamic_cast<const orc::EncodedStringVectorBatch *>(orc_column);
|
||||
size_t rows = orc_str_column->numElements;
|
||||
const auto & orc_dict = *orc_str_column->dictionary;
|
||||
size_t dict_size = orc_dict.dictionaryOffset.size() - 1;
|
||||
|
||||
/// Fill CH holder_column with orc dictionary
|
||||
/// Note that holder_column is always a ColumnString or ColumnFixedstring whether nullable is true or false, because ORC dictionary doesn't contain null values.
|
||||
DataTypePtr holder_type;
|
||||
if constexpr (fixed_string)
|
||||
holder_type = std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
|
||||
else
|
||||
holder_type = std::make_shared<DataTypeString>();
|
||||
|
||||
auto holder_column = holder_type->createColumn();
|
||||
if constexpr (fixed_string)
|
||||
{
|
||||
const size_t n = orc_type->getMaximumLength();
|
||||
auto & concrete_holder_column = assert_cast<ColumnFixedString &>(*holder_column);
|
||||
PaddedPODArray<UInt8> & column_chars_t = concrete_holder_column.getChars();
|
||||
size_t reserve_size = dict_size * n;
|
||||
column_chars_t.resize_exact(reserve_size);
|
||||
size_t curr_offset = 0;
|
||||
for (size_t i = 0; i < dict_size; ++i)
|
||||
{
|
||||
const auto * buf = orc_dict.dictionaryBlob.data() + orc_dict.dictionaryOffset[i];
|
||||
size_t buf_size = orc_dict.dictionaryOffset[i + 1] - orc_dict.dictionaryOffset[i];
|
||||
memcpy(&column_chars_t[curr_offset], buf, buf_size);
|
||||
curr_offset += n;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & concrete_holder_column = assert_cast<ColumnString &>(*holder_column);
|
||||
PaddedPODArray<UInt8> & column_chars_t = concrete_holder_column.getChars();
|
||||
PaddedPODArray<UInt64> & column_offsets = concrete_holder_column.getOffsets();
|
||||
|
||||
size_t reserve_size = orc_dict.dictionaryBlob.size() + dict_size;
|
||||
column_chars_t.resize_exact(reserve_size);
|
||||
column_offsets.resize_exact(dict_size);
|
||||
size_t curr_offset = 0;
|
||||
for (size_t i = 0; i < dict_size; ++i)
|
||||
{
|
||||
const auto * buf = orc_dict.dictionaryBlob.data() + orc_dict.dictionaryOffset[i];
|
||||
size_t buf_size = orc_dict.dictionaryOffset[i + 1] - orc_dict.dictionaryOffset[i];
|
||||
memcpy(&column_chars_t[curr_offset], buf, buf_size);
|
||||
curr_offset += buf_size;
|
||||
|
||||
column_chars_t[curr_offset] = 0;
|
||||
++curr_offset;
|
||||
|
||||
column_offsets[i] = curr_offset;
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert CH dictionary_column from holder_column
|
||||
DataTypePtr nested_type = nullable ? std::make_shared<DataTypeNullable>(holder_type) : holder_type;
|
||||
auto internal_type = std::make_shared<DataTypeLowCardinality>(std::move(nested_type));
|
||||
auto tmp_internal_column = internal_type->createColumn();
|
||||
auto dictionary_column = IColumn::mutate(assert_cast<ColumnLowCardinality *>(tmp_internal_column.get())->getDictionaryPtr());
|
||||
auto index_column
|
||||
= dynamic_cast<IColumnUnique *>(dictionary_column.get())->uniqueInsertRangeFrom(*holder_column, 0, holder_column->size());
|
||||
|
||||
/// Fill index_column and wrap it with LowCardinality
|
||||
auto call_by_type = [&](auto index_type) -> MutableColumnPtr
|
||||
{
|
||||
using IndexType = decltype(index_type);
|
||||
const ColumnVector<IndexType> * concrete_index_column = checkAndGetColumn<ColumnVector<IndexType>>(index_column.get());
|
||||
if (!concrete_index_column)
|
||||
return nullptr;
|
||||
|
||||
const auto & index_data = concrete_index_column->getData();
|
||||
auto new_index_column = ColumnVector<IndexType>::create(rows);
|
||||
auto & new_index_data = dynamic_cast<ColumnVector<IndexType> &>(*new_index_column).getData();
|
||||
|
||||
if (!orc_str_column->hasNulls)
|
||||
{
|
||||
for (size_t i = 0; i < rows; ++i)
|
||||
{
|
||||
/// First map row index to orc dictionary index, then map orc dictionary index to CH dictionary index
|
||||
new_index_data[i] = index_data[orc_str_column->index[i]];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = 0; i < rows; ++i)
|
||||
{
|
||||
/// Set index 0 if we meet null value. If dictionary_column is nullable, 0 represents null value.
|
||||
/// Otherwise 0 represents default string value, it is reasonable because null values are converted to default values when casting nullable column to non-nullable.
|
||||
new_index_data[i] = orc_str_column->notNull[i] ? index_data[orc_str_column->index[i]] : 0;
|
||||
}
|
||||
}
|
||||
|
||||
return ColumnLowCardinality::create(std::move(dictionary_column), std::move(new_index_column));
|
||||
};
|
||||
|
||||
MutableColumnPtr internal_column;
|
||||
if (!internal_column)
|
||||
internal_column = call_by_type(UInt8());
|
||||
if (!internal_column)
|
||||
internal_column = call_by_type(UInt16());
|
||||
if (!internal_column)
|
||||
internal_column = call_by_type(UInt32());
|
||||
if (!internal_column)
|
||||
internal_column = call_by_type(UInt64());
|
||||
return {std::move(internal_column), std::move(internal_type), column_name};
|
||||
}
|
||||
|
||||
static ColumnWithTypeAndName
|
||||
readColumnWithStringData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name)
|
||||
{
|
||||
@ -1369,16 +1538,16 @@ readColumnWithTimestampData(const orc::ColumnVectorBatch * orc_column, const orc
|
||||
return {std::move(internal_column), std::move(internal_type), column_name};
|
||||
}
|
||||
|
||||
static ColumnWithTypeAndName readColumnFromORCColumn(
|
||||
ColumnWithTypeAndName ORCColumnToCHColumn::readColumnFromORCColumn(
|
||||
const orc::ColumnVectorBatch * orc_column,
|
||||
const orc::Type * orc_type,
|
||||
const std::string & column_name,
|
||||
bool inside_nullable,
|
||||
DataTypePtr type_hint = nullptr)
|
||||
DataTypePtr type_hint) const
|
||||
{
|
||||
bool skipped = false;
|
||||
|
||||
if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable()))
|
||||
if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable())) && !orc_column->isEncoded
|
||||
&& (orc_type->getKind() != orc::LIST && orc_type->getKind() != orc::MAP && orc_type->getKind() != orc::STRUCT))
|
||||
{
|
||||
DataTypePtr nested_type_hint;
|
||||
@ -1420,7 +1589,14 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
|
||||
default:;
|
||||
}
|
||||
}
|
||||
return readColumnWithStringData(orc_column, orc_type, column_name);
|
||||
|
||||
if (orc_column->isEncoded && dictionary_as_low_cardinality)
|
||||
{
|
||||
bool nullable = type_hint ? isNullableOrLowCardinalityNullable(type_hint) : true;
|
||||
return readColumnWithEncodedStringOrFixedStringData<false>(orc_column, orc_type, column_name, nullable);
|
||||
}
|
||||
else
|
||||
return readColumnWithStringData(orc_column, orc_type, column_name);
|
||||
}
|
||||
case orc::CHAR: {
|
||||
if (type_hint)
|
||||
@ -1438,7 +1614,14 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
|
||||
default:;
|
||||
}
|
||||
}
|
||||
return readColumnWithFixedStringData(orc_column, orc_type, column_name);
|
||||
|
||||
if (orc_column->isEncoded && dictionary_as_low_cardinality)
|
||||
{
|
||||
bool nullable = type_hint ? isNullableOrLowCardinalityNullable(type_hint) : true;
|
||||
return readColumnWithEncodedStringOrFixedStringData<true>(orc_column, orc_type, column_name, nullable);
|
||||
}
|
||||
else
|
||||
return readColumnWithFixedStringData(orc_column, orc_type, column_name);
|
||||
}
|
||||
case orc::BOOLEAN:
|
||||
return readColumnWithBooleanData(orc_column, orc_type, column_name);
|
||||
@ -1465,7 +1648,7 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
|
||||
case orc::TIMESTAMP_INSTANT:
|
||||
return readColumnWithTimestampData(orc_column, orc_type, column_name);
|
||||
case orc::DECIMAL: {
|
||||
auto interal_type = parseORCType(orc_type, false, skipped);
|
||||
auto interal_type = parseORCType(orc_type, false, false, nullptr, skipped);
|
||||
|
||||
auto precision = orc_type->getPrecision();
|
||||
if (precision == 0)
|
||||
|
@ -111,7 +111,13 @@ public:
|
||||
using ORCColumnWithType = std::pair<ORCColumnPtr, ORCTypePtr>;
|
||||
using NameToColumnPtr = std::unordered_map<std::string, ORCColumnWithType>;
|
||||
|
||||
ORCColumnToCHColumn(const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_ = false);
|
||||
ORCColumnToCHColumn(
|
||||
const Block & header_,
|
||||
bool allow_missing_columns_,
|
||||
bool null_as_default_,
|
||||
bool case_insensitive_matching_ = false,
|
||||
bool dictionary_as_low_cardinality_ = false,
|
||||
std::unique_ptr<orc::StripeInformation> stripe_info_ = {});
|
||||
|
||||
void orcTableToCHChunk(
|
||||
Chunk & res,
|
||||
@ -124,11 +130,20 @@ public:
|
||||
Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr);
|
||||
|
||||
private:
|
||||
ColumnWithTypeAndName readColumnFromORCColumn(
|
||||
const orc::ColumnVectorBatch * orc_column,
|
||||
const orc::Type * orc_type,
|
||||
const std::string & column_name,
|
||||
bool inside_nullable,
|
||||
DataTypePtr type_hint = nullptr) const;
|
||||
|
||||
const Block & header;
|
||||
/// If false, throw exception if some columns in header not exists in arrow table.
|
||||
bool allow_missing_columns;
|
||||
bool null_as_default;
|
||||
bool case_insensitive_matching;
|
||||
bool dictionary_as_low_cardinality;
|
||||
std::unique_ptr<orc::StripeInformation> stripe_info;
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user