Merge pull request #40173 from Avogar/arrow-dict

Improve and fix dictionaries in Arrow format
This commit is contained in:
Kruglov Pavel 2022-08-18 20:54:55 +02:00 committed by GitHub
commit b67cb9e378
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 301 additions and 114 deletions

View File

@ -70,6 +70,7 @@ namespace ErrorCodes
extern const int THERE_IS_NO_COLUMN; extern const int THERE_IS_NO_COLUMN;
extern const int UNKNOWN_EXCEPTION; extern const int UNKNOWN_EXCEPTION;
extern const int INCORRECT_NUMBER_OF_COLUMNS; extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_DATA;
} }
/// Inserts numeric data right into internal column data to reduce an overhead /// Inserts numeric data right into internal column data to reduce an overhead
@ -266,6 +267,9 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr<arrow::Ch
/// Creates a null bytemap from arrow's null bitmap /// Creates a null bytemap from arrow's null bitmap
static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column) static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
{ {
if (!arrow_column->null_count())
return ColumnUInt8::create(arrow_column->length(), 0);
auto nullmap_column = ColumnUInt8::create(); auto nullmap_column = ColumnUInt8::create();
PaddedPODArray<UInt8> & bytemap_data = assert_cast<ColumnVector<UInt8> &>(*nullmap_column).getData(); PaddedPODArray<UInt8> & bytemap_data = assert_cast<ColumnVector<UInt8> &>(*nullmap_column).getData();
bytemap_data.reserve(arrow_column->length()); bytemap_data.reserve(arrow_column->length());
@ -298,14 +302,121 @@ static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedAr
return offsets_column; return offsets_column;
} }
static ColumnPtr readColumnWithIndexesData(std::shared_ptr<arrow::ChunkedArray> & arrow_column) /*
* Arrow Dictionary and ClickHouse LowCardinality types are a bit different.
* Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index.
* It doesn't have the concept of null or default values.
* An empty string is just a regular value appended at any position of the dictionary.
* Null values have an index of 0, but it should be ignored since the nullmap will return null.
* In ClickHouse LowCardinality, it's different. The dictionary contains null (if dictionary type is Nullable)
* and default values at the beginning. [default, ...] when default values have index of 0 or [null, default, ...]
* when null values have an index of 0 and default values have an index of 1.
* So, we should remap indexes while converting Arrow Dictionary to ClickHouse LowCardinality
* */
template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
static ColumnWithTypeAndName readColumnWithIndexesDataImpl(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name, Int64 default_value_index, NumericType dict_size, bool is_nullable)
{
auto internal_type = std::make_shared<DataTypeNumber<NumericType>>();
auto internal_column = internal_type->createColumn();
auto & column_data = static_cast<VectorType &>(*internal_column).getData();
column_data.reserve(arrow_column->length());
NumericType shift = is_nullable ? 2 : 1;
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
std::shared_ptr<arrow::Array> chunk = arrow_column->chunk(chunk_i);
if (chunk->length() == 0)
continue;
/// buffers[0] is a null bitmap and buffers[1] are actual values
std::shared_ptr<arrow::Buffer> buffer = chunk->data()->buffers[1];
const auto * data = reinterpret_cast<const NumericType *>(buffer->data());
/// Check that indexes are correct (protection against corrupted files)
for (int64_t i = 0; i != chunk->length(); ++i)
{
if (data[i] < 0 || data[i] >= dict_size)
throw Exception(ErrorCodes::INCORRECT_DATA, "Index {} in Dictionary column is out of bounds, dictionary size is {}", Int64(data[i]), UInt64(dict_size));
}
/// If dictionary type is not nullable and arrow dictionary contains default type
/// at 0 index, we don't need to remap anything (it's the case when this data
/// was generated by ClickHouse)
if (!is_nullable && default_value_index == 0)
{
column_data.insert_assume_reserved(data, data + chunk->length());
}
/// If dictionary don't contain default value, we should move all indexes
/// to the right one or two (if dictionary is Nullable) positions
/// Example:
/// Dictionary:
/// dict: ["one", "two"]
/// indexes: [0, 1, 0]
/// LowCardinality:
/// dict: ["", "one", "two"]
/// indexes: [1, 2, 1]
/// LowCardinality(Nullable):
/// dict: [null, "", "one", "two"]
/// indexes: [2, 3, 2]
else if (default_value_index == -1)
{
for (int64_t i = 0; i != chunk->length(); ++i)
{
if (chunk->IsNull(i))
column_data.push_back(0);
else
column_data.push_back(data[i] + shift);
}
}
/// If dictionary contains default value, we change all indexes of it to
/// 0 or 1 (if dictionary type is Nullable) and move all indexes
/// that are less then default value index to the right one or two
/// (if dictionary is Nullable) position and all indexes that are
/// greater then default value index zero or one (if dictionary is Nullable)
/// positions.
/// Example:
/// Dictionary:
/// dict: ["one", "two", "", "three"]
/// indexes: [0, 1, 2, 3, 0]
/// LowCardinality :
/// dict: ["", "one", "two", "three"]
/// indexes: [1, 2, 0, 3, 1]
/// LowCardinality(Nullable):
/// dict: [null, "", "one", "two", "three"]
/// indexes: [2, 3, 1, 4, 2]
else
{
NumericType new_default_index = is_nullable ? 1 : 0;
NumericType default_index = NumericType(default_value_index);
for (int64_t i = 0; i != chunk->length(); ++i)
{
if (chunk->IsNull(i))
column_data.push_back(0);
else
{
NumericType value = data[i];
if (value == default_index)
value = new_default_index;
else if (value < default_index)
value += shift;
else
value += shift - 1;
column_data.push_back(value);
}
}
}
}
return {std::move(internal_column), std::move(internal_type), column_name};
}
static ColumnPtr readColumnWithIndexesData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, Int64 default_value_index, UInt64 dict_size, bool is_nullable)
{ {
switch (arrow_column->type()->id()) switch (arrow_column->type()->id())
{ {
# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
case ARROW_NUMERIC_TYPE: \ case ARROW_NUMERIC_TYPE: \
{ \ { \
return readColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, "").column; \ return readColumnWithIndexesDataImpl<CPP_NUMERIC_TYPE>(arrow_column, "", default_value_index, dict_size, is_nullable).column; \
} }
FOR_ARROW_INDEXES_TYPES(DISPATCH) FOR_ARROW_INDEXES_TYPES(DISPATCH)
# undef DISPATCH # undef DISPATCH
@ -327,85 +438,25 @@ static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr
return std::make_shared<arrow::ChunkedArray>(array_vector); return std::make_shared<arrow::ChunkedArray>(array_vector);
} }
static ColumnWithTypeAndName createLCColumnFromArrowDictionaryValues(
const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
const ColumnPtr & indexes_column,
const String & column_name
)
{
auto lc_type = std::make_shared<DataTypeLowCardinality>(dict_values->type);
auto lc_column = lc_type->createColumn();
for (auto i = 0u; i < indexes_column->size(); i++)
{
Field f;
dict_values->column->get(indexes_column->getUInt(i), f);
lc_column->insert(f);
}
return {std::move(lc_column), std::move(lc_type), column_name};
}
/*
* Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index.
* It doesn't have the concept of null or default values.
* An empty string is just a regular value appended at any position of the dictionary.
* Null values have an index of 0, but it should be ignored since the nullmap will return null.
* In ClickHouse LowCardinality, it's different. The dictionary contains null and default values at the beginning.
* [null, default, ...]. Therefore, null values have an index of 0 and default values have an index of 1.
* No nullmap is used.
* */
static ColumnWithTypeAndName createLCOfNullableColumnFromArrowDictionaryValues(
const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
const ColumnPtr & indexes_column,
const ColumnPtr & nullmap_column,
const String & column_name
)
{
/*
* ArrowColumn format handles nulls by maintaining a nullmap column, there is no nullable type.
* Therefore, dict_values->type is the actual data type/ non-nullable. It needs to be transformed into nullable
* so LC column is created from nullable type and a null value at the beginning of the collection
* is automatically added.
* */
auto lc_type = std::make_shared<DataTypeLowCardinality>(makeNullable(dict_values->type));
auto lc_column = lc_type->createColumn();
for (auto i = 0u; i < indexes_column->size(); i++)
{
if (nullmap_column && nullmap_column->getBool(i))
{
lc_column->insertDefault();
}
else
{
Field f;
dict_values->column->get(indexes_column->getUInt(i), f);
lc_column->insert(f);
}
}
return {std::move(lc_column), std::move(lc_type), column_name};
}
static ColumnWithTypeAndName readColumnFromArrowColumn( static ColumnWithTypeAndName readColumnFromArrowColumn(
std::shared_ptr<arrow::ChunkedArray> & arrow_column, std::shared_ptr<arrow::ChunkedArray> & arrow_column,
const std::string & column_name, const std::string & column_name,
const std::string & format_name, const std::string & format_name,
bool is_nullable, bool is_nullable,
std::unordered_map<String, std::shared_ptr<ColumnWithTypeAndName>> & dictionary_values, std::unordered_map<String, ArrowColumnToCHColumn::DictionaryInfo> & dictionary_infos,
bool read_ints_as_dates,
bool allow_null_type, bool allow_null_type,
bool skip_columns_with_unsupported_types, bool skip_columns_with_unsupported_types,
bool & skipped) bool & skipped,
DataTypePtr type_hint = nullptr)
{ {
if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST if (!is_nullable && (arrow_column->null_count() || (type_hint && type_hint->isNullable())) && arrow_column->type()->id() != arrow::Type::LIST
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT && && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT &&
arrow_column->type()->id() != arrow::Type::DICTIONARY) arrow_column->type()->id() != arrow::Type::DICTIONARY)
{ {
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); DataTypePtr nested_type_hint;
if (type_hint)
nested_type_hint = removeNullable(type_hint);
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
if (skipped) if (skipped)
return {}; return {};
auto nullmap_column = readByteMapFromArrowColumn(arrow_column); auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
@ -435,14 +486,14 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
case arrow::Type::UINT16: case arrow::Type::UINT16:
{ {
auto column = readColumnWithNumericData<UInt16>(arrow_column, column_name); auto column = readColumnWithNumericData<UInt16>(arrow_column, column_name);
if (read_ints_as_dates) if (type_hint && (isDateOrDate32(type_hint) || isDateTime(type_hint) || isDateTime64(type_hint)))
column.type = std::make_shared<DataTypeDate>(); column.type = std::make_shared<DataTypeDate>();
return column; return column;
} }
case arrow::Type::UINT32: case arrow::Type::UINT32:
{ {
auto column = readColumnWithNumericData<UInt32>(arrow_column, column_name); auto column = readColumnWithNumericData<UInt32>(arrow_column, column_name);
if (read_ints_as_dates) if (type_hint && (isDateOrDate32(type_hint) || isDateTime(type_hint) || isDateTime64(type_hint)))
column.type = std::make_shared<DataTypeDateTime>(); column.type = std::make_shared<DataTypeDateTime>();
return column; return column;
} }
@ -454,8 +505,15 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
return readColumnWithDecimalData<arrow::Decimal256Array>(arrow_column, column_name); return readColumnWithDecimalData<arrow::Decimal256Array>(arrow_column, column_name);
case arrow::Type::MAP: case arrow::Type::MAP:
{ {
DataTypePtr nested_type_hint;
if (type_hint)
{
const auto * map_type_hint = typeid_cast<const DataTypeMap *>(type_hint.get());
if (map_type_hint)
nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType();
}
auto arrow_nested_column = getNestedArrowColumn(arrow_column); auto arrow_nested_column = getNestedArrowColumn(arrow_column);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
if (skipped) if (skipped)
return {}; return {};
@ -469,8 +527,15 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
} }
case arrow::Type::LIST: case arrow::Type::LIST:
{ {
DataTypePtr nested_type_hint;
if (type_hint)
{
const auto * array_type_hint = typeid_cast<const DataTypeArray *>(type_hint.get());
if (array_type_hint)
nested_type_hint = array_type_hint->getNestedType();
}
auto arrow_nested_column = getNestedArrowColumn(arrow_column); auto arrow_nested_column = getNestedArrowColumn(arrow_column);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
if (skipped) if (skipped)
return {}; return {};
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
@ -493,11 +558,25 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
Columns tuple_elements; Columns tuple_elements;
DataTypes tuple_types; DataTypes tuple_types;
std::vector<String> tuple_names; std::vector<String> tuple_names;
const auto * tuple_type_hint = type_hint ? typeid_cast<const DataTypeTuple *>(type_hint.get()) : nullptr;
for (int i = 0; i != arrow_struct_type->num_fields(); ++i) for (int i = 0; i != arrow_struct_type->num_fields(); ++i)
{ {
auto field_name = arrow_struct_type->field(i)->name();
DataTypePtr nested_type_hint;
if (tuple_type_hint)
{
if (tuple_type_hint->haveExplicitNames())
{
auto pos = tuple_type_hint->tryGetPositionByName(field_name);
if (pos)
nested_type_hint = tuple_type_hint->getElement(*pos);
}
else if (size_t(i) < tuple_type_hint->getElements().size())
nested_type_hint = tuple_type_hint->getElement(i);
}
auto nested_arrow_column = std::make_shared<arrow::ChunkedArray>(nested_arrow_columns[i]); auto nested_arrow_column = std::make_shared<arrow::ChunkedArray>(nested_arrow_columns[i]);
auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); auto element = readColumnFromArrowColumn(nested_arrow_column, field_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
if (skipped) if (skipped)
return {}; return {};
tuple_elements.emplace_back(std::move(element.column)); tuple_elements.emplace_back(std::move(element.column));
@ -511,9 +590,11 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
} }
case arrow::Type::DICTIONARY: case arrow::Type::DICTIONARY:
{ {
auto & dict_values = dictionary_values[column_name]; auto & dict_info = dictionary_infos[column_name];
const auto is_lc_nullable = arrow_column->null_count() > 0 || (type_hint && type_hint->isLowCardinalityNullable());
/// Load dictionary values only once and reuse it. /// Load dictionary values only once and reuse it.
if (!dict_values) if (!dict_info.values)
{ {
arrow::ArrayVector dict_array; arrow::ArrayVector dict_array;
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -522,8 +603,22 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
dict_array.emplace_back(dict_chunk.dictionary()); dict_array.emplace_back(dict_chunk.dictionary());
} }
auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array); auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array);
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped);
dict_values = std::make_shared<ColumnWithTypeAndName>(std::move(dict_column)); for (size_t i = 0; i != dict_column.column->size(); ++i)
{
if (dict_column.column->isDefaultAt(i))
{
dict_info.default_value_index = i;
break;
}
}
auto lc_type = std::make_shared<DataTypeLowCardinality>(is_lc_nullable ? makeNullable(dict_column.type) : dict_column.type);
auto tmp_lc_column = lc_type->createColumn();
auto tmp_dict_column = IColumn::mutate(assert_cast<ColumnLowCardinality *>(tmp_lc_column.get())->getDictionaryPtr());
dynamic_cast<IColumnUnique *>(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size());
dict_column.column = std::move(tmp_dict_column);
dict_info.values = std::make_shared<ColumnWithTypeAndName>(std::move(dict_column));
dict_info.dictionary_size = arrow_dict_column->length();
} }
arrow::ArrayVector indexes_array; arrow::ArrayVector indexes_array;
@ -534,20 +629,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
} }
auto arrow_indexes_column = std::make_shared<arrow::ChunkedArray>(indexes_array); auto arrow_indexes_column = std::make_shared<arrow::ChunkedArray>(indexes_array);
auto indexes_column = readColumnWithIndexesData(arrow_indexes_column); auto indexes_column = readColumnWithIndexesData(arrow_indexes_column, dict_info.default_value_index, dict_info.dictionary_size, is_lc_nullable);
auto lc_column = ColumnLowCardinality::create(dict_info.values->column, indexes_column);
const auto contains_null = arrow_column->null_count() > 0; auto lc_type = std::make_shared<DataTypeLowCardinality>(is_lc_nullable ? makeNullable(dict_info.values->type) : dict_info.values->type);
return {std::move(lc_column), std::move(lc_type), column_name};
if (contains_null)
{
auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
return createLCOfNullableColumnFromArrowDictionaryValues(dict_values, indexes_column, nullmap_column, column_name);
}
else
{
return createLCColumnFromArrowDictionaryValues(dict_values, indexes_column, column_name);
}
} }
# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
case ARROW_NUMERIC_TYPE: \ case ARROW_NUMERIC_TYPE: \
@ -623,13 +708,13 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
arrow::ArrayVector array_vector = {arrow_array}; arrow::ArrayVector array_vector = {arrow_array};
auto arrow_column = std::make_shared<arrow::ChunkedArray>(array_vector); auto arrow_column = std::make_shared<arrow::ChunkedArray>(array_vector);
std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dict_values; std::unordered_map<std::string, DictionaryInfo> dict_infos;
bool skipped = false; bool skipped = false;
bool allow_null_type = false; bool allow_null_type = false;
if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable()) if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable())
allow_null_type = true; allow_null_type = true;
ColumnWithTypeAndName sample_column = readColumnFromArrowColumn( ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(
arrow_column, field->name(), format_name, false, dict_values, false, allow_null_type, skip_columns_with_unsupported_types, skipped); arrow_column, field->name(), format_name, false, dict_infos, allow_null_type, skip_columns_with_unsupported_types, skipped);
if (!skipped) if (!skipped)
sample_columns.emplace_back(std::move(sample_column)); sample_columns.emplace_back(std::move(sample_column));
} }
@ -700,9 +785,17 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
{ {
if (!nested_tables.contains(search_nested_table_name)) if (!nested_tables.contains(search_nested_table_name))
{ {
NamesAndTypesList nested_columns;
for (const auto & name_and_type : header.getNamesAndTypesList())
{
if (name_and_type.name.starts_with(nested_table_name + "."))
nested_columns.push_back(name_and_type);
}
auto nested_table_type = Nested::collect(nested_columns).front().type;
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[search_nested_table_name]; std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[search_nested_table_name];
ColumnsWithTypeAndName cols = {readColumnFromArrowColumn( ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(
arrow_column, nested_table_name, format_name, false, dictionary_values, true, true, false, skipped)}; arrow_column, nested_table_name, format_name, false, dictionary_infos, true, false, skipped, nested_table_type)};
BlockPtr block_ptr = std::make_shared<Block>(cols); BlockPtr block_ptr = std::make_shared<Block>(cols);
auto column_extractor = std::make_shared<NestedColumnExtractHelper>(*block_ptr, case_insensitive_matching); auto column_extractor = std::make_shared<NestedColumnExtractHelper>(*block_ptr, case_insensitive_matching);
nested_tables[search_nested_table_name] = {block_ptr, column_extractor}; nested_tables[search_nested_table_name] = {block_ptr, column_extractor};
@ -735,7 +828,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
{ {
auto arrow_column = name_to_column_ptr[search_column_name]; auto arrow_column = name_to_column_ptr[search_column_name];
column = readColumnFromArrowColumn( column = readColumnFromArrowColumn(
arrow_column, header_column.name, format_name, false, dictionary_values, true, true, false, skipped); arrow_column, header_column.name, format_name, false, dictionary_infos, true, false, skipped, header_column.type);
} }
try try

View File

@ -44,6 +44,14 @@ public:
const Block * hint_header = nullptr, const Block * hint_header = nullptr,
bool ignore_case = false); bool ignore_case = false);
struct DictionaryInfo
{
std::shared_ptr<ColumnWithTypeAndName> values;
Int64 default_value_index = -1;
UInt64 dictionary_size;
};
private: private:
const Block & header; const Block & header;
const std::string format_name; const std::string format_name;
@ -55,7 +63,7 @@ private:
/// Map {column name : dictionary column}. /// Map {column name : dictionary column}.
/// To avoid converting dictionary from Arrow Dictionary /// To avoid converting dictionary from Arrow Dictionary
/// to LowCardinality every chunk we save it and reuse. /// to LowCardinality every chunk we save it and reuse.
std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dictionary_values; std::unordered_map<std::string, DictionaryInfo> dictionary_infos;
}; };
} }

View File

@ -235,27 +235,30 @@ namespace DB
} }
template<typename T> template<typename T>
static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end) static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end, bool shift)
{ {
const PaddedPODArray<T> & data = assert_cast<const ColumnVector<T> *>(column.get())->getData(); const PaddedPODArray<T> & data = assert_cast<const ColumnVector<T> *>(column.get())->getData();
PaddedPODArray<Int64> result; PaddedPODArray<Int64> result;
result.reserve(end - start); result.reserve(end - start);
std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value); }); if (shift)
std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value) - 1; });
else
std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value); });
return result; return result;
} }
static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end) static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end, bool shift)
{ {
switch (column->getDataType()) switch (column->getDataType())
{ {
case TypeIndex::UInt8: case TypeIndex::UInt8:
return extractIndexesImpl<UInt8>(column, start, end); return extractIndexesImpl<UInt8>(column, start, end, shift);
case TypeIndex::UInt16: case TypeIndex::UInt16:
return extractIndexesImpl<UInt16>(column, start, end); return extractIndexesImpl<UInt16>(column, start, end, shift);
case TypeIndex::UInt32: case TypeIndex::UInt32:
return extractIndexesImpl<UInt32>(column, start, end); return extractIndexesImpl<UInt32>(column, start, end, shift);
case TypeIndex::UInt64: case TypeIndex::UInt64:
return extractIndexesImpl<UInt64>(column, start, end); return extractIndexesImpl<UInt64>(column, start, end, shift);
default: default:
throw Exception(fmt::format("Indexes column must be ColumnUInt, got {}.", column->getName()), throw Exception(fmt::format("Indexes column must be ColumnUInt, got {}.", column->getName()),
ErrorCodes::LOGICAL_ERROR); ErrorCodes::LOGICAL_ERROR);
@ -267,7 +270,7 @@ namespace DB
const String & column_name, const String & column_name,
ColumnPtr & column, ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type, const std::shared_ptr<const IDataType> & column_type,
const PaddedPODArray<UInt8> * null_bytemap, const PaddedPODArray<UInt8> *,
arrow::ArrayBuilder * array_builder, arrow::ArrayBuilder * array_builder,
String format_name, String format_name,
size_t start, size_t start,
@ -278,6 +281,7 @@ namespace DB
const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get()); const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get());
arrow::DictionaryBuilder<ValueType> * builder = assert_cast<arrow::DictionaryBuilder<ValueType> *>(array_builder); arrow::DictionaryBuilder<ValueType> * builder = assert_cast<arrow::DictionaryBuilder<ValueType> *>(array_builder);
auto & dict_values = dictionary_values[column_name]; auto & dict_values = dictionary_values[column_name];
bool is_nullable = column_type->isLowCardinalityNullable();
/// Convert dictionary from LowCardinality to Arrow dictionary only once and then reuse it. /// Convert dictionary from LowCardinality to Arrow dictionary only once and then reuse it.
if (!dict_values) if (!dict_values)
@ -288,9 +292,9 @@ namespace DB
arrow::Status status = MakeBuilder(pool, value_type, &values_builder); arrow::Status status = MakeBuilder(pool, value_type, &values_builder);
checkStatus(status, column->getName(), format_name); checkStatus(status, column->getName(), format_name);
auto dict_column = column_lc->getDictionary().getNestedColumn(); auto dict_column = column_lc->getDictionary().getNestedNotNullableColumn();
const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType(); const auto & dict_type = removeNullable(assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType());
fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values); fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, is_nullable, dict_column->size(), output_string_as_string, dictionary_values);
status = values_builder->Finish(&dict_values); status = values_builder->Finish(&dict_values);
checkStatus(status, column->getName(), format_name); checkStatus(status, column->getName(), format_name);
} }
@ -300,15 +304,14 @@ namespace DB
/// AppendIndices in DictionaryBuilder works only with int64_t data, so we cannot use /// AppendIndices in DictionaryBuilder works only with int64_t data, so we cannot use
/// fillArrowArray here and should copy all indexes to int64_t container. /// fillArrowArray here and should copy all indexes to int64_t container.
auto indexes = extractIndexesImpl(column_lc->getIndexesPtr(), start, end); auto indexes = extractIndexesImpl(column_lc->getIndexesPtr(), start, end, is_nullable);
const uint8_t * arrow_null_bytemap_raw_ptr = nullptr; const uint8_t * arrow_null_bytemap_raw_ptr = nullptr;
PaddedPODArray<uint8_t> arrow_null_bytemap; PaddedPODArray<uint8_t> arrow_null_bytemap;
if (null_bytemap) if (column_type->isLowCardinalityNullable())
{ {
/// Invert values since Arrow interprets 1 as a non-null value, while CH as a null
arrow_null_bytemap.reserve(end - start); arrow_null_bytemap.reserve(end - start);
for (size_t i = start; i < end; ++i) for (size_t i = start; i < end; ++i)
arrow_null_bytemap.emplace_back(!(*null_bytemap)[i]); arrow_null_bytemap.emplace_back(!column_lc->isNullAt(i));
arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data(); arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data();
} }
@ -680,7 +683,7 @@ namespace DB
{ {
auto nested_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType(); auto nested_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
const auto * lc_column = assert_cast<const ColumnLowCardinality *>(column.get()); const auto * lc_column = assert_cast<const ColumnLowCardinality *>(column.get());
const auto & nested_column = lc_column->getDictionaryPtr(); const auto & nested_column = lc_column->getDictionary().getNestedColumn();
const auto & indexes_column = lc_column->getIndexesPtr(); const auto & indexes_column = lc_column->getIndexesPtr();
return arrow::dictionary( return arrow::dictionary(
getArrowTypeForLowCardinalityIndexes(indexes_column), getArrowTypeForLowCardinalityIndexes(indexes_column),

View File

@ -0,0 +1,10 @@
<test>
<create_query>CREATE TABLE test (uint32 UInt32, n_uint32 Nullable(UInt32), lc LowCardinality(String)) ENGINE=File(Arrow) SETTINGS output_format_arrow_low_cardinality_as_dictionary=1</create_query>
<fill_query>insert into test select number, number, toString(number % 10000) from numbers(10000000)</fill_query>
<query>SELECT uint32 from test format Null</query>
<query>SELECT n_uint32 from test format Null</query>
<query>SELECT lc from test format Null</query>
<drop_query>DROP TABLE IF EXISTS test</drop_query>
</test>

View File

@ -0,0 +1,32 @@
dict LowCardinality(Nullable(String))
one
two
three
one
two
dict LowCardinality(Nullable(String))
one
two
three
one
three
dict LowCardinality(Nullable(String))
one
two
three
one
two
three
lc LowCardinality(Nullable(String))
OK
dict LowCardinality(Nullable(String))
one
two
three
one
\N
three

View File

@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
mkdir -p $USER_FILES_PATH/test_02383
cp $CURDIR/data_arrow/dictionary*.arrow $USER_FILES_PATH/test_02383/
cp $CURDIR/data_arrow/corrupted.arrow $USER_FILES_PATH/test_02383/
cp $CURDIR/data_arrow/dict_with_nulls.arrow $USER_FILES_PATH/test_02383/
$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary1.arrow')"
$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary1.arrow')"
$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary2.arrow')"
$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary2.arrow')"
$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary3.arrow')"
$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary3.arrow')"
$CLICKHOUSE_CLIENT -q "desc file('test_02383/corrupted.arrow')"
$CLICKHOUSE_CLIENT -q "select * from file('test_02383/corrupted.arrow')" 2>&1 | grep -F -q "INCORRECT_DATA" && echo OK || echo FAIL
$CLICKHOUSE_CLIENT -q "desc file('test_02383/dict_with_nulls.arrow')"
$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dict_with_nulls.arrow')"
rm -rf $USER_FILES_PATH/test_02383

View File

@ -0,0 +1,4 @@
lc LowCardinality(Nullable(String))
abc
lc LowCardinality(Nullable(String))
abc

View File

@ -0,0 +1,8 @@
-- Tags: no-fasttest
insert into function file(02384_data.arrow) select toLowCardinality(toNullable('abc')) as lc settings output_format_arrow_low_cardinality_as_dictionary=1, output_format_arrow_string_as_string=0, engine_file_truncate_on_insert=1;
desc file(02384_data.arrow);
select * from file(02384_data.arrow);
insert into function file(02384_data.arrow) select toLowCardinality(toNullable('abc')) as lc settings output_format_arrow_low_cardinality_as_dictionary=1, output_format_arrow_string_as_string=1, engine_file_truncate_on_insert=1;
desc file(02384_data.arrow);
select * from file(02384_data.arrow);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.