Merge pull request #40037 from arthurpassos/fix_arrow_column_dictionary_to_ch_lc

Fix arrow column dictionary to ch lc
This commit is contained in:
Vladimir C 2022-08-10 11:40:40 +02:00 committed by GitHub
commit a006ff0a43
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 161 additions and 10 deletions

View File

@ -326,6 +326,69 @@ static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr
return std::make_shared<arrow::ChunkedArray>(array_vector);
}
static ColumnWithTypeAndName createLCColumnFromArrowDictionaryValues(
const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
const ColumnPtr & indexes_column,
const String & column_name
)
{
auto lc_type = std::make_shared<DataTypeLowCardinality>(dict_values->type);
auto lc_column = lc_type->createColumn();
for (auto i = 0u; i < indexes_column->size(); i++)
{
Field f;
dict_values->column->get(indexes_column->getUInt(i), f);
lc_column->insert(f);
}
return {std::move(lc_column), std::move(lc_type), column_name};
}
/*
* Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index.
* It doesn't have the concept of null or default values.
* An empty string is just a regular value appended at any position of the dictionary.
* Null values have an index of 0, but it should be ignored since the nullmap will return null.
* In ClickHouse LowCardinality, it's different. The dictionary contains null and default values at the beginning.
* [null, default, ...]. Therefore, null values have an index of 0 and default values have an index of 1.
* No nullmap is used.
* */
static ColumnWithTypeAndName createLCOfNullableColumnFromArrowDictionaryValues(
const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
const ColumnPtr & indexes_column,
const ColumnPtr & nullmap_column,
const String & column_name
)
{
/*
* ArrowColumn format handles nulls by maintaining a nullmap column, there is no nullable type.
* Therefore, dict_values->type is the actual data type/ non-nullable. It needs to be transformed into nullable
* so LC column is created from nullable type and a null value at the beginning of the collection
* is automatically added.
* */
auto lc_type = std::make_shared<DataTypeLowCardinality>(makeNullable(dict_values->type));
auto lc_column = lc_type->createColumn();
for (auto i = 0u; i < indexes_column->size(); i++)
{
if (nullmap_column && nullmap_column->getBool(i))
{
lc_column->insertDefault();
}
else
{
Field f;
dict_values->column->get(indexes_column->getUInt(i), f);
lc_column->insert(f);
}
}
return {std::move(lc_column), std::move(lc_type), column_name};
}
static ColumnWithTypeAndName readColumnFromArrowColumn(
std::shared_ptr<arrow::ChunkedArray> & arrow_column,
const std::string & column_name,
@ -338,7 +401,8 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
bool & skipped)
{
if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT)
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT &&
arrow_column->type()->id() != arrow::Type::DICTIONARY)
{
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
if (skipped)
@ -455,12 +519,6 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
}
auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array);
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
/// We should convert read column to ColumnUnique.
auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn();
auto tmp_dict_column = IColumn::mutate(assert_cast<ColumnLowCardinality *>(tmp_lc_column.get())->getDictionaryPtr());
static_cast<IColumnUnique *>(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size());
dict_column.column = std::move(tmp_dict_column);
dict_values = std::make_shared<ColumnWithTypeAndName>(std::move(dict_column));
}
@ -473,9 +531,19 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
auto arrow_indexes_column = std::make_shared<arrow::ChunkedArray>(indexes_array);
auto indexes_column = readColumnWithIndexesData(arrow_indexes_column);
auto lc_column = ColumnLowCardinality::create(dict_values->column, indexes_column);
auto lc_type = std::make_shared<DataTypeLowCardinality>(dict_values->type);
return {std::move(lc_column), std::move(lc_type), column_name};
const auto contains_null = arrow_column->null_count() > 0;
if (contains_null)
{
auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
return createLCOfNullableColumnFromArrowDictionaryValues(dict_values, indexes_column, nullmap_column, column_name);
}
else
{
return createLCColumnFromArrowDictionaryValues(dict_values, indexes_column, column_name);
}
}
# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
case ARROW_NUMERIC_TYPE: \

View File

@ -0,0 +1,9 @@
lc_nullable_string
LowCardinality(Nullable(String))
one
\N
three
\N
six

View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# ## reading ArrowStream file from python
# import pyarrow as pa
# stream = pa.ipc.open_stream("test.arrows")
# x = stream.read_all()
# print(x)
## writing ArrowStream file from python
# import pyarrow as pa
# data = [
# pa.array(["one", None, "three", "", None, "", "six"]).dictionary_encode(),
# ]
# batch = pa.record_batch(data, names=['id', 'lc_nullable', 'lc_int_nullable', 'bool_nullable'])
# writer = pa.ipc.new_stream("test4.arrows", batch.schema)
# writer.write_batch(batch)
# writer.close()
# cat data.arrow | gzip | base64
cat <<EOF | base64 --decode | gunzip | $CLICKHOUSE_LOCAL --query='SELECT * FROM table FORMAT TSVWithNamesAndTypes' --input-format=ArrowStream
H4sIAAAAAAAAA3VQQQ6CQAychRWIcjCGGA4ePHrwCT7Bgz8waoiSICaoiU/wGR49+Me1XboIJDYp
3d2ZTjsYY8wLwBgcQ8QIMEBEJwqloal8iMNVUSaWmxIjQEjsMb3UvWrA2IZySalRx4SyOGzLe1Hs
9kW2vd6qvDyC+iPL4m8MvseUob2z2NyiutGhFcxb5sP2JLJpbPvhaYstBK8d1PrOW0pMLcha3p0+
h4//4eamUkctTPV02nqRpONfyux2qrLsmj8aX8+Or2nXl6/tzEWj2vWxEh9ha67X2w2yA8esh7k+
13PueVAtzMPvH/HebPkLlbsntUACAAA=
EOF

View File

@ -0,0 +1,7 @@
id lc_nullable lc_int_nullable bool_nullable
Nullable(Int64) LowCardinality(Nullable(String)) LowCardinality(Nullable(Int64)) Nullable(UInt8)
1 onee 1 1
2 twoo 2 0
3 three 3 1
4 four 4 1
5 five 5 1

View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# ## reading ArrowStream file from python
# import pyarrow as pa
# stream = pa.ipc.open_stream("test.arrows")
# x = stream.read_all()
# print(x)
## writing ArrowStream file from python
# import pyarrow as pa
#data = [
# pa.array([1, 2, 3, 4, 5]),
# pa.array(["onee", "twoo", "three", "four", "five"]).dictionary_encode(),
# pa.array([1, 2, 3, 4, 5]).dictionary_encode(),
# pa.array([True, False, True, True, True])
#]
# batch = pa.record_batch(data, names=['id', 'lc_nullable', 'lc_int_nullable', 'bool_nullable'])
# writer = pa.ipc.new_stream("test4.arrows", batch.schema)
# writer.write_batch(batch)
# writer.close()
# cat data.arrow | gzip | base64
cat <<EOF | base64 --decode | gunzip | $CLICKHOUSE_LOCAL --query='SELECT * FROM table FORMAT TSVWithNamesAndTypes' --input-format=ArrowStream
H4sIAAAAAAAAA5VTsU7DQAz1pZc2giBaFFAGkBgYMjIyMOQD+gHdCqWpiBQlqErhZzowMvIB/Ft4
d+drrqGFYsny+fzO9rN1TdM0L4JoSEqOKKQ++RTgBBGSJEwEjLL6DOwn7B37IWIA9tX7a75TcgKd
VVUxLVdF8TgrMvgpsGuD9yL4Y2jivDmFFk/TvKzbVwFFUAk1PQpqZaJzkVB153xONS4Gvk8DsBni
veEmfFVTxW+cmsemplMv0NGAMV9ODUlmHkPdk8mvPM7vKXvp5Pag+ZyADaEDndP2iLTNh5onY0Oc
zORDnZU8qWO3HDcbaeegdhUDKTky5nvfmU+P9kvcsedOTHTyWJG6D7PbEb+pyiyr36qqfl5m2aJa
LRf5a8b83g/gl2z4nW32HJO7522e9zt4er/wTJzzLl62js1hZ2Z3aPGKTyxcPhfbfHpS9/2wp+/1
jr6DA/pO9tzbPtJOPO3EJ5249d1/JOnnXP7rHzpHi/UYI/+4v2LbmH9I36C0faSwBAAA
EOF