mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Merge pull request #40037 from arthurpassos/fix_arrow_column_dictionary_to_ch_lc
Fix arrow column dictionary to ch lc
This commit is contained in:
commit
a006ff0a43
@ -326,6 +326,69 @@ static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr
|
||||
return std::make_shared<arrow::ChunkedArray>(array_vector);
|
||||
}
|
||||
|
||||
static ColumnWithTypeAndName createLCColumnFromArrowDictionaryValues(
|
||||
const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
|
||||
const ColumnPtr & indexes_column,
|
||||
const String & column_name
|
||||
)
|
||||
{
|
||||
auto lc_type = std::make_shared<DataTypeLowCardinality>(dict_values->type);
|
||||
|
||||
auto lc_column = lc_type->createColumn();
|
||||
|
||||
for (auto i = 0u; i < indexes_column->size(); i++)
|
||||
{
|
||||
Field f;
|
||||
dict_values->column->get(indexes_column->getUInt(i), f);
|
||||
lc_column->insert(f);
|
||||
}
|
||||
|
||||
return {std::move(lc_column), std::move(lc_type), column_name};
|
||||
}
|
||||
|
||||
/*
|
||||
* Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index.
|
||||
* It doesn't have the concept of null or default values.
|
||||
* An empty string is just a regular value appended at any position of the dictionary.
|
||||
* Null values have an index of 0, but it should be ignored since the nullmap will return null.
|
||||
* In ClickHouse LowCardinality, it's different. The dictionary contains null and default values at the beginning.
|
||||
* [null, default, ...]. Therefore, null values have an index of 0 and default values have an index of 1.
|
||||
* No nullmap is used.
|
||||
* */
|
||||
static ColumnWithTypeAndName createLCOfNullableColumnFromArrowDictionaryValues(
|
||||
const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
|
||||
const ColumnPtr & indexes_column,
|
||||
const ColumnPtr & nullmap_column,
|
||||
const String & column_name
|
||||
)
|
||||
{
|
||||
/*
|
||||
* ArrowColumn format handles nulls by maintaining a nullmap column, there is no nullable type.
|
||||
* Therefore, dict_values->type is the actual data type/ non-nullable. It needs to be transformed into nullable
|
||||
* so LC column is created from nullable type and a null value at the beginning of the collection
|
||||
* is automatically added.
|
||||
* */
|
||||
auto lc_type = std::make_shared<DataTypeLowCardinality>(makeNullable(dict_values->type));
|
||||
|
||||
auto lc_column = lc_type->createColumn();
|
||||
|
||||
for (auto i = 0u; i < indexes_column->size(); i++)
|
||||
{
|
||||
if (nullmap_column && nullmap_column->getBool(i))
|
||||
{
|
||||
lc_column->insertDefault();
|
||||
}
|
||||
else
|
||||
{
|
||||
Field f;
|
||||
dict_values->column->get(indexes_column->getUInt(i), f);
|
||||
lc_column->insert(f);
|
||||
}
|
||||
}
|
||||
|
||||
return {std::move(lc_column), std::move(lc_type), column_name};
|
||||
}
|
||||
|
||||
static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
std::shared_ptr<arrow::ChunkedArray> & arrow_column,
|
||||
const std::string & column_name,
|
||||
@ -338,7 +401,8 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
bool & skipped)
|
||||
{
|
||||
if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST
|
||||
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT)
|
||||
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT &&
|
||||
arrow_column->type()->id() != arrow::Type::DICTIONARY)
|
||||
{
|
||||
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
if (skipped)
|
||||
@ -455,12 +519,6 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
}
|
||||
auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array);
|
||||
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
|
||||
/// We should convert read column to ColumnUnique.
|
||||
auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn();
|
||||
auto tmp_dict_column = IColumn::mutate(assert_cast<ColumnLowCardinality *>(tmp_lc_column.get())->getDictionaryPtr());
|
||||
static_cast<IColumnUnique *>(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size());
|
||||
dict_column.column = std::move(tmp_dict_column);
|
||||
dict_values = std::make_shared<ColumnWithTypeAndName>(std::move(dict_column));
|
||||
}
|
||||
|
||||
@ -473,9 +531,19 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
|
||||
auto arrow_indexes_column = std::make_shared<arrow::ChunkedArray>(indexes_array);
|
||||
auto indexes_column = readColumnWithIndexesData(arrow_indexes_column);
|
||||
auto lc_column = ColumnLowCardinality::create(dict_values->column, indexes_column);
|
||||
auto lc_type = std::make_shared<DataTypeLowCardinality>(dict_values->type);
|
||||
return {std::move(lc_column), std::move(lc_type), column_name};
|
||||
|
||||
const auto contains_null = arrow_column->null_count() > 0;
|
||||
|
||||
if (contains_null)
|
||||
{
|
||||
auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
|
||||
|
||||
return createLCOfNullableColumnFromArrowDictionaryValues(dict_values, indexes_column, nullmap_column, column_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
return createLCColumnFromArrowDictionaryValues(dict_values, indexes_column, column_name);
|
||||
}
|
||||
}
|
||||
# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
|
||||
case ARROW_NUMERIC_TYPE: \
|
||||
|
@ -0,0 +1,9 @@
|
||||
lc_nullable_string
|
||||
LowCardinality(Nullable(String))
|
||||
one
|
||||
\N
|
||||
three
|
||||
|
||||
\N
|
||||
|
||||
six
|
31
tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.sh
Executable file
31
tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
# ## reading ArrowStream file from python
|
||||
# import pyarrow as pa
|
||||
# stream = pa.ipc.open_stream("test.arrows")
|
||||
# x = stream.read_all()
|
||||
# print(x)
|
||||
|
||||
## writing ArrowStream file from python
|
||||
# import pyarrow as pa
|
||||
# data = [
|
||||
# pa.array(["one", None, "three", "", None, "", "six"]).dictionary_encode(),
|
||||
# ]
|
||||
# batch = pa.record_batch(data, names=['id', 'lc_nullable', 'lc_int_nullable', 'bool_nullable'])
|
||||
# writer = pa.ipc.new_stream("test4.arrows", batch.schema)
|
||||
# writer.write_batch(batch)
|
||||
# writer.close()
|
||||
|
||||
# cat data.arrow | gzip | base64
|
||||
|
||||
cat <<EOF | base64 --decode | gunzip | $CLICKHOUSE_LOCAL --query='SELECT * FROM table FORMAT TSVWithNamesAndTypes' --input-format=ArrowStream
|
||||
H4sIAAAAAAAAA3VQQQ6CQAychRWIcjCGGA4ePHrwCT7Bgz8waoiSICaoiU/wGR49+Me1XboIJDYp
|
||||
3d2ZTjsYY8wLwBgcQ8QIMEBEJwqloal8iMNVUSaWmxIjQEjsMb3UvWrA2IZySalRx4SyOGzLe1Hs
|
||||
9kW2vd6qvDyC+iPL4m8MvseUob2z2NyiutGhFcxb5sP2JLJpbPvhaYstBK8d1PrOW0pMLcha3p0+
|
||||
h4//4eamUkctTPV02nqRpONfyux2qrLsmj8aX8+Or2nXl6/tzEWj2vWxEh9ha67X2w2yA8esh7k+
|
||||
13PueVAtzMPvH/HebPkLlbsntUACAAA=
|
||||
EOF
|
@ -0,0 +1,7 @@
|
||||
id lc_nullable lc_int_nullable bool_nullable
|
||||
Nullable(Int64) LowCardinality(Nullable(String)) LowCardinality(Nullable(Int64)) Nullable(UInt8)
|
||||
1 onee 1 1
|
||||
2 twoo 2 0
|
||||
3 three 3 1
|
||||
4 four 4 1
|
||||
5 five 5 1
|
36
tests/queries/0_stateless/02381_arrow_dict_to_lc.sh
Executable file
36
tests/queries/0_stateless/02381_arrow_dict_to_lc.sh
Executable file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
# ## reading ArrowStream file from python
|
||||
# import pyarrow as pa
|
||||
# stream = pa.ipc.open_stream("test.arrows")
|
||||
# x = stream.read_all()
|
||||
# print(x)
|
||||
|
||||
## writing ArrowStream file from python
|
||||
# import pyarrow as pa
|
||||
#data = [
|
||||
# pa.array([1, 2, 3, 4, 5]),
|
||||
# pa.array(["onee", "twoo", "three", "four", "five"]).dictionary_encode(),
|
||||
# pa.array([1, 2, 3, 4, 5]).dictionary_encode(),
|
||||
# pa.array([True, False, True, True, True])
|
||||
#]
|
||||
# batch = pa.record_batch(data, names=['id', 'lc_nullable', 'lc_int_nullable', 'bool_nullable'])
|
||||
# writer = pa.ipc.new_stream("test4.arrows", batch.schema)
|
||||
# writer.write_batch(batch)
|
||||
# writer.close()
|
||||
|
||||
# cat data.arrow | gzip | base64
|
||||
|
||||
cat <<EOF | base64 --decode | gunzip | $CLICKHOUSE_LOCAL --query='SELECT * FROM table FORMAT TSVWithNamesAndTypes' --input-format=ArrowStream
|
||||
H4sIAAAAAAAAA5VTsU7DQAz1pZc2giBaFFAGkBgYMjIyMOQD+gHdCqWpiBQlqErhZzowMvIB/Ft4
|
||||
d+drrqGFYsny+fzO9rN1TdM0L4JoSEqOKKQ++RTgBBGSJEwEjLL6DOwn7B37IWIA9tX7a75TcgKd
|
||||
VVUxLVdF8TgrMvgpsGuD9yL4Y2jivDmFFk/TvKzbVwFFUAk1PQpqZaJzkVB153xONS4Gvk8DsBni
|
||||
veEmfFVTxW+cmsemplMv0NGAMV9ODUlmHkPdk8mvPM7vKXvp5Pag+ZyADaEDndP2iLTNh5onY0Oc
|
||||
zORDnZU8qWO3HDcbaeegdhUDKTky5nvfmU+P9kvcsedOTHTyWJG6D7PbEb+pyiyr36qqfl5m2aJa
|
||||
LRf5a8b83g/gl2z4nW32HJO7522e9zt4er/wTJzzLl62js1hZ2Z3aPGKTyxcPhfbfHpS9/2wp+/1
|
||||
jr6DA/pO9tzbPtJOPO3EJ5249d1/JOnnXP7rHzpHi/UYI/+4v2LbmH9I36C0faSwBAAA
|
||||
EOF
|
Loading…
Reference in New Issue
Block a user