Support more ClickHouse types in ORC/Arrow/Parquet formats

This commit is contained in:
avogar 2023-03-28 20:02:42 +00:00
parent f7e29d1e92
commit bc56c02858
6 changed files with 467 additions and 227 deletions

View File

@ -205,7 +205,7 @@ Differs from the `TabSeparated` format in that the column names are written in t
During parsing, the first row is expected to contain the column names. You can use column names to determine their position and to check their correctness.
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from the input data will be mapped to the columns of the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -217,7 +217,7 @@ This format is also available under the name `TSVWithNames`.
Differs from the `TabSeparated` format in that the column names are written to the first row, while the column types are in the second row.
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from the input data will be mapped to the columns in the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -470,7 +470,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -480,7 +480,7 @@ Otherwise, the first row will be skipped.
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -500,7 +500,7 @@ There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [Templat
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -510,7 +510,7 @@ Otherwise, the first row will be skipped.
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -969,7 +969,7 @@ Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yie
Differs from `JSONCompactEachRow` format in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -979,7 +979,7 @@ Otherwise, the first row will be skipped.
Differs from `JSONCompactEachRow` format in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -991,7 +991,7 @@ the types from input data will be compared with the types of the corresponding c
Differs from `JSONCompactStringsEachRow` in that in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -1001,7 +1001,7 @@ Otherwise, the first row will be skipped.
Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -1120,7 +1120,7 @@ CREATE TABLE IF NOT EXISTS example_table
- If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type).
- If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`.
:::warning
:::note
When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`.
:::
@ -1450,7 +1450,7 @@ Similar to [RowBinary](#rowbinary), but with added header:
- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N)
- N `String`s specifying column names
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -1464,7 +1464,7 @@ Similar to [RowBinary](#rowbinary), but with added header:
- N `String`s specifying column names
- N `String`s specifying column types
:::warning
:::note
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
@ -1915,7 +1915,7 @@ SET format_avro_schema_registry_url = 'http://schema-registry';
SELECT * FROM topic1_stream;
```
:::warning
:::note
Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain its value after a restart. Also you can use the `format_avro_schema_registry_url` setting of the `Kafka` table engine.
:::
@ -1927,30 +1927,31 @@ Setting `format_avro_schema_registry_url` needs to be configured in `users.xml`
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) |
|----------------------------------------------------|-----------------------------------------------------------------|------------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE` | [Date32](/docs/en/sql-reference/data-types/date.md) | `DATE` |
| `TIME (ms)` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME (us, ns)` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_LENGTH_BYTE_ARRAY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_LENGTH_BYTE_ARRAY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) |
|-----------------------------------------------|------------------------------------------------------------------------------------------------------------|-------------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE` | [Date32](/docs/en/sql-reference/data-types/date.md) | `DATE` |
| `TIME (ms)` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME (us, ns)` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_LENGTH_BYTE_ARRAY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/domains/int-uint.md) | `FIXED_LENGTH_BYTE_ARRAY` |
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
@ -1996,31 +1997,32 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) |
|-----------------------------------------|-----------------------------------------------------------------|----------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` |
| `DATE32` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `UINT16` |
| `DATE64` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME32`, `TIME64` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `UINT32` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_SIZE_BINARY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_SIZE_BINARY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `DECIMAL256` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL256` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` |
| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) |
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|----------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` |
| `DATE32` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `UINT16` |
| `DATE64` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME32`, `TIME64` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `UINT32` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_SIZE_BINARY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_SIZE_BINARY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `DECIMAL256` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL256` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` |
| `FIXED_SIZE_BINARY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/domains/int-uint.md) | `FIXED_SIZE_BINARY` |
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
@ -2069,23 +2071,26 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Arrow" > {filenam
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
|---------------------------------------|---------------------------------------------------------------|--------------------------|
| `Boolean` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `Boolean` |
| `Tinyint` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `Tinyint` |
| `Smallint` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `Smallint` |
| `Int` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
| `Bigint` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `Bigint` |
| `Float` | [Float32](/docs/en/sql-reference/data-types/float.md) | `Float` |
| `Double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `Double` |
| `Decimal` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `Decimal` |
| `Date` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `Date` |
| `Timestamp` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `Timestamp` |
| `String`, `Char`, `Varchar`, `Binary` | [String](/docs/en/sql-reference/data-types/string.md) | `Binary` |
| `List` | [Array](/docs/en/sql-reference/data-types/array.md) | `List` |
| `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` |
| `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` |
| `-` | [IPv4](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
|---------------------------------------|-------------------------------------------------------------------------------------------------------------------|--------------------------|
| `Boolean` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `Boolean` |
| `Tinyint` | [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `Tinyint` |
| `Smallint` | [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `Smallint` |
| `Int` | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
| `Bigint` | [Int64/UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `Bigint` |
| `Float` | [Float32](/docs/en/sql-reference/data-types/float.md) | `Float` |
| `Double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `Double` |
| `Decimal` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `Decimal` |
| `Date` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `Date` |
| `Timestamp` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `Timestamp` |
| `String`, `Char`, `Varchar`, `Binary` | [String](/docs/en/sql-reference/data-types/string.md) | `Binary` |
| `List` | [Array](/docs/en/sql-reference/data-types/array.md) | `List` |
| `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` |
| `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` |
| `Int` | [IPv4](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
| `Binary` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `Binary` |
| `Binary` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `Binary` |
| `Binary` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `Binary` |
Other types are not supported.

View File

@ -44,7 +44,6 @@
M(arrow::Type::UINT8, DB::UInt8) \
M(arrow::Type::INT8, DB::Int8) \
M(arrow::Type::INT16, DB::Int16) \
M(arrow::Type::INT32, DB::Int32) \
M(arrow::Type::UINT64, DB::UInt64) \
M(arrow::Type::INT64, DB::Int64) \
M(arrow::Type::DURATION, DB::Int64) \
@ -105,6 +104,7 @@ static ColumnWithTypeAndName readColumnWithNumericData(std::shared_ptr<arrow::Ch
template <typename ArrowArray>
static ColumnWithTypeAndName readColumnWithStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{
readColumnWithNumericData<Int128>(arrow_column, column_name);
auto internal_type = std::make_shared<DataTypeString>();
auto internal_column = internal_type->createColumn();
PaddedPODArray<UInt8> & column_chars_t = assert_cast<ColumnString &>(*internal_column).getChars();
@ -165,6 +165,73 @@ static ColumnWithTypeAndName readColumnWithFixedStringData(std::shared_ptr<arrow
return {std::move(internal_column), std::move(internal_type), column_name};
}
template <typename ValueType>
static ColumnWithTypeAndName readColumnWithBigIntegerFromFixedBinaryData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name, const DataTypePtr & column_type)
{
const auto * fixed_type = assert_cast<arrow::FixedSizeBinaryType *>(arrow_column->type().get());
size_t fixed_len = fixed_type->byte_width();
if (fixed_len != sizeof(ValueType))
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Cannot insert data into {} column from fixed size binary, expected data with size {}, got {}",
column_type->getName(),
sizeof(ValueType),
fixed_len);
auto internal_column = column_type->createColumn();
auto & data = assert_cast<ColumnVector<ValueType> &>(*internal_column).getData();
data.reserve(arrow_column->length());
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
{
arrow::FixedSizeBinaryArray & chunk = dynamic_cast<arrow::FixedSizeBinaryArray &>(*(arrow_column->chunk(chunk_i)));
const auto * raw_data = reinterpret_cast<const ValueType *>(chunk.raw_values());
data.insert_assume_reserved(raw_data, raw_data + chunk.length());
}
return {std::move(internal_column), column_type, column_name};
}
template <typename ColumnType, typename ValueType = typename ColumnType::ValueType>
static ColumnWithTypeAndName readColumnWithBigNumberFromBinaryData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name, const DataTypePtr & column_type)
{
size_t total_size = 0;
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
{
auto & chunk = dynamic_cast<arrow::BinaryArray &>(*(arrow_column->chunk(chunk_i)));
const size_t chunk_length = chunk.length();
for (size_t i = 0; i != chunk_length; ++i)
{
if (!chunk.IsNull(i) && chunk.value_length(i) != sizeof(ValueType))
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Cannot insert data into {} column from binary value, expected data with size {}, got {}",
column_type->getName(),
sizeof(ValueType),
chunk.value_length(i));
total_size += chunk_length;
}
}
auto internal_column = column_type->createColumn();
auto & integer_column = assert_cast<ColumnType &>(*internal_column);
integer_column.reserve(total_size);
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
{
auto & chunk = dynamic_cast<arrow::BinaryArray &>(*(arrow_column->chunk(chunk_i)));
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
{
if (chunk.IsNull(value_i))
integer_column.insertDefault();
else
integer_column.insertData(chunk.Value(value_i).data(), chunk.Value(value_i).size());
}
}
return {std::move(internal_column), column_type, column_name};
}
static ColumnWithTypeAndName readColumnWithBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{
auto internal_type = DataTypeFactory::instance().get("Bool");
@ -537,7 +604,7 @@ static ColumnWithTypeAndName readIPv6ColumnFromBinaryData(std::shared_ptr<arrow:
for (size_t i = 0; i != chunk_length; ++i)
{
/// If at least one value size is not 16 bytes, fallback to reading String column and further cast to IPv6.
if (chunk.value_length(i) != sizeof(IPv6))
if (!chunk.IsNull(i) && chunk.value_length(i) != sizeof(IPv6))
return readColumnWithStringData<arrow::BinaryArray>(arrow_column, column_name);
}
total_size += chunk_length;
@ -545,14 +612,40 @@ static ColumnWithTypeAndName readIPv6ColumnFromBinaryData(std::shared_ptr<arrow:
auto internal_type = std::make_shared<DataTypeIPv6>();
auto internal_column = internal_type->createColumn();
auto & data = assert_cast<ColumnIPv6 &>(*internal_column).getData();
data.reserve(total_size * sizeof(IPv6));
auto & ipv6_column = assert_cast<ColumnIPv6 &>(*internal_column);
ipv6_column.reserve(total_size);
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
{
auto & chunk = dynamic_cast<arrow::BinaryArray &>(*(arrow_column->chunk(chunk_i)));
const auto * raw_data = reinterpret_cast<const IPv6 *>(chunk.raw_data() + chunk.raw_value_offsets()[0]);
data.insert_assume_reserved(raw_data, raw_data + chunk.length());
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
{
if (chunk.IsNull(value_i))
ipv6_column.insertDefault();
else
ipv6_column.insertData(chunk.Value(value_i).data(), chunk.Value(value_i).size());
}
}
return {std::move(internal_column), std::move(internal_type), column_name};
}
static ColumnWithTypeAndName readIPv4ColumnWithInt32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{
auto internal_type = std::make_shared<DataTypeIPv4>();
auto internal_column = internal_type->createColumn();
auto & column_data = static_cast<ColumnIPv4 &>(*internal_column).getData();
column_data.reserve(arrow_column->length());
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
{
std::shared_ptr<arrow::Array> chunk = arrow_column->chunk(chunk_i);
if (chunk->length() == 0)
continue;
/// buffers[0] is a null bitmap and buffers[1] are actual values
std::shared_ptr<arrow::Buffer> buffer = chunk->data()->buffers[1];
const auto * raw_data = reinterpret_cast<const IPv4 *>(buffer->data()) + chunk->offset();
column_data.insert_assume_reserved(raw_data, raw_data + chunk->length());
}
return {std::move(internal_column), std::move(internal_type), column_name};
}
@ -566,7 +659,8 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
bool allow_null_type,
bool skip_columns_with_unsupported_types,
bool & skipped,
DataTypePtr type_hint = nullptr)
DataTypePtr type_hint = nullptr,
bool is_map_nested = false)
{
if (!is_nullable && (arrow_column->null_count() || (type_hint && type_hint->isNullable())) && arrow_column->type()->id() != arrow::Type::LIST
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT &&
@ -589,12 +683,49 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
case arrow::Type::STRING:
case arrow::Type::BINARY:
{
if (type_hint && isIPv6(type_hint))
return readIPv6ColumnFromBinaryData(arrow_column, column_name);
if (type_hint)
{
switch (type_hint->getTypeId())
{
case TypeIndex::IPv6:
return readIPv6ColumnFromBinaryData(arrow_column, column_name);
/// ORC format outputs big integers as binary column, because there is no fixed binary in ORC.
case TypeIndex::Int128:
return readColumnWithBigNumberFromBinaryData<ColumnInt128>(arrow_column, column_name, type_hint);
case TypeIndex::UInt128:
return readColumnWithBigNumberFromBinaryData<ColumnUInt128>(arrow_column, column_name, type_hint);
case TypeIndex::Int256:
return readColumnWithBigNumberFromBinaryData<ColumnInt256>(arrow_column, column_name, type_hint);
case TypeIndex::UInt256:
return readColumnWithBigNumberFromBinaryData<ColumnUInt256>(arrow_column, column_name, type_hint);
/// ORC doesn't support Decimal256 as separate type. We read and write it as binary data.
case TypeIndex::Decimal256:
return readColumnWithBigNumberFromBinaryData<ColumnDecimal<Decimal256>>(arrow_column, column_name, type_hint);
default:;
}
}
return readColumnWithStringData<arrow::BinaryArray>(arrow_column, column_name);
}
case arrow::Type::FIXED_SIZE_BINARY:
{
if (type_hint)
{
switch (type_hint->getTypeId())
{
case TypeIndex::Int128:
return readColumnWithBigIntegerFromFixedBinaryData<Int128>(arrow_column, column_name, type_hint);
case TypeIndex::UInt128:
return readColumnWithBigIntegerFromFixedBinaryData<UInt128>(arrow_column, column_name, type_hint);
case TypeIndex::Int256:
return readColumnWithBigIntegerFromFixedBinaryData<Int256>(arrow_column, column_name, type_hint);
case TypeIndex::UInt256:
return readColumnWithBigIntegerFromFixedBinaryData<UInt256>(arrow_column, column_name, type_hint);
default:;
}
}
return readColumnWithFixedStringData(arrow_column, column_name);
}
case arrow::Type::LARGE_BINARY:
case arrow::Type::LARGE_STRING:
return readColumnWithStringData<arrow::LargeBinaryArray>(arrow_column, column_name);
@ -621,6 +752,14 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
column.type = std::make_shared<DataTypeDateTime>();
return column;
}
case arrow::Type::INT32:
{
/// ORC format doesn't have unsigned integers and we output IPv4 as Int32.
/// We should allow to read it back from Int32.
if (type_hint && isIPv4(type_hint))
return readIPv4ColumnWithInt32Data(arrow_column, column_name);
return readColumnWithNumericData<Int32>(arrow_column, column_name);
}
case arrow::Type::TIMESTAMP:
return readColumnWithTimestampData(arrow_column, column_name);
case arrow::Type::DECIMAL128:
@ -637,10 +776,18 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType();
}
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint, true);
if (skipped)
return {};
if (nested_type_hint && !nested_type_hint->equals(*nested_column.type))
{
/// Cast to target type, because it can happen that type from nested_column
/// cannot be Map key type.
nested_column.column = castColumn(nested_column, nested_type_hint);
nested_column.type = nested_type_hint;
}
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get());
@ -690,7 +837,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
DataTypePtr nested_type_hint;
if (tuple_type_hint)
{
if (tuple_type_hint->haveExplicitNames())
if (tuple_type_hint->haveExplicitNames() && !is_map_nested)
{
auto pos = tuple_type_hint->tryGetPositionByName(field_name);
if (pos)

View File

@ -26,7 +26,6 @@
#include <arrow/util/decimal.h>
#define FOR_INTERNAL_NUMERIC_TYPES(M) \
M(UInt8, arrow::UInt8Builder) \
M(Int8, arrow::Int8Builder) \
M(UInt16, arrow::UInt16Builder) \
M(Int16, arrow::Int16Builder) \
@ -65,8 +64,10 @@ namespace DB
{
{"UInt8", arrow::uint8()},
{"Int8", arrow::int8()},
{"Enum8", arrow::int8()},
{"UInt16", arrow::uint16()},
{"Int16", arrow::int16()},
{"Enum16", arrow::int16()},
{"UInt32", arrow::uint32()},
{"Int32", arrow::int32()},
{"UInt64", arrow::uint64()},
@ -80,6 +81,11 @@ namespace DB
{"String", arrow::binary()},
{"FixedString", arrow::binary()},
{"Int128", arrow::fixed_size_binary(sizeof(Int128))},
{"UInt128", arrow::fixed_size_binary(sizeof(UInt128))},
{"Int256", arrow::fixed_size_binary(sizeof(Int256))},
{"UInt256", arrow::fixed_size_binary(sizeof(UInt256))},
};
@ -148,7 +154,7 @@ namespace DB
}
static void fillArrowArrayWithDateTime64ColumnData(
const DataTypeDateTime64 * type,
const DataTypePtr & type,
ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap,
const String & format_name,
@ -156,11 +162,12 @@ namespace DB
size_t start,
size_t end)
{
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(type.get());
const auto & column = assert_cast<const ColumnDecimal<DateTime64> &>(*write_column);
arrow::TimestampBuilder & builder = assert_cast<arrow::TimestampBuilder &>(*array_builder);
arrow::Status status;
auto scale = type->getScale();
auto scale = datetime64_type->getScale();
bool need_rescale = scale % 3;
auto rescale_multiplier = DecimalUtils::scaleMultiplier<DateTime64::NativeType>(3 - scale % 3);
for (size_t value_i = start; value_i < end; ++value_i)
@ -186,7 +193,7 @@ namespace DB
static void fillArrowArray(
const String & column_name,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const DataTypePtr & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name,
@ -200,7 +207,7 @@ namespace DB
static void fillArrowArrayWithArrayColumnData(
const String & column_name,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const DataTypePtr & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name,
@ -231,7 +238,7 @@ namespace DB
static void fillArrowArrayWithTupleColumnData(
const String & column_name,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const DataTypePtr & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name,
@ -303,7 +310,7 @@ namespace DB
static void fillArrowArrayWithLowCardinalityColumnDataImpl(
const String & column_name,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const DataTypePtr & column_type,
const PaddedPODArray<UInt8> *,
arrow::ArrayBuilder * array_builder,
String format_name,
@ -359,7 +366,7 @@ namespace DB
static void fillArrowArrayWithLowCardinalityColumnData(
const String & column_name,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const DataTypePtr & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name,
@ -541,134 +548,6 @@ namespace DB
}
}
static void fillArrowArray(
const String & column_name,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name,
size_t start,
size_t end,
bool output_string_as_string,
bool output_fixed_string_as_fixed_byte_array,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
{
const String column_type_name = column_type->getFamilyName();
if (column_type->isNullable())
{
const ColumnNullable * column_nullable = assert_cast<const ColumnNullable *>(column.get());
ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
}
else if (isString(column_type))
{
if (output_string_as_string)
fillArrowArrayWithStringColumnData<ColumnString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
else
fillArrowArrayWithStringColumnData<ColumnString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isFixedString(column_type))
{
if (output_fixed_string_as_fixed_byte_array)
fillArrowArrayWithFixedStringColumnData(column, null_bytemap, format_name, array_builder, start, end);
else if (output_string_as_string)
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
else
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isIPv6(column_type))
{
fillArrowArrayWithIPv6ColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isIPv4(column_type))
{
fillArrowArrayWithIPv4ColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isDate(column_type))
{
fillArrowArrayWithDateColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isDateTime(column_type))
{
fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isDate32(column_type))
{
fillArrowArrayWithDate32ColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isArray(column_type))
{
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
}
else if (isTuple(column_type))
{
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
}
else if (column_type->getTypeId() == TypeIndex::LowCardinality)
{
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
}
else if (isMap(column_type))
{
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
}
else if (isDecimal(column_type))
{
auto fill_decimal = [&](const auto & types) -> bool
{
using Types = std::decay_t<decltype(types)>;
using ToDataType = typename Types::LeftType;
if constexpr (
std::is_same_v<ToDataType,DataTypeDecimal<Decimal32>>
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
{
fillArrowArrayWithDecimalColumnData<ToDataType, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
return true;
}
if constexpr (std::is_same_v<ToDataType,DataTypeDecimal<Decimal256>>)
{
fillArrowArrayWithDecimalColumnData<ToDataType, Int256, arrow::Decimal256, arrow::Decimal256Builder>(column, null_bytemap, array_builder, format_name, start, end);
return true;
}
return false;
};
if (!callOnIndexAndDataType<void>(column_type->getTypeId(), fill_decimal))
throw Exception{ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with decimal data with type {}", column_type_name};
}
else if (isDateTime64(column_type))
{
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
fillArrowArrayWithDateTime64ColumnData(datetime64_type, column, null_bytemap, format_name, array_builder, start, end);
}
else if (isBool(column_type))
{
fillArrowArrayWithBoolColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
else if (#CPP_NUMERIC_TYPE == column_type_name) \
{ \
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(column, null_bytemap, format_name, array_builder, start, end); \
}
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
else
{
throw Exception(ErrorCodes::UNKNOWN_TYPE,
"Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type_name, column_name, format_name);
}
}
template <typename DataType, typename FieldType, typename ArrowDecimalType, typename ArrowBuilder>
static void fillArrowArrayWithDecimalColumnData(
ColumnPtr write_column,
@ -697,6 +576,158 @@ namespace DB
checkStatus(status, write_column->getName(), format_name);
}
template <typename ColumnType>
static void fillArrowArrayWithBigIntegerColumnData(
ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap,
const String & format_name,
arrow::ArrayBuilder* array_builder,
size_t start,
size_t end)
{
const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
const auto & internal_data = internal_column.getData();
size_t fixed_length = sizeof(typename ColumnType::ValueType);
arrow::FixedSizeBinaryBuilder & builder = assert_cast<arrow::FixedSizeBinaryBuilder &>(*array_builder);
arrow::Status status;
PaddedPODArray<UInt8> arrow_null_bytemap = revertNullByteMap(null_bytemap, start, end);
const UInt8 * arrow_null_bytemap_raw_ptr = arrow_null_bytemap.empty() ? nullptr : arrow_null_bytemap.data();
const uint8_t * data_start = reinterpret_cast<const uint8_t *>(internal_data.data()) + start * fixed_length;
status = builder.AppendValues(data_start, end - start, reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
checkStatus(status, write_column->getName(), format_name);
}
static void fillArrowArray(
const String & column_name,
ColumnPtr & column,
const DataTypePtr & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name,
size_t start,
size_t end,
bool output_string_as_string,
bool output_fixed_string_as_fixed_byte_array,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
{
const String column_type_name = column_type->getFamilyName();
WhichDataType which(column_type);
switch (column_type->getTypeId())
{
case TypeIndex::Nullable:
{
const ColumnNullable * column_nullable = assert_cast<const ColumnNullable *>(column.get());
ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
break;
}
case TypeIndex::String:
{
if (output_string_as_string)
fillArrowArrayWithStringColumnData<ColumnString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
else
fillArrowArrayWithStringColumnData<ColumnString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
break;
}
case TypeIndex::FixedString:
{
if (output_fixed_string_as_fixed_byte_array)
fillArrowArrayWithFixedStringColumnData(column, null_bytemap, format_name, array_builder, start, end);
else if (output_string_as_string)
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
else
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
break;
}
case TypeIndex::IPv6:
fillArrowArrayWithIPv6ColumnData(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::IPv4:
fillArrowArrayWithIPv4ColumnData(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::Date:
fillArrowArrayWithDateColumnData(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::DateTime:
fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::Date32:
fillArrowArrayWithDate32ColumnData(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::Array:
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
break;
case TypeIndex::Tuple:
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
break;
case TypeIndex::LowCardinality:
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
break;
case TypeIndex::Map:
{
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
break;
}
case TypeIndex::Decimal32:
fillArrowArrayWithDecimalColumnData<DataTypeDecimal32, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
break;
case TypeIndex::Decimal64:
fillArrowArrayWithDecimalColumnData<DataTypeDecimal64, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
break;
case TypeIndex::Decimal128:
fillArrowArrayWithDecimalColumnData<DataTypeDecimal128, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
break;
case TypeIndex::Decimal256:
fillArrowArrayWithDecimalColumnData<DataTypeDecimal256, Int256, arrow::Decimal256, arrow::Decimal256Builder>(column, null_bytemap, array_builder, format_name, start, end);
break;
case TypeIndex::DateTime64:
fillArrowArrayWithDateTime64ColumnData(column_type, column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::UInt8:
{
if (isBool(column_type))
fillArrowArrayWithBoolColumnData(column, null_bytemap, format_name, array_builder, start, end);
else
fillArrowArrayWithNumericColumnData<UInt8, arrow::UInt8Builder>(column, null_bytemap, format_name, array_builder, start, end);
break;
}
case TypeIndex::Enum8:
fillArrowArrayWithNumericColumnData<Int8, arrow::Int8Builder>(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::Enum16:
fillArrowArrayWithNumericColumnData<Int16, arrow::Int16Builder>(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::Int128:
fillArrowArrayWithBigIntegerColumnData<ColumnInt128>(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::UInt128:
fillArrowArrayWithBigIntegerColumnData<ColumnUInt128>(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::Int256:
fillArrowArrayWithBigIntegerColumnData<ColumnInt256>(column, null_bytemap, format_name, array_builder, start, end);
break;
case TypeIndex::UInt256:
fillArrowArrayWithBigIntegerColumnData<ColumnUInt256>(column, null_bytemap, format_name, array_builder, start, end);
break;
#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
case TypeIndex::CPP_NUMERIC_TYPE: \
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(column, null_bytemap, format_name, array_builder, start, end); \
break;
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
default:
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type_name, column_name, format_name);
}
}
static std::shared_ptr<arrow::DataType> getArrowTypeForLowCardinalityIndexes(ColumnPtr indexes_column)
{
/// Arrow docs recommend preferring signed integers over unsigned integers for representing dictionary indices.

View File

@ -93,10 +93,12 @@ std::unique_ptr<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr &
return orc::createPrimitiveType(orc::TypeKind::BOOLEAN);
return orc::createPrimitiveType(orc::TypeKind::BYTE);
}
case TypeIndex::Enum8: [[fallthrough]];
case TypeIndex::Int8:
{
return orc::createPrimitiveType(orc::TypeKind::BYTE);
}
case TypeIndex::Enum16: [[fallthrough]];
case TypeIndex::UInt16: [[fallthrough]];
case TypeIndex::Int16:
{
@ -131,6 +133,12 @@ std::unique_ptr<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr &
{
return orc::createPrimitiveType(orc::TypeKind::TIMESTAMP);
}
case TypeIndex::Int128: [[fallthrough]];
case TypeIndex::UInt128: [[fallthrough]];
case TypeIndex::Int256: [[fallthrough]];
case TypeIndex::UInt256: [[fallthrough]];
case TypeIndex::Decimal256:
return orc::createPrimitiveType(orc::TypeKind::BINARY);
case TypeIndex::FixedString: [[fallthrough]];
case TypeIndex::String:
{
@ -309,6 +317,7 @@ void ORCBlockOutputFormat::writeColumn(
switch (type->getTypeId())
{
case TypeIndex::Enum8: [[fallthrough]];
case TypeIndex::Int8:
{
/// Note: Explicit cast to avoid clang-tidy error: 'signed char' to 'long' conversion; consider casting to 'unsigned char' first.
@ -320,6 +329,7 @@ void ORCBlockOutputFormat::writeColumn(
writeNumbers<UInt8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt8 & value){ return value; });
break;
}
case TypeIndex::Enum16: [[fallthrough]];
case TypeIndex::Int16:
{
writeNumbers<Int16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int16 & value){ return value; });
@ -357,6 +367,26 @@ void ORCBlockOutputFormat::writeColumn(
writeNumbers<UInt64,orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt64 & value){ return value; });
break;
}
case TypeIndex::Int128:
{
writeStrings<ColumnInt128>(orc_column, column, null_bytemap);
break;
}
case TypeIndex::UInt128:
{
writeStrings<ColumnUInt128>(orc_column, column, null_bytemap);
break;
}
case TypeIndex::Int256:
{
writeStrings<ColumnInt256>(orc_column, column, null_bytemap);
break;
}
case TypeIndex::UInt256:
{
writeStrings<ColumnUInt256>(orc_column, column, null_bytemap);
break;
}
case TypeIndex::Float32:
{
writeNumbers<Float32, orc::DoubleVectorBatch>(orc_column, column, null_bytemap, [](const Float32 & value){ return value; });
@ -432,6 +462,11 @@ void ORCBlockOutputFormat::writeColumn(
[](Int128 value){ return orc::Int128(value >> 64, (value << 64) >> 64); });
break;
}
case TypeIndex::Decimal256:
{
writeStrings<ColumnDecimal<Decimal256>>(orc_column, column, null_bytemap);
break;
}
case TypeIndex::Nullable:
{
const auto & nullable_column = assert_cast<const ColumnNullable &>(column);

View File

@ -0,0 +1,5 @@
42 42 42 42 a b
42 42 42 42 a b
42 42 42 42 a b 42.42 0.0.0.0
\N
\N

View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_LOCAL -q "select 42::Int128 as c1, 42::UInt128 as c2, 42::Int256 as c3, 42::UInt256 as c4, 'a'::Enum8('a' = 1) as c5, 'b'::Enum16('b' = 1) as c6 format Parquet" | $CLICKHOUSE_LOCAL --input-format Parquet --structure="c1 Int128, c2 UInt128, c3 Int256, c4 UInt256, c5 Enum8('a' = 1), c6 Enum16('b' = 1)" -q "select * from table"
$CLICKHOUSE_LOCAL -q "select 42::Int128 as c1, 42::UInt128 as c2, 42::Int256 as c3, 42::UInt256 as c4, 'a'::Enum8('a' = 1) as c5, 'b'::Enum16('b' = 1) as c6 format Arrow" | $CLICKHOUSE_LOCAL --input-format Arrow --structure="c1 Int128, c2 UInt128, c3 Int256, c4 UInt256, c5 Enum8('a' = 1), c6 Enum16('b' = 1)" -q "select * from table"
$CLICKHOUSE_LOCAL -q "select 42::Int128 as c1, 42::UInt128 as c2, 42::Int256 as c3, 42::UInt256 as c4, 'a'::Enum8('a' = 1) as c5, 'b'::Enum16('b' = 1) as c6, 42.42::Decimal256(2) as c7, '0.0.0.0'::IPv4 as c8 format ORC" | $CLICKHOUSE_LOCAL --input-format ORC --structure="c1 Int128, c2 UInt128, c3 Int256, c4 UInt256, c5 Enum8('a' = 1), c6 Enum16('b' = 1), c7 Decimal256(2), c8 IPv4" -q "select * from table"
$CLICKHOUSE_LOCAL -q "select NULL::Nullable(IPv6) as x format ORC" | $CLICKHOUSE_LOCAL --input-format ORC --structure="x Nullable(IPv6)" -q "select * from table"
$CLICKHOUSE_LOCAL -q "select NULL::Nullable(UInt256) as x format ORC" | $CLICKHOUSE_LOCAL --input-format ORC --structure="x Nullable(UInt256)" -q "select * from table"