mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
Support more ClickHouse types in ORC/Arrow/Parquet formats
This commit is contained in:
parent
f7e29d1e92
commit
bc56c02858
@ -205,7 +205,7 @@ Differs from the `TabSeparated` format in that the column names are written in t
|
|||||||
|
|
||||||
During parsing, the first row is expected to contain the column names. You can use column names to determine their position and to check their correctness.
|
During parsing, the first row is expected to contain the column names. You can use column names to determine their position and to check their correctness.
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from the input data will be mapped to the columns of the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from the input data will be mapped to the columns of the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -217,7 +217,7 @@ This format is also available under the name `TSVWithNames`.
|
|||||||
|
|
||||||
Differs from the `TabSeparated` format in that the column names are written to the first row, while the column types are in the second row.
|
Differs from the `TabSeparated` format in that the column names are written to the first row, while the column types are in the second row.
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from the input data will be mapped to the columns in the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from the input data will be mapped to the columns in the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -470,7 +470,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
|
|||||||
|
|
||||||
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -480,7 +480,7 @@ Otherwise, the first row will be skipped.
|
|||||||
|
|
||||||
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -500,7 +500,7 @@ There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [Templat
|
|||||||
|
|
||||||
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -510,7 +510,7 @@ Otherwise, the first row will be skipped.
|
|||||||
|
|
||||||
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -969,7 +969,7 @@ Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yie
|
|||||||
|
|
||||||
Differs from `JSONCompactEachRow` format in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
Differs from `JSONCompactEachRow` format in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -979,7 +979,7 @@ Otherwise, the first row will be skipped.
|
|||||||
|
|
||||||
Differs from `JSONCompactEachRow` format in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
Differs from `JSONCompactEachRow` format in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -991,7 +991,7 @@ the types from input data will be compared with the types of the corresponding c
|
|||||||
|
|
||||||
Differs from `JSONCompactStringsEachRow` in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
Differs from `JSONCompactStringsEachRow` in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -1001,7 +1001,7 @@ Otherwise, the first row will be skipped.
|
|||||||
|
|
||||||
Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -1120,7 +1120,7 @@ CREATE TABLE IF NOT EXISTS example_table
|
|||||||
- If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type).
|
- If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type).
|
||||||
- If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`.
|
- If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`.
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`.
|
When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
@ -1450,7 +1450,7 @@ Similar to [RowBinary](#rowbinary), but with added header:
|
|||||||
- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N)
|
- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N)
|
||||||
- N `String`s specifying column names
|
- N `String`s specifying column names
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -1464,7 +1464,7 @@ Similar to [RowBinary](#rowbinary), but with added header:
|
|||||||
- N `String`s specifying column names
|
- N `String`s specifying column names
|
||||||
- N `String`s specifying column types
|
- N `String`s specifying column types
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
|
||||||
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
the columns from input data will be mapped to the columns from the table by their names, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
Otherwise, the first row will be skipped.
|
Otherwise, the first row will be skipped.
|
||||||
@ -1915,7 +1915,7 @@ SET format_avro_schema_registry_url = 'http://schema-registry';
|
|||||||
SELECT * FROM topic1_stream;
|
SELECT * FROM topic1_stream;
|
||||||
```
|
```
|
||||||
|
|
||||||
:::warning
|
:::note
|
||||||
Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain its value after a restart. You can also use the `format_avro_schema_registry_url` setting of the `Kafka` table engine.
|
Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain its value after a restart. You can also use the `format_avro_schema_registry_url` setting of the `Kafka` table engine.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
@ -1928,12 +1928,12 @@ Setting `format_avro_schema_registry_url` needs to be configured in `users.xml`
|
|||||||
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
|
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
|
||||||
|
|
||||||
| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) |
|
| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) |
|
||||||
|----------------------------------------------------|-----------------------------------------------------------------|------------------------------|
|
|-----------------------------------------------|------------------------------------------------------------------------------------------------------------|-------------------------------|
|
||||||
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
|
||||||
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
|
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
|
||||||
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
|
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` |
|
||||||
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
|
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
|
||||||
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
|
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` |
|
||||||
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
|
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
|
||||||
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
|
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
|
||||||
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
|
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
|
||||||
@ -1950,7 +1950,8 @@ The table below shows supported data types and how they match ClickHouse [data t
|
|||||||
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
|
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
|
||||||
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
|
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
|
||||||
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
|
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
|
||||||
| `FIXED_LENGTH_BYTE_ARRAY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` |
|
| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` |
|
||||||
|
| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY`           | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md)                             | `FIXED_LENGTH_BYTE_ARRAY`     |
|
||||||
|
|
||||||
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
|
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
|
||||||
|
|
||||||
@ -1997,12 +1998,12 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t
|
|||||||
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
|
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
|
||||||
|
|
||||||
| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) |
|
| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) |
|
||||||
|-----------------------------------------|-----------------------------------------------------------------|----------------------------|
|
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|----------------------------|
|
||||||
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
|
||||||
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
|
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
|
||||||
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
|
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` |
|
||||||
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
|
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
|
||||||
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
|
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` |
|
||||||
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
|
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
|
||||||
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
|
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
|
||||||
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
|
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
|
||||||
@ -2021,6 +2022,7 @@ The table below shows supported data types and how they match ClickHouse [data t
|
|||||||
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
|
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
|
||||||
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
|
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
|
||||||
| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` |
|
| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` |
|
||||||
|
| `FIXED_SIZE_BINARY`, `BINARY`           | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md)                             | `FIXED_SIZE_BINARY`        |
|
||||||
|
|
||||||
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
|
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
|
||||||
|
|
||||||
@ -2070,12 +2072,12 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Arrow" > {filenam
|
|||||||
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
|
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
|
||||||
|
|
||||||
| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
|
| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
|
||||||
|---------------------------------------|---------------------------------------------------------------|--------------------------|
|
|---------------------------------------|-------------------------------------------------------------------------------------------------------------------|--------------------------|
|
||||||
| `Boolean` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `Boolean` |
|
| `Boolean` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `Boolean` |
|
||||||
| `Tinyint` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `Tinyint` |
|
| `Tinyint` | [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `Tinyint` |
|
||||||
| `Smallint` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `Smallint` |
|
| `Smallint` | [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `Smallint` |
|
||||||
| `Int` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
|
| `Int` | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
|
||||||
| `Bigint` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `Bigint` |
|
| `Bigint`                              | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)                                                     | `Bigint`                 |
|
||||||
| `Float` | [Float32](/docs/en/sql-reference/data-types/float.md) | `Float` |
|
| `Float` | [Float32](/docs/en/sql-reference/data-types/float.md) | `Float` |
|
||||||
| `Double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `Double` |
|
| `Double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `Double` |
|
||||||
| `Decimal` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `Decimal` |
|
| `Decimal` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `Decimal` |
|
||||||
@ -2085,7 +2087,10 @@ The table below shows supported data types and how they match ClickHouse [data t
|
|||||||
| `List` | [Array](/docs/en/sql-reference/data-types/array.md) | `List` |
|
| `List` | [Array](/docs/en/sql-reference/data-types/array.md) | `List` |
|
||||||
| `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` |
|
| `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` |
|
||||||
| `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` |
|
| `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` |
|
||||||
| `-` | [IPv4](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
|
| `Int`                                 | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md)                                                         | `Int`                    |
|
||||||
|
| `Binary` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `Binary` |
|
||||||
|
| `Binary` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `Binary` |
|
||||||
|
| `Binary` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `Binary` |
|
||||||
|
|
||||||
Other types are not supported.
|
Other types are not supported.
|
||||||
|
|
||||||
|
@ -44,7 +44,6 @@
|
|||||||
M(arrow::Type::UINT8, DB::UInt8) \
|
M(arrow::Type::UINT8, DB::UInt8) \
|
||||||
M(arrow::Type::INT8, DB::Int8) \
|
M(arrow::Type::INT8, DB::Int8) \
|
||||||
M(arrow::Type::INT16, DB::Int16) \
|
M(arrow::Type::INT16, DB::Int16) \
|
||||||
M(arrow::Type::INT32, DB::Int32) \
|
|
||||||
M(arrow::Type::UINT64, DB::UInt64) \
|
M(arrow::Type::UINT64, DB::UInt64) \
|
||||||
M(arrow::Type::INT64, DB::Int64) \
|
M(arrow::Type::INT64, DB::Int64) \
|
||||||
M(arrow::Type::DURATION, DB::Int64) \
|
M(arrow::Type::DURATION, DB::Int64) \
|
||||||
@ -105,6 +104,7 @@ static ColumnWithTypeAndName readColumnWithNumericData(std::shared_ptr<arrow::Ch
|
|||||||
template <typename ArrowArray>
|
template <typename ArrowArray>
|
||||||
static ColumnWithTypeAndName readColumnWithStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
|
static ColumnWithTypeAndName readColumnWithStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
|
||||||
{
|
{
|
||||||
|
readColumnWithNumericData<Int128>(arrow_column, column_name);
|
||||||
auto internal_type = std::make_shared<DataTypeString>();
|
auto internal_type = std::make_shared<DataTypeString>();
|
||||||
auto internal_column = internal_type->createColumn();
|
auto internal_column = internal_type->createColumn();
|
||||||
PaddedPODArray<UInt8> & column_chars_t = assert_cast<ColumnString &>(*internal_column).getChars();
|
PaddedPODArray<UInt8> & column_chars_t = assert_cast<ColumnString &>(*internal_column).getChars();
|
||||||
@ -165,6 +165,73 @@ static ColumnWithTypeAndName readColumnWithFixedStringData(std::shared_ptr<arrow
|
|||||||
return {std::move(internal_column), std::move(internal_type), column_name};
|
return {std::move(internal_column), std::move(internal_type), column_name};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads an Arrow fixed-size-binary chunked column into a ColumnVector<ValueType>.
///
/// Each chunk's raw value buffer is reinterpreted as an array of ValueType and appended
/// to a column created from `column_type`.
///
/// \param arrow_column  Arrow column; its type is cast to arrow::FixedSizeBinaryType.
/// \param column_name   Name stored in the returned ColumnWithTypeAndName.
/// \param column_type   Data type used to create the destination column; it is also returned as the result type.
/// \return {filled column, column_type, column_name}.
/// \throws Exception(ErrorCodes::BAD_ARGUMENTS) when the Arrow type's byte_width() differs from sizeof(ValueType).
///
/// NOTE(review): values are copied as raw bytes; no null-slot check or byte-order conversion
/// is visible in this function - confirm that callers handle nullability and endianness.
template <typename ValueType>
|
||||||
|
static ColumnWithTypeAndName readColumnWithBigIntegerFromFixedBinaryData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name, const DataTypePtr & column_type)
|
||||||
|
{
|
||||||
|
const auto * fixed_type = assert_cast<arrow::FixedSizeBinaryType *>(arrow_column->type().get());
|
||||||
|
size_t fixed_len = fixed_type->byte_width();
|
||||||
|
if (fixed_len != sizeof(ValueType))
|
||||||
|
throw Exception(
|
||||||
|
ErrorCodes::BAD_ARGUMENTS,
|
||||||
|
"Cannot insert data into {} column from fixed size binary, expected data with size {}, got {}",
|
||||||
|
column_type->getName(),
|
||||||
|
sizeof(ValueType),
|
||||||
|
fixed_len);
|
||||||
|
|
||||||
|
auto internal_column = column_type->createColumn();
|
||||||
|
auto & data = assert_cast<ColumnVector<ValueType> &>(*internal_column).getData();
|
||||||
|
/// Reserve space for the whole column once so the per-chunk inserts below can use insert_assume_reserved.
data.reserve(arrow_column->length());
|
||||||
|
|
||||||
|
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
|
||||||
|
{
|
||||||
|
/// dynamic_cast to a reference throws std::bad_cast if the chunk is not a FixedSizeBinaryArray.
arrow::FixedSizeBinaryArray & chunk = dynamic_cast<arrow::FixedSizeBinaryArray &>(*(arrow_column->chunk(chunk_i)));
|
||||||
|
const auto * raw_data = reinterpret_cast<const ValueType *>(chunk.raw_values());
|
||||||
|
data.insert_assume_reserved(raw_data, raw_data + chunk.length());
|
||||||
|
}
|
||||||
|

|
||||||
|
return {std::move(internal_column), column_type, column_name};
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ColumnType, typename ValueType = typename ColumnType::ValueType>
|
||||||
|
static ColumnWithTypeAndName readColumnWithBigNumberFromBinaryData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name, const DataTypePtr & column_type)
|
||||||
|
{
|
||||||
|
size_t total_size = 0;
|
||||||
|
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
|
||||||
|
{
|
||||||
|
auto & chunk = dynamic_cast<arrow::BinaryArray &>(*(arrow_column->chunk(chunk_i)));
|
||||||
|
const size_t chunk_length = chunk.length();
|
||||||
|
|
||||||
|
for (size_t i = 0; i != chunk_length; ++i)
|
||||||
|
{
|
||||||
|
if (!chunk.IsNull(i) && chunk.value_length(i) != sizeof(ValueType))
|
||||||
|
throw Exception(
|
||||||
|
ErrorCodes::BAD_ARGUMENTS,
|
||||||
|
"Cannot insert data into {} column from binary value, expected data with size {}, got {}",
|
||||||
|
column_type->getName(),
|
||||||
|
sizeof(ValueType),
|
||||||
|
chunk.value_length(i));
|
||||||
|
total_size += chunk_length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto internal_column = column_type->createColumn();
|
||||||
|
auto & integer_column = assert_cast<ColumnType &>(*internal_column);
|
||||||
|
integer_column.reserve(total_size);
|
||||||
|
|
||||||
|
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
|
||||||
|
{
|
||||||
|
auto & chunk = dynamic_cast<arrow::BinaryArray &>(*(arrow_column->chunk(chunk_i)));
|
||||||
|
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
|
||||||
|
{
|
||||||
|
if (chunk.IsNull(value_i))
|
||||||
|
integer_column.insertDefault();
|
||||||
|
else
|
||||||
|
integer_column.insertData(chunk.Value(value_i).data(), chunk.Value(value_i).size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {std::move(internal_column), column_type, column_name};
|
||||||
|
}
|
||||||
|
|
||||||
static ColumnWithTypeAndName readColumnWithBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
|
static ColumnWithTypeAndName readColumnWithBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
|
||||||
{
|
{
|
||||||
auto internal_type = DataTypeFactory::instance().get("Bool");
|
auto internal_type = DataTypeFactory::instance().get("Bool");
|
||||||
@ -537,7 +604,7 @@ static ColumnWithTypeAndName readIPv6ColumnFromBinaryData(std::shared_ptr<arrow:
|
|||||||
for (size_t i = 0; i != chunk_length; ++i)
|
for (size_t i = 0; i != chunk_length; ++i)
|
||||||
{
|
{
|
||||||
/// If at least one value size is not 16 bytes, fallback to reading String column and further cast to IPv6.
|
/// If at least one value size is not 16 bytes, fallback to reading String column and further cast to IPv6.
|
||||||
if (chunk.value_length(i) != sizeof(IPv6))
|
if (!chunk.IsNull(i) && chunk.value_length(i) != sizeof(IPv6))
|
||||||
return readColumnWithStringData<arrow::BinaryArray>(arrow_column, column_name);
|
return readColumnWithStringData<arrow::BinaryArray>(arrow_column, column_name);
|
||||||
}
|
}
|
||||||
total_size += chunk_length;
|
total_size += chunk_length;
|
||||||
@ -545,14 +612,40 @@ static ColumnWithTypeAndName readIPv6ColumnFromBinaryData(std::shared_ptr<arrow:
|
|||||||
|
|
||||||
auto internal_type = std::make_shared<DataTypeIPv6>();
|
auto internal_type = std::make_shared<DataTypeIPv6>();
|
||||||
auto internal_column = internal_type->createColumn();
|
auto internal_column = internal_type->createColumn();
|
||||||
auto & data = assert_cast<ColumnIPv6 &>(*internal_column).getData();
|
auto & ipv6_column = assert_cast<ColumnIPv6 &>(*internal_column);
|
||||||
data.reserve(total_size * sizeof(IPv6));
|
ipv6_column.reserve(total_size);
|
||||||
|
|
||||||
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
|
for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
|
||||||
{
|
{
|
||||||
auto & chunk = dynamic_cast<arrow::BinaryArray &>(*(arrow_column->chunk(chunk_i)));
|
auto & chunk = dynamic_cast<arrow::BinaryArray &>(*(arrow_column->chunk(chunk_i)));
|
||||||
const auto * raw_data = reinterpret_cast<const IPv6 *>(chunk.raw_data() + chunk.raw_value_offsets()[0]);
|
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
|
||||||
data.insert_assume_reserved(raw_data, raw_data + chunk.length());
|
{
|
||||||
|
if (chunk.IsNull(value_i))
|
||||||
|
ipv6_column.insertDefault();
|
||||||
|
else
|
||||||
|
ipv6_column.insertData(chunk.Value(value_i).data(), chunk.Value(value_i).size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {std::move(internal_column), std::move(internal_type), column_name};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds an IPv4 column from an Arrow column by copying the values buffer of every chunk,
/// reinterpreting each element as IPv4. Empty chunks are skipped. Null bitmaps are not consulted here.
static ColumnWithTypeAndName readIPv4ColumnWithInt32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{
    auto ipv4_type = std::make_shared<DataTypeIPv4>();
    auto ipv4_column = ipv4_type->createColumn();
    auto & ipv4_values = static_cast<ColumnIPv4 &>(*ipv4_column).getData();
    ipv4_values.reserve(arrow_column->length());

    const int chunks_count = arrow_column->num_chunks();
    for (int idx = 0; idx < chunks_count; ++idx)
    {
        std::shared_ptr<arrow::Array> current = arrow_column->chunk(idx);
        const auto rows = current->length();
        if (rows != 0)
        {
            /// buffers[0] is a null bitmap and buffers[1] are actual values
            std::shared_ptr<arrow::Buffer> values_buffer = current->data()->buffers[1];
            /// The chunk may be a slice, so skip `offset()` elements from the start of the buffer.
            const auto * first = reinterpret_cast<const IPv4 *>(values_buffer->data()) + current->offset();
            ipv4_values.insert_assume_reserved(first, first + rows);
        }
    }

    return {std::move(ipv4_column), std::move(ipv4_type), column_name};
}
|
||||||
@ -566,7 +659,8 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
|||||||
bool allow_null_type,
|
bool allow_null_type,
|
||||||
bool skip_columns_with_unsupported_types,
|
bool skip_columns_with_unsupported_types,
|
||||||
bool & skipped,
|
bool & skipped,
|
||||||
DataTypePtr type_hint = nullptr)
|
DataTypePtr type_hint = nullptr,
|
||||||
|
bool is_map_nested = false)
|
||||||
{
|
{
|
||||||
if (!is_nullable && (arrow_column->null_count() || (type_hint && type_hint->isNullable())) && arrow_column->type()->id() != arrow::Type::LIST
|
if (!is_nullable && (arrow_column->null_count() || (type_hint && type_hint->isNullable())) && arrow_column->type()->id() != arrow::Type::LIST
|
||||||
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT &&
|
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT &&
|
||||||
@ -589,12 +683,49 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
|||||||
case arrow::Type::STRING:
|
case arrow::Type::STRING:
|
||||||
case arrow::Type::BINARY:
|
case arrow::Type::BINARY:
|
||||||
{
|
{
|
||||||
if (type_hint && isIPv6(type_hint))
|
if (type_hint)
|
||||||
|
{
|
||||||
|
switch (type_hint->getTypeId())
|
||||||
|
{
|
||||||
|
case TypeIndex::IPv6:
|
||||||
return readIPv6ColumnFromBinaryData(arrow_column, column_name);
|
return readIPv6ColumnFromBinaryData(arrow_column, column_name);
|
||||||
|
/// ORC format outputs big integers as binary column, because there is no fixed binary in ORC.
|
||||||
|
case TypeIndex::Int128:
|
||||||
|
return readColumnWithBigNumberFromBinaryData<ColumnInt128>(arrow_column, column_name, type_hint);
|
||||||
|
case TypeIndex::UInt128:
|
||||||
|
return readColumnWithBigNumberFromBinaryData<ColumnUInt128>(arrow_column, column_name, type_hint);
|
||||||
|
case TypeIndex::Int256:
|
||||||
|
return readColumnWithBigNumberFromBinaryData<ColumnInt256>(arrow_column, column_name, type_hint);
|
||||||
|
case TypeIndex::UInt256:
|
||||||
|
return readColumnWithBigNumberFromBinaryData<ColumnUInt256>(arrow_column, column_name, type_hint);
|
||||||
|
/// ORC doesn't support Decimal256 as separate type. We read and write it as binary data.
|
||||||
|
case TypeIndex::Decimal256:
|
||||||
|
return readColumnWithBigNumberFromBinaryData<ColumnDecimal<Decimal256>>(arrow_column, column_name, type_hint);
|
||||||
|
default:;
|
||||||
|
}
|
||||||
|
}
|
||||||
return readColumnWithStringData<arrow::BinaryArray>(arrow_column, column_name);
|
return readColumnWithStringData<arrow::BinaryArray>(arrow_column, column_name);
|
||||||
}
|
}
|
||||||
case arrow::Type::FIXED_SIZE_BINARY:
|
case arrow::Type::FIXED_SIZE_BINARY:
|
||||||
|
{
|
||||||
|
if (type_hint)
|
||||||
|
{
|
||||||
|
switch (type_hint->getTypeId())
|
||||||
|
{
|
||||||
|
case TypeIndex::Int128:
|
||||||
|
return readColumnWithBigIntegerFromFixedBinaryData<Int128>(arrow_column, column_name, type_hint);
|
||||||
|
case TypeIndex::UInt128:
|
||||||
|
return readColumnWithBigIntegerFromFixedBinaryData<UInt128>(arrow_column, column_name, type_hint);
|
||||||
|
case TypeIndex::Int256:
|
||||||
|
return readColumnWithBigIntegerFromFixedBinaryData<Int256>(arrow_column, column_name, type_hint);
|
||||||
|
case TypeIndex::UInt256:
|
||||||
|
return readColumnWithBigIntegerFromFixedBinaryData<UInt256>(arrow_column, column_name, type_hint);
|
||||||
|
default:;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return readColumnWithFixedStringData(arrow_column, column_name);
|
return readColumnWithFixedStringData(arrow_column, column_name);
|
||||||
|
}
|
||||||
case arrow::Type::LARGE_BINARY:
|
case arrow::Type::LARGE_BINARY:
|
||||||
case arrow::Type::LARGE_STRING:
|
case arrow::Type::LARGE_STRING:
|
||||||
return readColumnWithStringData<arrow::LargeBinaryArray>(arrow_column, column_name);
|
return readColumnWithStringData<arrow::LargeBinaryArray>(arrow_column, column_name);
|
||||||
@ -621,6 +752,14 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
|||||||
column.type = std::make_shared<DataTypeDateTime>();
|
column.type = std::make_shared<DataTypeDateTime>();
|
||||||
return column;
|
return column;
|
||||||
}
|
}
|
||||||
|
case arrow::Type::INT32:
|
||||||
|
{
|
||||||
|
/// ORC format doesn't have unsigned integers and we output IPv4 as Int32.
|
||||||
|
/// We should allow to read it back from Int32.
|
||||||
|
if (type_hint && isIPv4(type_hint))
|
||||||
|
return readIPv4ColumnWithInt32Data(arrow_column, column_name);
|
||||||
|
return readColumnWithNumericData<Int32>(arrow_column, column_name);
|
||||||
|
}
|
||||||
case arrow::Type::TIMESTAMP:
|
case arrow::Type::TIMESTAMP:
|
||||||
return readColumnWithTimestampData(arrow_column, column_name);
|
return readColumnWithTimestampData(arrow_column, column_name);
|
||||||
case arrow::Type::DECIMAL128:
|
case arrow::Type::DECIMAL128:
|
||||||
@ -637,10 +776,18 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
|||||||
nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType();
|
nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType();
|
||||||
}
|
}
|
||||||
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
|
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
|
||||||
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
|
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint, true);
|
||||||
if (skipped)
|
if (skipped)
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
|
if (nested_type_hint && !nested_type_hint->equals(*nested_column.type))
|
||||||
|
{
|
||||||
|
/// Cast to target type, because it can happen that type from nested_column
|
||||||
|
/// cannot be Map key type.
|
||||||
|
nested_column.column = castColumn(nested_column, nested_type_hint);
|
||||||
|
nested_column.type = nested_type_hint;
|
||||||
|
}
|
||||||
|
|
||||||
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
|
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
|
||||||
|
|
||||||
const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get());
|
const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get());
|
||||||
@ -690,7 +837,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
|||||||
DataTypePtr nested_type_hint;
|
DataTypePtr nested_type_hint;
|
||||||
if (tuple_type_hint)
|
if (tuple_type_hint)
|
||||||
{
|
{
|
||||||
if (tuple_type_hint->haveExplicitNames())
|
if (tuple_type_hint->haveExplicitNames() && !is_map_nested)
|
||||||
{
|
{
|
||||||
auto pos = tuple_type_hint->tryGetPositionByName(field_name);
|
auto pos = tuple_type_hint->tryGetPositionByName(field_name);
|
||||||
if (pos)
|
if (pos)
|
||||||
|
@ -26,7 +26,6 @@
|
|||||||
#include <arrow/util/decimal.h>
|
#include <arrow/util/decimal.h>
|
||||||
|
|
||||||
#define FOR_INTERNAL_NUMERIC_TYPES(M) \
|
#define FOR_INTERNAL_NUMERIC_TYPES(M) \
|
||||||
M(UInt8, arrow::UInt8Builder) \
|
|
||||||
M(Int8, arrow::Int8Builder) \
|
M(Int8, arrow::Int8Builder) \
|
||||||
M(UInt16, arrow::UInt16Builder) \
|
M(UInt16, arrow::UInt16Builder) \
|
||||||
M(Int16, arrow::Int16Builder) \
|
M(Int16, arrow::Int16Builder) \
|
||||||
@ -65,8 +64,10 @@ namespace DB
|
|||||||
{
|
{
|
||||||
{"UInt8", arrow::uint8()},
|
{"UInt8", arrow::uint8()},
|
||||||
{"Int8", arrow::int8()},
|
{"Int8", arrow::int8()},
|
||||||
|
{"Enum8", arrow::int8()},
|
||||||
{"UInt16", arrow::uint16()},
|
{"UInt16", arrow::uint16()},
|
||||||
{"Int16", arrow::int16()},
|
{"Int16", arrow::int16()},
|
||||||
|
{"Enum16", arrow::int16()},
|
||||||
{"UInt32", arrow::uint32()},
|
{"UInt32", arrow::uint32()},
|
||||||
{"Int32", arrow::int32()},
|
{"Int32", arrow::int32()},
|
||||||
{"UInt64", arrow::uint64()},
|
{"UInt64", arrow::uint64()},
|
||||||
@ -80,6 +81,11 @@ namespace DB
|
|||||||
|
|
||||||
{"String", arrow::binary()},
|
{"String", arrow::binary()},
|
||||||
{"FixedString", arrow::binary()},
|
{"FixedString", arrow::binary()},
|
||||||
|
|
||||||
|
{"Int128", arrow::fixed_size_binary(sizeof(Int128))},
|
||||||
|
{"UInt128", arrow::fixed_size_binary(sizeof(UInt128))},
|
||||||
|
{"Int256", arrow::fixed_size_binary(sizeof(Int256))},
|
||||||
|
{"UInt256", arrow::fixed_size_binary(sizeof(UInt256))},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -148,7 +154,7 @@ namespace DB
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void fillArrowArrayWithDateTime64ColumnData(
|
static void fillArrowArrayWithDateTime64ColumnData(
|
||||||
const DataTypeDateTime64 * type,
|
const DataTypePtr & type,
|
||||||
ColumnPtr write_column,
|
ColumnPtr write_column,
|
||||||
const PaddedPODArray<UInt8> * null_bytemap,
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
const String & format_name,
|
const String & format_name,
|
||||||
@ -156,11 +162,12 @@ namespace DB
|
|||||||
size_t start,
|
size_t start,
|
||||||
size_t end)
|
size_t end)
|
||||||
{
|
{
|
||||||
|
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(type.get());
|
||||||
const auto & column = assert_cast<const ColumnDecimal<DateTime64> &>(*write_column);
|
const auto & column = assert_cast<const ColumnDecimal<DateTime64> &>(*write_column);
|
||||||
arrow::TimestampBuilder & builder = assert_cast<arrow::TimestampBuilder &>(*array_builder);
|
arrow::TimestampBuilder & builder = assert_cast<arrow::TimestampBuilder &>(*array_builder);
|
||||||
arrow::Status status;
|
arrow::Status status;
|
||||||
|
|
||||||
auto scale = type->getScale();
|
auto scale = datetime64_type->getScale();
|
||||||
bool need_rescale = scale % 3;
|
bool need_rescale = scale % 3;
|
||||||
auto rescale_multiplier = DecimalUtils::scaleMultiplier<DateTime64::NativeType>(3 - scale % 3);
|
auto rescale_multiplier = DecimalUtils::scaleMultiplier<DateTime64::NativeType>(3 - scale % 3);
|
||||||
for (size_t value_i = start; value_i < end; ++value_i)
|
for (size_t value_i = start; value_i < end; ++value_i)
|
||||||
@ -186,7 +193,7 @@ namespace DB
|
|||||||
static void fillArrowArray(
|
static void fillArrowArray(
|
||||||
const String & column_name,
|
const String & column_name,
|
||||||
ColumnPtr & column,
|
ColumnPtr & column,
|
||||||
const std::shared_ptr<const IDataType> & column_type,
|
const DataTypePtr & column_type,
|
||||||
const PaddedPODArray<UInt8> * null_bytemap,
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
arrow::ArrayBuilder * array_builder,
|
arrow::ArrayBuilder * array_builder,
|
||||||
String format_name,
|
String format_name,
|
||||||
@ -200,7 +207,7 @@ namespace DB
|
|||||||
static void fillArrowArrayWithArrayColumnData(
|
static void fillArrowArrayWithArrayColumnData(
|
||||||
const String & column_name,
|
const String & column_name,
|
||||||
ColumnPtr & column,
|
ColumnPtr & column,
|
||||||
const std::shared_ptr<const IDataType> & column_type,
|
const DataTypePtr & column_type,
|
||||||
const PaddedPODArray<UInt8> * null_bytemap,
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
arrow::ArrayBuilder * array_builder,
|
arrow::ArrayBuilder * array_builder,
|
||||||
String format_name,
|
String format_name,
|
||||||
@ -231,7 +238,7 @@ namespace DB
|
|||||||
static void fillArrowArrayWithTupleColumnData(
|
static void fillArrowArrayWithTupleColumnData(
|
||||||
const String & column_name,
|
const String & column_name,
|
||||||
ColumnPtr & column,
|
ColumnPtr & column,
|
||||||
const std::shared_ptr<const IDataType> & column_type,
|
const DataTypePtr & column_type,
|
||||||
const PaddedPODArray<UInt8> * null_bytemap,
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
arrow::ArrayBuilder * array_builder,
|
arrow::ArrayBuilder * array_builder,
|
||||||
String format_name,
|
String format_name,
|
||||||
@ -303,7 +310,7 @@ namespace DB
|
|||||||
static void fillArrowArrayWithLowCardinalityColumnDataImpl(
|
static void fillArrowArrayWithLowCardinalityColumnDataImpl(
|
||||||
const String & column_name,
|
const String & column_name,
|
||||||
ColumnPtr & column,
|
ColumnPtr & column,
|
||||||
const std::shared_ptr<const IDataType> & column_type,
|
const DataTypePtr & column_type,
|
||||||
const PaddedPODArray<UInt8> *,
|
const PaddedPODArray<UInt8> *,
|
||||||
arrow::ArrayBuilder * array_builder,
|
arrow::ArrayBuilder * array_builder,
|
||||||
String format_name,
|
String format_name,
|
||||||
@ -359,7 +366,7 @@ namespace DB
|
|||||||
static void fillArrowArrayWithLowCardinalityColumnData(
|
static void fillArrowArrayWithLowCardinalityColumnData(
|
||||||
const String & column_name,
|
const String & column_name,
|
||||||
ColumnPtr & column,
|
ColumnPtr & column,
|
||||||
const std::shared_ptr<const IDataType> & column_type,
|
const DataTypePtr & column_type,
|
||||||
const PaddedPODArray<UInt8> * null_bytemap,
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
arrow::ArrayBuilder * array_builder,
|
arrow::ArrayBuilder * array_builder,
|
||||||
String format_name,
|
String format_name,
|
||||||
@ -541,134 +548,6 @@ namespace DB
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void fillArrowArray(
|
|
||||||
const String & column_name,
|
|
||||||
ColumnPtr & column,
|
|
||||||
const std::shared_ptr<const IDataType> & column_type,
|
|
||||||
const PaddedPODArray<UInt8> * null_bytemap,
|
|
||||||
arrow::ArrayBuilder * array_builder,
|
|
||||||
String format_name,
|
|
||||||
size_t start,
|
|
||||||
size_t end,
|
|
||||||
bool output_string_as_string,
|
|
||||||
bool output_fixed_string_as_fixed_byte_array,
|
|
||||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
|
||||||
{
|
|
||||||
const String column_type_name = column_type->getFamilyName();
|
|
||||||
|
|
||||||
if (column_type->isNullable())
|
|
||||||
{
|
|
||||||
const ColumnNullable * column_nullable = assert_cast<const ColumnNullable *>(column.get());
|
|
||||||
ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
|
|
||||||
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
|
||||||
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
|
|
||||||
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
|
|
||||||
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
|
||||||
}
|
|
||||||
else if (isString(column_type))
|
|
||||||
{
|
|
||||||
if (output_string_as_string)
|
|
||||||
fillArrowArrayWithStringColumnData<ColumnString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
else
|
|
||||||
fillArrowArrayWithStringColumnData<ColumnString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isFixedString(column_type))
|
|
||||||
{
|
|
||||||
if (output_fixed_string_as_fixed_byte_array)
|
|
||||||
fillArrowArrayWithFixedStringColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
else if (output_string_as_string)
|
|
||||||
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
else
|
|
||||||
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isIPv6(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithIPv6ColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isIPv4(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithIPv4ColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isDate(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithDateColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isDateTime(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isDate32(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithDate32ColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isArray(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
|
||||||
}
|
|
||||||
else if (isTuple(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
|
||||||
}
|
|
||||||
else if (column_type->getTypeId() == TypeIndex::LowCardinality)
|
|
||||||
{
|
|
||||||
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
|
||||||
}
|
|
||||||
else if (isMap(column_type))
|
|
||||||
{
|
|
||||||
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
|
|
||||||
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
|
|
||||||
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
|
||||||
}
|
|
||||||
else if (isDecimal(column_type))
|
|
||||||
{
|
|
||||||
auto fill_decimal = [&](const auto & types) -> bool
|
|
||||||
{
|
|
||||||
using Types = std::decay_t<decltype(types)>;
|
|
||||||
using ToDataType = typename Types::LeftType;
|
|
||||||
if constexpr (
|
|
||||||
std::is_same_v<ToDataType,DataTypeDecimal<Decimal32>>
|
|
||||||
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
|
|
||||||
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
|
|
||||||
{
|
|
||||||
fillArrowArrayWithDecimalColumnData<ToDataType, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if constexpr (std::is_same_v<ToDataType,DataTypeDecimal<Decimal256>>)
|
|
||||||
{
|
|
||||||
fillArrowArrayWithDecimalColumnData<ToDataType, Int256, arrow::Decimal256, arrow::Decimal256Builder>(column, null_bytemap, array_builder, format_name, start, end);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
};
|
|
||||||
|
|
||||||
if (!callOnIndexAndDataType<void>(column_type->getTypeId(), fill_decimal))
|
|
||||||
throw Exception{ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with decimal data with type {}", column_type_name};
|
|
||||||
}
|
|
||||||
else if (isDateTime64(column_type))
|
|
||||||
{
|
|
||||||
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
|
|
||||||
fillArrowArrayWithDateTime64ColumnData(datetime64_type, column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
else if (isBool(column_type))
|
|
||||||
{
|
|
||||||
fillArrowArrayWithBoolColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
|
||||||
}
|
|
||||||
#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
|
|
||||||
else if (#CPP_NUMERIC_TYPE == column_type_name) \
|
|
||||||
{ \
|
|
||||||
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(column, null_bytemap, format_name, array_builder, start, end); \
|
|
||||||
}
|
|
||||||
|
|
||||||
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
|
|
||||||
#undef DISPATCH
|
|
||||||
else
|
|
||||||
{
|
|
||||||
throw Exception(ErrorCodes::UNKNOWN_TYPE,
|
|
||||||
"Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type_name, column_name, format_name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename DataType, typename FieldType, typename ArrowDecimalType, typename ArrowBuilder>
|
template <typename DataType, typename FieldType, typename ArrowDecimalType, typename ArrowBuilder>
|
||||||
static void fillArrowArrayWithDecimalColumnData(
|
static void fillArrowArrayWithDecimalColumnData(
|
||||||
ColumnPtr write_column,
|
ColumnPtr write_column,
|
||||||
@ -697,6 +576,158 @@ namespace DB
|
|||||||
checkStatus(status, write_column->getName(), format_name);
|
checkStatus(status, write_column->getName(), format_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename ColumnType>
|
||||||
|
static void fillArrowArrayWithBigIntegerColumnData(
|
||||||
|
ColumnPtr write_column,
|
||||||
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
|
const String & format_name,
|
||||||
|
arrow::ArrayBuilder* array_builder,
|
||||||
|
size_t start,
|
||||||
|
size_t end)
|
||||||
|
{
|
||||||
|
const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
|
||||||
|
const auto & internal_data = internal_column.getData();
|
||||||
|
size_t fixed_length = sizeof(typename ColumnType::ValueType);
|
||||||
|
arrow::FixedSizeBinaryBuilder & builder = assert_cast<arrow::FixedSizeBinaryBuilder &>(*array_builder);
|
||||||
|
arrow::Status status;
|
||||||
|
|
||||||
|
PaddedPODArray<UInt8> arrow_null_bytemap = revertNullByteMap(null_bytemap, start, end);
|
||||||
|
const UInt8 * arrow_null_bytemap_raw_ptr = arrow_null_bytemap.empty() ? nullptr : arrow_null_bytemap.data();
|
||||||
|
|
||||||
|
const uint8_t * data_start = reinterpret_cast<const uint8_t *>(internal_data.data()) + start * fixed_length;
|
||||||
|
status = builder.AppendValues(data_start, end - start, reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
|
||||||
|
checkStatus(status, write_column->getName(), format_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void fillArrowArray(
|
||||||
|
const String & column_name,
|
||||||
|
ColumnPtr & column,
|
||||||
|
const DataTypePtr & column_type,
|
||||||
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
|
arrow::ArrayBuilder * array_builder,
|
||||||
|
String format_name,
|
||||||
|
size_t start,
|
||||||
|
size_t end,
|
||||||
|
bool output_string_as_string,
|
||||||
|
bool output_fixed_string_as_fixed_byte_array,
|
||||||
|
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||||
|
{
|
||||||
|
const String column_type_name = column_type->getFamilyName();
|
||||||
|
WhichDataType which(column_type);
|
||||||
|
|
||||||
|
switch (column_type->getTypeId())
|
||||||
|
{
|
||||||
|
case TypeIndex::Nullable:
|
||||||
|
{
|
||||||
|
const ColumnNullable * column_nullable = assert_cast<const ColumnNullable *>(column.get());
|
||||||
|
ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
|
||||||
|
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
||||||
|
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
|
||||||
|
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
|
||||||
|
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::String:
|
||||||
|
{
|
||||||
|
if (output_string_as_string)
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
else
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::FixedString:
|
||||||
|
{
|
||||||
|
if (output_fixed_string_as_fixed_byte_array)
|
||||||
|
fillArrowArrayWithFixedStringColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
else if (output_string_as_string)
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
else
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::IPv6:
|
||||||
|
fillArrowArrayWithIPv6ColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::IPv4:
|
||||||
|
fillArrowArrayWithIPv4ColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Date:
|
||||||
|
fillArrowArrayWithDateColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::DateTime:
|
||||||
|
fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Date32:
|
||||||
|
fillArrowArrayWithDate32ColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Array:
|
||||||
|
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Tuple:
|
||||||
|
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
||||||
|
break;
|
||||||
|
case TypeIndex::LowCardinality:
|
||||||
|
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Map:
|
||||||
|
{
|
||||||
|
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
|
||||||
|
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
|
||||||
|
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, output_fixed_string_as_fixed_byte_array, dictionary_values);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::Decimal32:
|
||||||
|
fillArrowArrayWithDecimalColumnData<DataTypeDecimal32, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Decimal64:
|
||||||
|
fillArrowArrayWithDecimalColumnData<DataTypeDecimal64, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Decimal128:
|
||||||
|
fillArrowArrayWithDecimalColumnData<DataTypeDecimal128, Int128, arrow::Decimal128, arrow::Decimal128Builder>(column, null_bytemap, array_builder, format_name, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Decimal256:
|
||||||
|
fillArrowArrayWithDecimalColumnData<DataTypeDecimal256, Int256, arrow::Decimal256, arrow::Decimal256Builder>(column, null_bytemap, array_builder, format_name, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::DateTime64:
|
||||||
|
fillArrowArrayWithDateTime64ColumnData(column_type, column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::UInt8:
|
||||||
|
{
|
||||||
|
if (isBool(column_type))
|
||||||
|
fillArrowArrayWithBoolColumnData(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
else
|
||||||
|
fillArrowArrayWithNumericColumnData<UInt8, arrow::UInt8Builder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::Enum8:
|
||||||
|
fillArrowArrayWithNumericColumnData<Int8, arrow::Int8Builder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Enum16:
|
||||||
|
fillArrowArrayWithNumericColumnData<Int16, arrow::Int16Builder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Int128:
|
||||||
|
fillArrowArrayWithBigIntegerColumnData<ColumnInt128>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::UInt128:
|
||||||
|
fillArrowArrayWithBigIntegerColumnData<ColumnUInt128>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::Int256:
|
||||||
|
fillArrowArrayWithBigIntegerColumnData<ColumnInt256>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
case TypeIndex::UInt256:
|
||||||
|
fillArrowArrayWithBigIntegerColumnData<ColumnUInt256>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
break;
|
||||||
|
#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
|
||||||
|
case TypeIndex::CPP_NUMERIC_TYPE: \
|
||||||
|
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(column, null_bytemap, format_name, array_builder, start, end); \
|
||||||
|
break;
|
||||||
|
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
|
||||||
|
#undef DISPATCH
|
||||||
|
default:
|
||||||
|
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type_name, column_name, format_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static std::shared_ptr<arrow::DataType> getArrowTypeForLowCardinalityIndexes(ColumnPtr indexes_column)
|
static std::shared_ptr<arrow::DataType> getArrowTypeForLowCardinalityIndexes(ColumnPtr indexes_column)
|
||||||
{
|
{
|
||||||
/// Arrow docs recommend preferring signed integers over unsigned integers for representing dictionary indices.
|
/// Arrow docs recommend preferring signed integers over unsigned integers for representing dictionary indices.
|
||||||
|
@ -93,10 +93,12 @@ std::unique_ptr<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr &
|
|||||||
return orc::createPrimitiveType(orc::TypeKind::BOOLEAN);
|
return orc::createPrimitiveType(orc::TypeKind::BOOLEAN);
|
||||||
return orc::createPrimitiveType(orc::TypeKind::BYTE);
|
return orc::createPrimitiveType(orc::TypeKind::BYTE);
|
||||||
}
|
}
|
||||||
|
case TypeIndex::Enum8: [[fallthrough]];
|
||||||
case TypeIndex::Int8:
|
case TypeIndex::Int8:
|
||||||
{
|
{
|
||||||
return orc::createPrimitiveType(orc::TypeKind::BYTE);
|
return orc::createPrimitiveType(orc::TypeKind::BYTE);
|
||||||
}
|
}
|
||||||
|
case TypeIndex::Enum16: [[fallthrough]];
|
||||||
case TypeIndex::UInt16: [[fallthrough]];
|
case TypeIndex::UInt16: [[fallthrough]];
|
||||||
case TypeIndex::Int16:
|
case TypeIndex::Int16:
|
||||||
{
|
{
|
||||||
@ -131,6 +133,12 @@ std::unique_ptr<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr &
|
|||||||
{
|
{
|
||||||
return orc::createPrimitiveType(orc::TypeKind::TIMESTAMP);
|
return orc::createPrimitiveType(orc::TypeKind::TIMESTAMP);
|
||||||
}
|
}
|
||||||
|
case TypeIndex::Int128: [[fallthrough]];
|
||||||
|
case TypeIndex::UInt128: [[fallthrough]];
|
||||||
|
case TypeIndex::Int256: [[fallthrough]];
|
||||||
|
case TypeIndex::UInt256: [[fallthrough]];
|
||||||
|
case TypeIndex::Decimal256:
|
||||||
|
return orc::createPrimitiveType(orc::TypeKind::BINARY);
|
||||||
case TypeIndex::FixedString: [[fallthrough]];
|
case TypeIndex::FixedString: [[fallthrough]];
|
||||||
case TypeIndex::String:
|
case TypeIndex::String:
|
||||||
{
|
{
|
||||||
@ -309,6 +317,7 @@ void ORCBlockOutputFormat::writeColumn(
|
|||||||
|
|
||||||
switch (type->getTypeId())
|
switch (type->getTypeId())
|
||||||
{
|
{
|
||||||
|
case TypeIndex::Enum8: [[fallthrough]];
|
||||||
case TypeIndex::Int8:
|
case TypeIndex::Int8:
|
||||||
{
|
{
|
||||||
/// Note: Explicit cast to avoid clang-tidy error: 'signed char' to 'long' conversion; consider casting to 'unsigned char' first.
|
/// Note: Explicit cast to avoid clang-tidy error: 'signed char' to 'long' conversion; consider casting to 'unsigned char' first.
|
||||||
@ -320,6 +329,7 @@ void ORCBlockOutputFormat::writeColumn(
|
|||||||
writeNumbers<UInt8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt8 & value){ return value; });
|
writeNumbers<UInt8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt8 & value){ return value; });
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case TypeIndex::Enum16: [[fallthrough]];
|
||||||
case TypeIndex::Int16:
|
case TypeIndex::Int16:
|
||||||
{
|
{
|
||||||
writeNumbers<Int16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int16 & value){ return value; });
|
writeNumbers<Int16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int16 & value){ return value; });
|
||||||
@ -357,6 +367,26 @@ void ORCBlockOutputFormat::writeColumn(
|
|||||||
writeNumbers<UInt64,orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt64 & value){ return value; });
|
writeNumbers<UInt64,orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt64 & value){ return value; });
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case TypeIndex::Int128:
|
||||||
|
{
|
||||||
|
writeStrings<ColumnInt128>(orc_column, column, null_bytemap);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::UInt128:
|
||||||
|
{
|
||||||
|
writeStrings<ColumnUInt128>(orc_column, column, null_bytemap);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::Int256:
|
||||||
|
{
|
||||||
|
writeStrings<ColumnInt256>(orc_column, column, null_bytemap);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case TypeIndex::UInt256:
|
||||||
|
{
|
||||||
|
writeStrings<ColumnUInt256>(orc_column, column, null_bytemap);
|
||||||
|
break;
|
||||||
|
}
|
||||||
case TypeIndex::Float32:
|
case TypeIndex::Float32:
|
||||||
{
|
{
|
||||||
writeNumbers<Float32, orc::DoubleVectorBatch>(orc_column, column, null_bytemap, [](const Float32 & value){ return value; });
|
writeNumbers<Float32, orc::DoubleVectorBatch>(orc_column, column, null_bytemap, [](const Float32 & value){ return value; });
|
||||||
@ -432,6 +462,11 @@ void ORCBlockOutputFormat::writeColumn(
|
|||||||
[](Int128 value){ return orc::Int128(value >> 64, (value << 64) >> 64); });
|
[](Int128 value){ return orc::Int128(value >> 64, (value << 64) >> 64); });
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case TypeIndex::Decimal256:
|
||||||
|
{
|
||||||
|
writeStrings<ColumnDecimal<Decimal256>>(orc_column, column, null_bytemap);
|
||||||
|
break;
|
||||||
|
}
|
||||||
case TypeIndex::Nullable:
|
case TypeIndex::Nullable:
|
||||||
{
|
{
|
||||||
const auto & nullable_column = assert_cast<const ColumnNullable &>(column);
|
const auto & nullable_column = assert_cast<const ColumnNullable &>(column);
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
42 42 42 42 a b
|
||||||
|
42 42 42 42 a b
|
||||||
|
42 42 42 42 a b 42.42 0.0.0.0
|
||||||
|
\N
|
||||||
|
\N
|
17
tests/queries/0_stateless/02595_orc_arrow_parquet_more_types.sh
Executable file
17
tests/queries/0_stateless/02595_orc_arrow_parquet_more_types.sh
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/usr/bin/env bash
# Tags: no-fasttest

# Writes a one-row SELECT in a columnar format with one clickhouse-local process
# and reads it back with a second one using an explicit --structure.
# Covers Int128/UInt128/Int256/UInt256 and Enum8/Enum16 for Parquet, Arrow and ORC,
# plus Decimal256 and IPv4 for ORC, and NULLs of Nullable(IPv6) / Nullable(UInt256) for ORC.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# round_trip FORMAT SELECT_LIST STRUCTURE
#   Runs "select SELECT_LIST format FORMAT" and pipes the result into a second
#   clickhouse-local that parses it as FORMAT with the given STRUCTURE and prints all rows.
#   $CLICKHOUSE_LOCAL is left unquoted on purpose: it is defined by shell_config.sh,
#   which is not shown here and may expand to more than one word.
round_trip()
{
    local format=$1
    local select_list=$2
    local structure=$3

    $CLICKHOUSE_LOCAL -q "select ${select_list} format ${format}" \
        | $CLICKHOUSE_LOCAL --input-format "${format}" --structure="${structure}" -q "select * from table"
}

big_ints_and_enums_select="42::Int128 as c1, 42::UInt128 as c2, 42::Int256 as c3, 42::UInt256 as c4, 'a'::Enum8('a' = 1) as c5, 'b'::Enum16('b' = 1) as c6"
big_ints_and_enums_structure="c1 Int128, c2 UInt128, c3 Int256, c4 UInt256, c5 Enum8('a' = 1), c6 Enum16('b' = 1)"

for format in Parquet Arrow
do
    round_trip "${format}" "${big_ints_and_enums_select}" "${big_ints_and_enums_structure}"
done

# ORC gets the same columns plus Decimal256 and IPv4.
round_trip ORC \
    "${big_ints_and_enums_select}, 42.42::Decimal256(2) as c7, '0.0.0.0'::IPv4 as c8" \
    "${big_ints_and_enums_structure}, c7 Decimal256(2), c8 IPv4"

# NULL values of two Nullable types through ORC.
for nullable_type in "Nullable(IPv6)" "Nullable(UInt256)"
do
    round_trip ORC "NULL::${nullable_type} as x" "x ${nullable_type}"
done
|
||||||
|
|
Loading…
Reference in New Issue
Block a user