Merge branch 'master' into standalone_keeper

alesapin 2021-05-15 17:15:06 +03:00
commit 83627de2b8
90 changed files with 1442 additions and 334 deletions

.gitmodules vendored

@@ -17,6 +17,7 @@
[submodule "contrib/zlib-ng"]
    path = contrib/zlib-ng
    url = https://github.com/ClickHouse-Extras/zlib-ng.git
+   branch = clickhouse-new
[submodule "contrib/googletest"]
    path = contrib/googletest
    url = https://github.com/google/googletest.git


@@ -51,12 +51,22 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
    /// Use extended interface of Channel for more comprehensive logging.
    split = new DB::OwnSplitChannel();
-   auto log_level = config.getString("logger.level", "trace");
+   auto log_level_string = config.getString("logger.level", "trace");
+   /// different channels (log, console, syslog) may have different loglevels configured
+   /// The maximum (the most verbose) of those will be used as default for Poco loggers
+   int max_log_level = 0;
    const auto log_path = config.getString("logger.log", "");
    if (!log_path.empty())
    {
        createDirectory(log_path);
-       std::cerr << "Logging " << log_level << " to " << log_path << std::endl;
+       std::cerr << "Logging " << log_level_string << " to " << log_path << std::endl;
+       auto log_level = Poco::Logger::parseLevel(log_level_string);
+       if (log_level > max_log_level)
+       {
+           max_log_level = log_level;
+       }
        // Set up two channel chains.
        log_file = new Poco::FileChannel;
@@ -72,6 +82,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
        Poco::AutoPtr<OwnPatternFormatter> pf = new OwnPatternFormatter;
        Poco::AutoPtr<DB::OwnFormattingChannel> log = new DB::OwnFormattingChannel(pf, log_file);
+       log->setLevel(log_level);
        split->addChannel(log);
    }
@@ -79,6 +90,15 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
    if (!errorlog_path.empty())
    {
        createDirectory(errorlog_path);
+       // NOTE: we don't use notice & critical in the code, so in practice error log collects fatal & error & warning.
+       // (!) Warnings are important, they require attention and should never be silenced / ignored.
+       auto errorlog_level = Poco::Logger::parseLevel(config.getString("logger.errorlog_level", "notice"));
+       if (errorlog_level > max_log_level)
+       {
+           max_log_level = errorlog_level;
+       }
        std::cerr << "Logging errors to " << errorlog_path << std::endl;
        error_log_file = new Poco::FileChannel;
@@ -93,7 +113,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
        Poco::AutoPtr<OwnPatternFormatter> pf = new OwnPatternFormatter;
        Poco::AutoPtr<DB::OwnFormattingChannel> errorlog = new DB::OwnFormattingChannel(pf, error_log_file);
-       errorlog->setLevel(Poco::Message::PRIO_NOTICE);
+       errorlog->setLevel(errorlog_level);
        errorlog->open();
        split->addChannel(errorlog);
    }
@@ -101,6 +121,11 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
    if (config.getBool("logger.use_syslog", false))
    {
        //const std::string & cmd_name = commandName();
+       auto syslog_level = Poco::Logger::parseLevel(config.getString("logger.syslog_level", log_level_string));
+       if (syslog_level > max_log_level)
+       {
+           max_log_level = syslog_level;
+       }
        if (config.has("logger.syslog.address"))
        {
@@ -127,6 +152,8 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
        Poco::AutoPtr<OwnPatternFormatter> pf = new OwnPatternFormatter;
        Poco::AutoPtr<DB::OwnFormattingChannel> log = new DB::OwnFormattingChannel(pf, syslog_channel);
+       log->setLevel(syslog_level);
        split->addChannel(log);
    }
@@ -138,9 +165,17 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
    {
        bool color_enabled = config.getBool("logger.color_terminal", color_logs_by_default);
+       auto console_log_level_string = config.getString("logger.console_log_level", log_level_string);
+       auto console_log_level = Poco::Logger::parseLevel(console_log_level_string);
+       if (console_log_level > max_log_level)
+       {
+           max_log_level = console_log_level;
+       }
        Poco::AutoPtr<OwnPatternFormatter> pf = new OwnPatternFormatter(color_enabled);
        Poco::AutoPtr<DB::OwnFormattingChannel> log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel);
-       logger.warning("Logging " + log_level + " to console");
+       logger.warning("Logging " + console_log_level_string + " to console");
+       log->setLevel(console_log_level);
        split->addChannel(log);
    }
@@ -149,17 +184,17 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
    logger.setChannel(split);
    // Global logging level (it can be overridden for specific loggers).
-   logger.setLevel(log_level);
+   logger.setLevel(max_log_level);
    // Set level to all already created loggers
    std::vector<std::string> names;
    //logger_root = Logger::root();
    logger.root().names(names);
    for (const auto & name : names)
-       logger.root().get(name).setLevel(log_level);
+       logger.root().get(name).setLevel(max_log_level);
    // Attach to the root logger.
-   logger.root().setLevel(log_level);
+   logger.root().setLevel(max_log_level);
    logger.root().setChannel(logger.getChannel());
    // Explicitly specified log levels for specific loggers.
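The net effect of this hunk: each channel (file log, error log, syslog, console) parses its own configured level, and the most verbose of them becomes the level of the Poco root logger, while per-channel filtering is done by the channels themselves. A minimal standalone sketch of that aggregation rule; the level table mirrors Poco's `Message::Priority` numbering (larger means more verbose), and any config keys beyond the ones in the hunk are illustrative:

```cpp
#include <algorithm>
#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for Poco::Logger::parseLevel: maps level names to
// Poco's numeric priorities, where a larger number means more verbose.
static int parseLevel(const std::string & name)
{
    static const std::map<std::string, int> levels = {
        {"fatal", 1}, {"critical", 2}, {"error", 3}, {"warning", 4},
        {"notice", 5}, {"information", 6}, {"debug", 7}, {"trace", 8}};
    return levels.at(name);
}

int main()
{
    // Per-channel levels as they might come from the config.
    const std::string log_level_string = "information";  // logger.level
    const std::string errorlog_level_string = "notice";  // logger.errorlog_level
    const std::string console_level_string = "trace";    // logger.console_log_level

    // The most verbose (numerically largest) level becomes the root level,
    // so no channel is starved of messages it is configured to show.
    int max_log_level = 0;
    for (const auto & s : {log_level_string, errorlog_level_string, console_level_string})
        max_log_level = std::max(max_log_level, parseLevel(s));

    std::cout << "root logger level: " << max_log_level << '\n'; // 8 (trace)
}
```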


@@ -22,6 +22,9 @@ public:
    void setLevel(Poco::Message::Priority priority_) { priority = priority_; }
+   // Poco::Logger::parseLevel returns ints
+   void setLevel(int level) { priority = static_cast<Poco::Message::Priority>(level); }
    void open() override
    {
        if (pChannel)
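The new overload exists because `Poco::Logger::parseLevel` returns an `int`, so call sites can pass its result without a cast. A hypothetical minimal reproduction of the pattern, assuming Poco headers are available:

```cpp
#include <Poco/Logger.h>
#include <Poco/Message.h>

// Hypothetical channel mirroring the overload pair above: the int overload
// lets call sites pass Poco::Logger::parseLevel() results directly.
struct LevelledChannel
{
    Poco::Message::Priority priority = Poco::Message::PRIO_TRACE;

    void setLevel(Poco::Message::Priority priority_) { priority = priority_; }
    // Poco::Logger::parseLevel returns int, hence this convenience overload.
    void setLevel(int level) { priority = static_cast<Poco::Message::Priority>(level); }
};

int main()
{
    LevelledChannel channel;
    channel.setLevel(Poco::Logger::parseLevel("warning")); // no explicit cast needed
}
```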

contrib/boringssl vendored

@@ -1 +1 @@
-Subproject commit 83c1cda8a0224dc817cbad2966c7ed4acc35f02a
+Subproject commit a6a2e2ab3e44d97ce98e51c558e989f211de7eb3

contrib/cppkafka vendored

@@ -1 +1 @@
-Subproject commit b06e64ef5bffd636d918a742c689f69130c1dbab
+Subproject commit 57a599d99c540e647bcd0eb9ea77c523cca011b3

contrib/grpc vendored

@@ -1 +1 @@
-Subproject commit 1085a941238e66b13e3fb89c310533745380acbc
+Subproject commit 5b79aae85c515e0df4abfb7b1e07975fdc7cecc1

contrib/poco vendored

@@ -1 +1 @@
-Subproject commit b7d9ec16ee33ca76643d5fcd907ea9a33285640a
+Subproject commit 5994506908028612869fee627d68d8212dfe7c1e

contrib/rocksdb vendored

@@ -1 +1 @@
-Subproject commit 54a0decabbcf4c0bb5cf7befa9c597f28289bff5
+Subproject commit 07c77549a20b63ff6981b400085eba36bb5c80c4

contrib/zlib-ng vendored

@@ -1 +1 @@
-Subproject commit 5cc4d232020dc66d1d6c5438834457e2a2f6127b
+Subproject commit db232d30b4c72fd58e6d7eae2d12cebf9c3d90db

contrib/zstd vendored

@@ -1 +1 @@
-Subproject commit 10f0e6993f9d2f682da6d04aa2385b7d53cbb4ee
+Subproject commit a488ba114ec17ea1054b9057c26a046fc122b3b6


@@ -66,6 +66,7 @@ SET(Sources
    "${LIBRARY_DIR}/compress/zstd_compress.c"
    "${LIBRARY_DIR}/compress/zstd_compress_literals.c"
    "${LIBRARY_DIR}/compress/zstd_compress_sequences.c"
+   "${LIBRARY_DIR}/compress/zstd_compress_superblock.c"
    "${LIBRARY_DIR}/compress/zstd_double_fast.c"
    "${LIBRARY_DIR}/compress/zstd_fast.c"
    "${LIBRARY_DIR}/compress/zstd_lazy.c"
@@ -95,16 +96,19 @@ SET(Headers
    "${LIBRARY_DIR}/common/pool.h"
    "${LIBRARY_DIR}/common/threading.h"
    "${LIBRARY_DIR}/common/xxhash.h"
-   "${LIBRARY_DIR}/common/zstd_errors.h"
+   "${LIBRARY_DIR}/common/zstd_deps.h"
    "${LIBRARY_DIR}/common/zstd_internal.h"
+   "${LIBRARY_DIR}/common/zstd_trace.h"
    "${LIBRARY_DIR}/compress/hist.h"
    "${LIBRARY_DIR}/compress/zstd_compress_internal.h"
    "${LIBRARY_DIR}/compress/zstd_compress_literals.h"
    "${LIBRARY_DIR}/compress/zstd_compress_sequences.h"
+   "${LIBRARY_DIR}/compress/zstd_compress_superblock.h"
    "${LIBRARY_DIR}/compress/zstd_cwksp.h"
    "${LIBRARY_DIR}/compress/zstd_double_fast.h"
    "${LIBRARY_DIR}/compress/zstd_fast.h"
    "${LIBRARY_DIR}/compress/zstd_lazy.h"
+   "${LIBRARY_DIR}/compress/zstd_ldm_geartab.h"
    "${LIBRARY_DIR}/compress/zstd_ldm.h"
    "${LIBRARY_DIR}/compress/zstdmt_compress.h"
    "${LIBRARY_DIR}/compress/zstd_opt.h"
@@ -113,7 +117,8 @@ SET(Headers
    "${LIBRARY_DIR}/decompress/zstd_decompress_internal.h"
    "${LIBRARY_DIR}/dictBuilder/cover.h"
    "${LIBRARY_DIR}/dictBuilder/divsufsort.h"
-   "${LIBRARY_DIR}/dictBuilder/zdict.h"
+   "${LIBRARY_DIR}/zdict.h"
+   "${LIBRARY_DIR}/zstd_errors.h"
    "${LIBRARY_DIR}/zstd.h")
SET(ZSTD_LEGACY_SUPPORT true)


@@ -139,8 +139,8 @@ The following settings can be specified in configuration file for given endpoint
- `endpoint` — Specifies prefix of an endpoint. Mandatory.
- `access_key_id` and `secret_access_key` — Specifies credentials to use with given endpoint. Optional.
+- `use_environment_credentials` — If set to `true`, S3 client will try to obtain credentials from environment variables and [Amazon EC2](https://en.wikipedia.org/wiki/Amazon_Elastic_Compute_Cloud) metadata for given endpoint. Optional, default value is `false`.
- `region` — Specifies S3 region name. Optional.
-- `use_environment_credentials` — If set to `true`, S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for given endpoint. Optional, default value is `false`.
- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Optional, default value is `false`.
- `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times.
- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional.


@@ -104,7 +104,7 @@ For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse builds
After downloading, you can use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data.
-Run `sudo ./clickhouse install` if you want to install clickhouse system-wide (also with needed condiguration files, configuring users etc.). After that run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it.
+Run `sudo ./clickhouse install` if you want to install clickhouse system-wide (also with needed configuration files, configuring users etc.). After that run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it.
These builds are not recommended for use in production environments because they are less thoroughly tested, but you can do so at your own risk. They also have only a subset of ClickHouse features available.


@@ -0,0 +1,98 @@
---
toc_priority: 65
toc_title: clickhouse-format
---
# clickhouse-format {#clickhouse-format}
Allows formatting input queries.
Keys:
- `--help` or `-h` — Produce help message.
- `--hilite` — Add syntax highlight with ANSI terminal escape sequences.
- `--oneline` — Format in single line.
- `--quiet` or `-q` — Just check syntax, no output on success.
- `--multiquery` or `-n` — Allow multiple queries in the same file.
- `--obfuscate` — Obfuscate instead of formatting.
- `--seed <string>` — Sets the seed (an arbitrary string) that determines the result of obfuscation.
- `--backslash` — Add a backslash at the end of each line of the formatted query. Can be useful when you copy a query from web or somewhere else with multiple lines, and want to execute it in command line.
## Examples {#examples}
1. Highlighting and single line:
```bash
$ clickhouse-format --oneline --hilite <<< "SELECT sum(number) FROM numbers(5);"
```
Result:
```sql
SELECT sum(number) FROM numbers(5)
```
2. Multiqueries:
```bash
$ clickhouse-format -n <<< "SELECT * FROM (SELECT 1 AS x UNION ALL SELECT 1 UNION DISTINCT SELECT 3);"
```
Result:
```text
SELECT *
FROM
(
SELECT 1 AS x
UNION ALL
SELECT 1
UNION DISTINCT
SELECT 3
)
;
```
3. Obfuscating:
```bash
$ clickhouse-format --seed Hello --obfuscate <<< "SELECT cost_first_screen BETWEEN a AND b, CASE WHEN x >= 123 THEN y ELSE NULL END;"
```
Result:
```text
SELECT treasury_mammoth_hazelnut BETWEEN nutmeg AND span, CASE WHEN chive >= 116 THEN switching ELSE ANYTHING END;
```
Same query and another seed string:
```bash
$ clickhouse-format --seed World --obfuscate <<< "SELECT cost_first_screen BETWEEN a AND b, CASE WHEN x >= 123 THEN y ELSE NULL END;"
```
Result:
```text
SELECT horse_tape_summer BETWEEN folklore AND moccasins, CASE WHEN intestine >= 116 THEN nonconformist ELSE FORESTRY END;
```
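As the two runs show, obfuscation is deterministic: for a fixed seed, the same identifier always maps to the same replacement, while query structure, keywords, and operators are preserved. A toy sketch of seed-keyed deterministic renaming (a hash-based illustration, not the actual obfuscator algorithm; word list is made up):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Toy illustration: pick a replacement word deterministically from (seed, identifier).
std::string obfuscateIdentifier(const std::string & seed, const std::string & ident)
{
    static const std::vector<std::string> words
        = {"nutmeg", "span", "chive", "folklore", "moccasins", "switching"};
    const size_t h = std::hash<std::string>{}(seed + '#' + ident);
    return words[h % words.size()];
}

int main()
{
    // Same (seed, identifier) pair always yields the same word; a new seed remaps everything.
    std::cout << obfuscateIdentifier("Hello", "cost_first_screen") << '\n';
    std::cout << obfuscateIdentifier("World", "cost_first_screen") << '\n';
}
```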
4. Adding backslash:
```bash
$ clickhouse-format --backslash <<< "SELECT * FROM (SELECT 1 AS x UNION ALL SELECT 1 UNION DISTINCT SELECT 3);"
```
Result:
```text
SELECT * \
FROM \
( \
SELECT 1 AS x \
UNION ALL \
SELECT 1 \
UNION DISTINCT \
SELECT 3 \
)
```


@@ -9,5 +9,8 @@ toc_title: Overview
- [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without stopping the ClickHouse server, similar to how `awk` does this.
- [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster.
- [clickhouse-benchmark](../../operations/utilities/clickhouse-benchmark.md) — Loads server with the custom queries and settings.
+- [clickhouse-format](../../operations/utilities/clickhouse-format.md) — Enables formatting input queries.
+- [ClickHouse obfuscator](../../operations/utilities/clickhouse-obfuscator.md) — Obfuscates data.
+- [ClickHouse compressor](../../operations/utilities/clickhouse-compressor.md) — Compresses and decompresses data.
+- [clickhouse-odbc-bridge](../../operations/utilities/odbc-bridge.md) — A proxy server for ODBC driver.
[Original article](https://clickhouse.tech/docs/en/operations/utils/) <!--hide-->


@@ -6,9 +6,8 @@ toc_priority: 141
Sums the arithmetic difference between consecutive rows. If the difference is negative, it is ignored.
-Note that the underlying data must be sorted in order for this function to work properly.
-If you would like to use this function in a materialized view, you most likely want to use the
-[deltaSumTimestamp](deltasumtimestamp.md) method instead.
+!!! info "Note"
+    The underlying data must be sorted for this function to work properly. If you would like to use this function in a [materialized view](../../../sql-reference/statements/create/view.md#materialized), you most likely want to use the [deltaSumTimestamp](../../../sql-reference/aggregate-functions/reference/deltasumtimestamp.md#agg_functions-deltasumtimestamp) method instead.
**Syntax**


@@ -2,38 +2,42 @@
toc_priority: 141
---
-# deltaSumTimestamp {#agg_functions-deltasum}
+# deltaSumTimestamp {#agg_functions-deltasumtimestamp}
-Syntax: `deltaSumTimestamp(value, timestamp)`
-Adds the differences between consecutive rows. If the difference is negative, it is ignored.
-Uses `timestamp` to order values.
-This function is primarily for materialized views that are ordered by some time bucket aligned
-timestamp, for example a `toStartOfMinute` bucket. Because the rows in such a materialized view
-will all have the same timestamp, it is impossible for them to be merged in the "right" order. This
-function keeps track of the `timestamp` of the values it's seen, so it's possible to order the states
-correctly during merging.
-To calculate the delta sum across an ordered collection you can simply use the
-[deltaSum](./deltasum.md) function.
+Adds the difference between consecutive rows. If the difference is negative, it is ignored.
+This function is primarily for [materialized views](../../../sql-reference/statements/create/view.md#materialized) that are ordered by some time bucket-aligned timestamp, for example, a `toStartOfMinute` bucket. Because the rows in such a materialized view will all have the same timestamp, it is impossible for them to be merged in the "right" order. This function keeps track of the `timestamp` of the values it's seen, so it's possible to order the states correctly during merging.
+To calculate the delta sum across an ordered collection you can simply use the [deltaSum](../../../sql-reference/aggregate-functions/reference/deltasum.md#agg_functions-deltasum) function.
+**Syntax**
+``` sql
+deltaSumTimestamp(value, timestamp)
+```
**Arguments**
-- `value` must be some [Integer](../../data-types/int-uint.md) type or [Float](../../data-types/float.md) type or a [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md).
-- `timestamp` must be some [Integer](../../data-types/int-uint.md) type or [Float](../../data-types/float.md) type or a [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md).
+- `value` — Input values, must be some [Integer](../../data-types/int-uint.md) type or [Float](../../data-types/float.md) type or a [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md).
+- `timestamp` — The parameter for ordering values, must be some [Integer](../../data-types/int-uint.md) type or [Float](../../data-types/float.md) type or a [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md).
**Returned value**
- Accumulated differences between consecutive values, ordered by the `timestamp` parameter.
+Type: [Integer](../../data-types/int-uint.md) or [Float](../../data-types/float.md) or [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md).
**Example**
+Query:
```sql
SELECT deltaSumTimestamp(value, timestamp)
-FROM (select number as timestamp, [0, 4, 8, 3, 0, 0, 0, 1, 3, 5][number] as value from numbers(1, 10))
+FROM (SELECT number AS timestamp, [0, 4, 8, 3, 0, 0, 0, 1, 3, 5][number] AS value FROM numbers(1, 10));
```
+Result:
``` text
┌─deltaSumTimestamp(value, timestamp)─┐
│                                  13 │
└─────────────────────────────────────┘
```
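One way to picture why the timestamps must be part of the state: alongside the running sum, each partial aggregation state keeps its first and last value together with their timestamps, so two states can be combined in timestamp order during a merge. A simplified, hypothetical model of such a state (not ClickHouse's actual implementation):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Hypothetical, simplified model of the aggregation state. Assumes the two
// merged states cover non-overlapping timestamp ranges, which holds for the
// materialized-view scenario described above.
struct DeltaSumTimestampState
{
    int64_t sum = 0;
    int64_t first_value = 0, last_value = 0;
    int64_t first_ts = 0, last_ts = 0;
    bool seen = false;

    void add(int64_t value, int64_t ts)
    {
        if (!seen)
        {
            first_value = last_value = value;
            first_ts = last_ts = ts;
            seen = true;
            return;
        }
        if (value > last_value) // negative differences are ignored
            sum += value - last_value;
        last_value = value;
        last_ts = ts;
    }

    // The stored timestamps let two partial states be combined in the right order.
    void merge(const DeltaSumTimestampState & other)
    {
        if (!other.seen)
            return;
        if (!seen)
        {
            *this = other;
            return;
        }
        if (other.first_ts >= last_ts) // `other` comes entirely after us
        {
            sum += other.sum + std::max<int64_t>(0, other.first_value - last_value);
            last_value = other.last_value;
            last_ts = other.last_ts;
        }
        else // `other` comes entirely before us
        {
            DeltaSumTimestampState combined = other;
            combined.sum += sum + std::max<int64_t>(0, first_value - other.last_value);
            combined.last_value = last_value;
            combined.last_ts = last_ts;
            *this = combined;
        }
    }
};

int main()
{
    const int64_t values_a[] = {0, 4, 8, 3, 0};
    const int64_t values_b[] = {0, 0, 1, 3, 5};
    DeltaSumTimestampState a, b;
    for (int i = 0; i < 5; ++i) a.add(values_a[i], i + 1); // timestamps 1..5
    for (int i = 0; i < 5; ++i) b.add(values_b[i], i + 6); // timestamps 6..10
    a.merge(b);
    std::cout << a.sum << '\n'; // prints 13, matching the query above
}
```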


@@ -95,9 +95,10 @@ LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000))
The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
-If `preallocate` is `true` (default is `false`) the hash table will be preallocated (this will make dictionary load faster). But note that you should use it only if:
-- the source support approximate number of elements (for now it is supported only by the `ClickHouse` source)
-- there is no duplicates in the data (otherwise it may increase memory usage for the hashtable)
+If `preallocate` is `true` (default is `false`) the hash table will be preallocated (this will make the dictionary load faster). But note that you should use it only if:
+- The source supports an approximate number of elements (for now it is supported only by the `ClickHouse` source).
+- There are no duplicates in the data (otherwise it may increase memory usage for the hashtable).
All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
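The intuition behind `preallocate` is the standard reserve-before-insert optimization: if the source can report an approximate element count up front, the hash table allocates once instead of rehashing repeatedly while it grows; with duplicate keys the estimate overshoots and memory is wasted. A generic sketch of the idea, using `std::unordered_map` purely for illustration (ClickHouse uses its own hash tables):

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>

std::unordered_map<uint64_t, std::string> loadDictionary(size_t approx_count, bool preallocate)
{
    std::unordered_map<uint64_t, std::string> table;
    // One allocation up front; no rehashing while the dictionary loads.
    if (preallocate && approx_count)
        table.reserve(approx_count);
    for (uint64_t key = 0; key < approx_count; ++key)
        table.emplace(key, "value");
    return table;
}
```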


@@ -10,13 +10,14 @@ toc_title: External Dictionaries
For information on connecting and configuring external dictionaries, see [External dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md).
-## dictGet, dictGetOrDefault {#dictget}
+## dictGet, dictGetOrDefault, dictGetOrNull {#dictget}
Retrieves values from an external dictionary.
``` sql
dictGet('dict_name', attr_names, id_expr)
dictGetOrDefault('dict_name', attr_names, id_expr, default_value_expr)
+dictGetOrNull('dict_name', attr_name, id_expr)
```
**Arguments**
@@ -34,12 +35,13 @@ dictGetOrDefault('dict_name', attr_names, id_expr, default_value_expr)
- `dictGet` returns the content of the `<null_value>` element specified for the attribute in the dictionary configuration.
- `dictGetOrDefault` returns the value passed as the `default_value_expr` parameter.
+- `dictGetOrNull` returns `NULL` in case the key was not found in the dictionary.
ClickHouse throws an exception if it cannot parse the value of the attribute or the value doesn't match the attribute data type.
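Conceptually, the three variants differ only in how a missing key is reported: `dictGet` falls back to the configured `<null_value>`, `dictGetOrDefault` to a caller-supplied expression, and `dictGetOrNull` signals the miss in-band with `NULL`. A generic sketch of the distinction (a map-backed toy, not ClickHouse's dictionary API):

```cpp
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

using Dict = std::unordered_map<uint64_t, std::string>;

// dictGet: a missing key yields the configured <null_value>
// (modelled here as the type's default value).
std::string dictGet(const Dict & dict, uint64_t key)
{
    auto it = dict.find(key);
    return it == dict.end() ? std::string{} : it->second;
}

// dictGetOrDefault: a missing key yields a caller-supplied default.
std::string dictGetOrDefault(const Dict & dict, uint64_t key, std::string def)
{
    auto it = dict.find(key);
    return it == dict.end() ? std::move(def) : it->second;
}

// dictGetOrNull: a missing key yields NULL (modelled as an empty optional).
std::optional<std::string> dictGetOrNull(const Dict & dict, uint64_t key)
{
    auto it = dict.find(key);
    if (it == dict.end())
        return std::nullopt;
    return it->second;
}
```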
-**Example for single attribute**
+**Example for simple key dictionary**
-Create a text file `ext-dict-text.csv` containing the following:
+Create a text file `ext-dict-test.csv` containing the following:
``` text
1,1
@@ -96,7 +98,7 @@ LIMIT 3
└─────┴────────┘
```
-**Example for multiple attributes**
+**Example for complex key dictionary**
Create a text file `ext-dict-mult.csv` containing the following:
@@ -161,6 +163,65 @@ LIMIT 3;
└─────────┴───────────────────────┘
```
**Example for range key dictionary**
Input table:
```sql
CREATE TABLE range_key_dictionary_source_table
(
key UInt64,
start_date Date,
end_date Date,
value String,
value_nullable Nullable(String)
)
ENGINE = TinyLog();
INSERT INTO range_key_dictionary_source_table VALUES(1, toDate('2019-05-20'), toDate('2019-05-20'), 'First', 'First');
INSERT INTO range_key_dictionary_source_table VALUES(2, toDate('2019-05-20'), toDate('2019-05-20'), 'Second', NULL);
INSERT INTO range_key_dictionary_source_table VALUES(3, toDate('2019-05-20'), toDate('2019-05-20'), 'Third', 'Third');
```
Create the external dictionary:
```sql
CREATE DICTIONARY range_key_dictionary
(
key UInt64,
start_date Date,
end_date Date,
value String,
value_nullable Nullable(String)
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'range_key_dictionary_source_table'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(RANGE_HASHED())
RANGE(MIN start_date MAX end_date);
```
Perform the query:
``` sql
SELECT
(number, toDate('2019-05-20')),
dictHas('range_key_dictionary', number, toDate('2019-05-20')),
dictGetOrNull('range_key_dictionary', 'value', number, toDate('2019-05-20')),
dictGetOrNull('range_key_dictionary', 'value_nullable', number, toDate('2019-05-20')),
dictGetOrNull('range_key_dictionary', ('value', 'value_nullable'), number, toDate('2019-05-20'))
FROM system.numbers LIMIT 5 FORMAT TabSeparated;
```
Result:
``` text
(0,'2019-05-20') 0 \N \N (NULL,NULL)
(1,'2019-05-20') 1 First First ('First','First')
(2,'2019-05-20') 0 \N \N (NULL,NULL)
(3,'2019-05-20') 0 \N \N (NULL,NULL)
(4,'2019-05-20') 0 \N \N (NULL,NULL)
```
**See Also**
- [External Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)


@@ -82,8 +82,9 @@ SELECT * FROM s3_engine_table LIMIT 2;
Optional settings:
- `access_key_id` and `secret_access_key` — Credentials to use with the given endpoint.
+- `use_environment_credentials` — If `true`, the S3 client will try to obtain credentials from environment variables and [Amazon EC2](https://ru.wikipedia.org/wiki/Amazon_EC2) metadata for the given endpoint. Default value is `false`.
+- `use_insecure_imds_request` — Whether to use a less secure connection for the IMDS request when obtaining credentials from Amazon EC2 metadata. Default value is `false`.
- `region` — The S3 region name.
-- `use_environment_credentials` — If `true`, the S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for the given endpoint. Default value is `false`.
- `header` — Adds the specified HTTP header to a request to the given endpoint. May be specified multiple times.
- `server_side_encryption_customer_key_base64` — Sets the headers required to access S3 objects with SSE-C encryption.
@@ -97,6 +98,7 @@ SELECT * FROM s3_engine_table LIMIT 2;
<!-- <secret_access_key>SECRET_ACCESS_KEY</secret_access_key> -->
<!-- <region>us-west-1</region> -->
<!-- <use_environment_credentials>false</use_environment_credentials> -->
+<!-- <use_insecure_imds_request>false</use_insecure_imds_request> -->
<!-- <header>Authorization: Bearer SOME-TOKEN</header> -->
<!-- <server_side_encryption_customer_key_base64>BASE64-ENCODED-KEY</server_side_encryption_customer_key_base64> -->
</endpoint-name>
@@ -143,6 +145,7 @@ ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_p
CREATE TABLE big_table (name String, value UInt32)
ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV');
```
**See also**
- [S3 table function](../../../sql-reference/table-functions/s3.md)


@@ -95,7 +95,9 @@ sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh
- [AArch64](https://builds.clickhouse.tech/master/aarch64/clickhouse) — `curl -O 'https://builds.clickhouse.tech/master/aarch64/clickhouse' && chmod a+x ./clickhouse`
- [FreeBSD](https://builds.clickhouse.tech/master/freebsd/clickhouse) — `curl -O 'https://builds.clickhouse.tech/master/freebsd/clickhouse' && chmod a+x ./clickhouse`
-After downloading, you can use `clickhouse client` to connect to the server, or `clickhouse local` to process local data. To run `clickhouse server`, you need to download the [server](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/config.xml) and [users](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/users.xml) configuration files from GitHub.
+After downloading, you can use `clickhouse client` to connect to the server or `clickhouse local` to process local data.
+To install ClickHouse system-wide (with the needed configuration files, user settings and so on), run `sudo ./clickhouse install`. Then run `clickhouse start` (to start the server) and `clickhouse-client` (to connect to it).
These builds are not recommended for production use because they are not thoroughly tested. Also, not all ClickHouse features are available in them.
@@ -172,4 +174,3 @@ SELECT 1
**Congratulations, the system works!**
For further experiments, you can try downloading one of the test datasets or walking through the [step-by-step tutorial for beginners](https://clickhouse.tech/tutorial.html).


@@ -2767,12 +2767,12 @@ SELECT * FROM test2;
## prefer_column_name_to_alias {#prefer-column-name-to-alias}
-Enables or disables substituting column names with synonyms in expressions and clauses of queries; see [Notes on using aliases](../../sql-reference/syntax.md#syntax-expression_aliases). Enable this setting to make alias syntax in ClickHouse more compatible with most other DBMSs.
+Enables or disables substituting column names with aliases in expressions and clauses of queries; see [Notes on using aliases](../../sql-reference/syntax.md#syntax-expression_aliases). Enable this setting to make alias syntax in ClickHouse more compatible with most other DBMSs.
Possible values:
-- 0 — the synonym is substituted for the column name.
+- 0 — the alias is substituted for the column name.
-- 1 — the synonym is not substituted for the column name.
+- 1 — the alias is not substituted for the column name.
Default value: `0`.


@@ -21,6 +21,7 @@
- `bytes_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — The amount of RAM used by the dictionary.
- `query_count` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — The number of queries since the dictionary was loaded or since the last successful reload.
- `hit_rate` ([Float64](../../sql-reference/data-types/float.md)) — For cache dictionaries, the percentage of cached values.
+- `found_rate` ([Float64](../../sql-reference/data-types/float.md)) — The percentage of dictionary lookups for which the value was found.
- `element_count` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — The number of elements stored in the dictionary.
- `load_factor` ([Float64](../../sql-reference/data-types/float.md)) — The fill rate of the dictionary (for a hashed dictionary, the fill rate of the hash table).
- `source` ([String](../../sql-reference/data-types/string.md)) — Text describing the [data source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) for the dictionary.
@@ -58,4 +59,3 @@ SELECT * FROM system.dictionaries
│ dictdb │ dict │ LOADED │ dictdb.dict │ Flat │ UInt64 │ ['value_default','value_expression'] │ ['String','String'] │ 74032 │ 0 │ 1 │ 1 │ 0.0004887585532746823 │ ClickHouse: dictdb.dicttbl │ 0 │ 1 │ 2020-03-04 04:17:34 │ 2020-03-04 04:30:34 │ 0.002 │ │
└──────────┴──────┴────────┴─────────────┴──────┴────────┴──────────────────────────────────────┴─────────────────────┴─────────────────┴─────────────┴──────────┴───────────────┴───────────────────────┴────────────────────────────┴──────────────┴──────────────┴─────────────────────┴──────────────────────────────┴───────────────────────┴────────────────┘
```


@@ -0,0 +1,27 @@
## ClickHouse compressor
A simple program for data compression and decompression the ClickHouse way.
### Examples
Compress data with LZ4:
```
$ ./clickhouse-compressor < input_file > output_file
```
Decompress data from LZ4 format:
```
$ ./clickhouse-compressor --decompress < input_file > output_file
```
Compress data with ZSTD at level 5:
```
$ ./clickhouse-compressor --codec 'ZSTD(5)' < input_file > output_file
```
Compress data with Delta of four bytes and ZSTD level 10:
```
$ ./clickhouse-compressor --codec 'Delta(4)' --codec 'ZSTD(10)' < input_file > output_file
```
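Codecs given on the command line chain left to right: `Delta(4)` first rewrites the stream so that each 4-byte value is stored as the difference from its predecessor, and ZSTD then compresses those (typically small) deltas. A toy sketch of the delta stage, for illustration only (not ClickHouse's codec implementation, which works on raw byte streams):

```cpp
#include <cstdint>
#include <vector>

// Delta(4): store each 4-byte value as the difference from its predecessor.
// Slowly changing sequences become runs of small numbers, which ZSTD then
// compresses much better than the raw values. (The real codec emits the
// first value as-is; here we diff against 0 for brevity.)
std::vector<int32_t> deltaEncode(const std::vector<int32_t> & values)
{
    std::vector<int32_t> out;
    out.reserve(values.size());
    int32_t prev = 0;
    for (int32_t v : values)
    {
        out.push_back(v - prev);
        prev = v;
    }
    return out;
}

std::vector<int32_t> deltaDecode(const std::vector<int32_t> & deltas)
{
    std::vector<int32_t> out;
    out.reserve(deltas.size());
    int32_t prev = 0;
    for (int32_t d : deltas)
    {
        prev += d;
        out.push_back(prev);
    }
    return out;
}
```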


@@ -0,0 +1,98 @@
---
toc_priority: 65
toc_title: clickhouse-format
---
# clickhouse-format {#clickhouse-format}
Allows formatting input queries.
Keys:
- `--help` or `-h` — Prints the help message.
- `--hilite` — Adds syntax highlighting using ANSI escape sequences.
- `--oneline` — Formats the query on a single line.
- `--quiet` or `-q` — Checks the syntax, with no output on success.
- `--multiquery` or `-n` — Supports multiple queries on one line.
- `--obfuscate` — Obfuscates instead of formatting.
- `--seed <string>` — Sets the seed string that determines the result of obfuscation.
- `--backslash` — Adds a backslash at the end of each line of the formatted query. Convenient when a multi-line query is copied from the web or another source and needs to be executed from the command line.
## Examples {#examples}
1. Syntax highlighting and single-line formatting:
```bash
$ clickhouse-format --oneline --hilite <<< "SELECT sum(number) FROM numbers(5);"
```
Result:
```sql
SELECT sum(number) FROM numbers(5)
```
2. Multiple queries on one line:
```bash
$ clickhouse-format -n <<< "SELECT * FROM (SELECT 1 AS x UNION ALL SELECT 1 UNION DISTINCT SELECT 3);"
```
Result:
```text
SELECT *
FROM
(
SELECT 1 AS x
UNION ALL
SELECT 1
UNION DISTINCT
SELECT 3
)
;
```
3. Obfuscation:
```bash
$ clickhouse-format --seed Hello --obfuscate <<< "SELECT cost_first_screen BETWEEN a AND b, CASE WHEN x >= 123 THEN y ELSE NULL END;"
```
Result:
```text
SELECT treasury_mammoth_hazelnut BETWEEN nutmeg AND span, CASE WHEN chive >= 116 THEN switching ELSE ANYTHING END;
```
The same query with a different obfuscation seed:
```bash
$ clickhouse-format --seed World --obfuscate <<< "SELECT cost_first_screen BETWEEN a AND b, CASE WHEN x >= 123 THEN y ELSE NULL END;"
```
Result:
```text
SELECT horse_tape_summer BETWEEN folklore AND moccasins, CASE WHEN intestine >= 116 THEN nonconformist ELSE FORESTRY END;
```
4. Adding a backslash:
```bash
$ clickhouse-format --backslash <<< "SELECT * FROM (SELECT 1 AS x UNION ALL SELECT 1 UNION DISTINCT SELECT 3);"
```
Result:
```text
SELECT * \
FROM \
( \
SELECT 1 AS x \
UNION ALL \
SELECT 1 \
UNION DISTINCT \
SELECT 3 \
)
```


@@ -6,6 +6,10 @@ toc_title: "Overview"
# ClickHouse Utilities {#utility-clickhouse}
-- [clickhouse-local](clickhouse-local.md)
+- [clickhouse-local](clickhouse-local.md) - Allows running SQL queries on data without stopping the ClickHouse server, similar to the `awk` utility.
- [clickhouse-copier](clickhouse-copier.md) - Copies (and reshards) data from one cluster to another.
+- [clickhouse-benchmark](../../operations/utilities/clickhouse-benchmark.md) — Connects to a ClickHouse server and runs the specified queries in a loop.
+- [clickhouse-format](../../operations/utilities/clickhouse-format.md) — Allows formatting input queries.
+- [ClickHouse obfuscator](../../operations/utilities/clickhouse-obfuscator.md) — Obfuscates data.
+- [ClickHouse compressor](../../operations/utilities/clickhouse-compressor.md) — Compresses and decompresses data.
+- [clickhouse-odbc-bridge](../../operations/utilities/odbc-bridge.md) — A proxy server for the ODBC driver.


@@ -0,0 +1,38 @@
# clickhouse-odbc-bridge
A simple HTTP server that works as a proxy for the ODBC driver. The main motivation
is that segfaults or other faults in ODBC implementations can
crash the whole clickhouse-server process.
This tool works via HTTP, not via pipes, shared memory, or TCP because:
- It's simpler to implement
- It's simpler to debug
- jdbc-bridge can be implemented in the same way
## Usage
`clickhouse-server` uses this tool inside the `odbc` table function and StorageODBC.
However, it can also be used as a standalone tool from the command line with the following
parameters in the POST request URL:
- `connection_string` -- ODBC connection string.
- `columns` -- columns in ClickHouse NamesAndTypesList format, name in backticks,
type as string. Name and type are space separated, rows separated with
newline.
- `max_block_size` -- optional parameter, sets maximum size of single block.
The query is sent in the POST body. The response is returned in RowBinary format.
## Example:
```bash
$ clickhouse-odbc-bridge --http-port 9018 --daemon
$ curl -d "query=SELECT PageID, ImpID, AdType FROM Keys ORDER BY PageID, ImpID" --data-urlencode "connection_string=DSN=ClickHouse;DATABASE=stat" --data-urlencode "columns=columns format version: 1
3 columns:
\`PageID\` String
\`ImpID\` String
\`AdType\` String
" "http://localhost:9018/" > result.txt
$ cat result.txt # Result in RowBinary format
12246623837185725195925621517
```


@@ -6,6 +6,9 @@ toc_priority: 141
Sums the arithmetic difference between consecutive rows. If the difference is negative, it is ignored.
+!!! info "Note"
+    The underlying data must be sorted for this function to work properly. In a [materialized view](../../../sql-reference/statements/create/view.md#materialized) it is recommended to use [deltaSumTimestamp](../../../sql-reference/aggregate-functions/reference/deltasumtimestamp.md#agg_functions-deltasumtimestamp) instead.
**Syntax**
``` sql
@@ -18,7 +21,8 @@ deltaSum(value)
**Returned value**
-- The accumulated arithmetic difference, of the `Integer` or `Float` type.
+- The accumulated arithmetic difference.
+Type: `Integer` or `Float`.
**Examples**


@@ -0,0 +1,45 @@
---
toc_priority: 141
---
# deltaSumTimestamp {#agg_functions-deltasumtimestamp}
Sums the difference between consecutive rows. If the difference is negative, it is ignored.
This function is primarily for [materialized views](../../../sql-reference/statements/create/view.md#materialized) that are ordered by some time-bucket-aligned timestamp, for example a `toStartOfMinute` bucket. Because the rows in such a materialized view all have the same timestamp, it is impossible to merge them in the "right" order. The function keeps track of the `timestamp` of the observed values, so the states can be ordered correctly during merging.
To calculate the delta sum across ordered consecutive rows, you can use the [deltaSum](../../../sql-reference/aggregate-functions/reference/deltasum.md#agg_functions-deltasum) function instead of `deltaSumTimestamp`.
**Syntax**
``` sql
deltaSumTimestamp(value, timestamp)
```
**Arguments**
- `value` — Input values, must be of type [Integer](../../data-types/int-uint.md), [Float](../../data-types/float.md), [Date](../../data-types/date.md), or [DateTime](../../data-types/datetime.md).
- `timestamp` — The parameter for ordering values, must be of type [Integer](../../data-types/int-uint.md), [Float](../../data-types/float.md), [Date](../../data-types/date.md), or [DateTime](../../data-types/datetime.md).
**Returned value**
- The accumulated difference between consecutive values, ordered by the `timestamp` parameter.
Type: [Integer](../../data-types/int-uint.md), [Float](../../data-types/float.md), [Date](../../data-types/date.md), or [DateTime](../../data-types/datetime.md).
**Example**
Query:
```sql
SELECT deltaSumTimestamp(value, timestamp)
FROM (SELECT number AS timestamp, [0, 4, 8, 3, 0, 0, 0, 1, 3, 5][number] AS value FROM numbers(1, 10));
```
Result:
``` text
┌─deltaSumTimestamp(value, timestamp)─┐
│                                  13 │
└─────────────────────────────────────┘
```


@@ -94,26 +94,35 @@ LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000))
The dictionary is completely stored in memory in the form of hash tables. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
+If `preallocate` is `true` (default is `false`), the hash table will be preallocated (this will make the dictionary load faster). Use it only if:
+- The source supports an approximate number of elements (for now this is supported only by the `ClickHouse` source).
+- There are no duplicates in the data (otherwise it may increase the memory usage of the hash table).
All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
Example configuration:
``` xml
<layout>
-    <hashed />
+    <hashed>
+        <preallocate>0</preallocate>
+    </hashed>
</layout>
```
or
``` sql
-LAYOUT(HASHED())
+LAYOUT(HASHED(PREALLOCATE 0))
```
### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed}
Similar to `hashed`, but uses less memory in favor of more CPU usage.
+For this layout type, `preallocate` can also be set to `true`. Here it is even more important than for the `hashed` layout.
Example configuration:
``` xml
@@ -125,7 +134,7 @@ LAYOUT(HASHED())
or
``` sql
-LAYOUT(SPARSE_HASHED())
+LAYOUT(SPARSE_HASHED([PREALLOCATE 0]))
```
### complex_key_hashed {#complex-key-hashed}
@@ -443,4 +452,3 @@ dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1')))
No other types are supported. The function returns the attribute for the prefix that matches the given IP address. If there are overlapping prefixes, the most specific one is returned.
The data must fit completely in RAM.


@@ -7,13 +7,14 @@ toc_title: "Functions for Working with External Dictionaries"
For information on connecting and configuring external dictionaries, see [External dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md).
-## dictGet, dictGetOrDefault {#dictget}
+## dictGet, dictGetOrDefault, dictGetOrNull {#dictget}
Retrieves a value from an external dictionary.
``` sql
dictGet('dict_name', attr_names, id_expr)
dictGetOrDefault('dict_name', attr_names, id_expr, default_value_expr)
+dictGetOrNull('dict_name', attr_name, id_expr)
```
**Arguments**
@@ -31,6 +32,7 @@ dictGetOrDefault('dict_name', attr_names, id_expr, default_value_expr)
- `dictGet` returns the content of the `<null_value>` element specified for the attribute in the dictionary configuration.
- `dictGetOrDefault` returns the value passed as the `default_value_expr` parameter.
+- `dictGetOrNull` returns `NULL` if the key is not found in the dictionary.
ClickHouse throws an exception if it cannot parse the value of the attribute or the value does not match the attribute data type.
@@ -158,6 +160,65 @@ LIMIT 3;
└─────────┴───────────────────────┘
```
**Example for a range key dictionary**
Create a table:
```sql
CREATE TABLE range_key_dictionary_source_table
(
key UInt64,
start_date Date,
end_date Date,
value String,
value_nullable Nullable(String)
)
ENGINE = TinyLog();
INSERT INTO range_key_dictionary_source_table VALUES(1, toDate('2019-05-20'), toDate('2019-05-20'), 'First', 'First');
INSERT INTO range_key_dictionary_source_table VALUES(2, toDate('2019-05-20'), toDate('2019-05-20'), 'Second', NULL);
INSERT INTO range_key_dictionary_source_table VALUES(3, toDate('2019-05-20'), toDate('2019-05-20'), 'Third', 'Third');
```
Create the external dictionary:
```sql
CREATE DICTIONARY range_key_dictionary
(
key UInt64,
start_date Date,
end_date Date,
value String,
value_nullable Nullable(String)
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'range_key_dictionary_source_table'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(RANGE_HASHED())
RANGE(MIN start_date MAX end_date);
```
Run the query:
``` sql
SELECT
(number, toDate('2019-05-20')),
dictHas('range_key_dictionary', number, toDate('2019-05-20')),
dictGetOrNull('range_key_dictionary', 'value', number, toDate('2019-05-20')),
dictGetOrNull('range_key_dictionary', 'value_nullable', number, toDate('2019-05-20')),
dictGetOrNull('range_key_dictionary', ('value', 'value_nullable'), number, toDate('2019-05-20'))
FROM system.numbers LIMIT 5 FORMAT TabSeparated;
```
Result:
``` text
(0,'2019-05-20') 0 \N \N (NULL,NULL)
(1,'2019-05-20') 1 First First ('First','First')
(2,'2019-05-20') 0 \N \N (NULL,NULL)
(3,'2019-05-20') 0 \N \N (NULL,NULL)
(4,'2019-05-20') 0 \N \N (NULL,NULL)
```
**See also**
- [External dictionaries](../../sql-reference/functions/ext-dict-functions.md)


@@ -3,6 +3,7 @@
#include <Columns/IColumn.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
+#include <Columns/ColumnArray.h>
#include <Core/Field.h>


@@ -104,7 +104,7 @@ class IColumn;
    M(Bool, compile_expressions, true, "Compile some scalar functions and operators to native code.", 0) \
    M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \
    M(UInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 0 - the threshold is not set.", 0) \
-   M(UInt64, group_by_two_level_threshold_bytes, 100000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \
+   M(UInt64, group_by_two_level_threshold_bytes, 50000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \
    M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \
    M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'.", 0) \
    \
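For context on the changed default: aggregation switches from a single-level to a two-level hash table when either the key-count threshold or the (now halved) state-size-in-bytes threshold fires. A simplified sketch of that trigger, with the decision reduced to its core (the real check lives inside ClickHouse's Aggregator, and the struct name here is illustrative):

```cpp
#include <cstddef>

// Illustrative only: two-level aggregation kicks in when EITHER threshold fires.
struct TwoLevelThresholds
{
    size_t group_by_two_level_threshold = 100000;         // number of keys
    size_t group_by_two_level_threshold_bytes = 50000000; // state size in bytes (was 100000000)

    bool shouldConvertToTwoLevel(size_t result_size, size_t result_size_bytes) const
    {
        return (group_by_two_level_threshold && result_size >= group_by_two_level_threshold)
            || (group_by_two_level_threshold_bytes && result_size_bytes >= group_by_two_level_threshold_bytes);
    }
};
```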


@@ -6,6 +6,7 @@
#include <unistd.h>
#include <IO/ReadBufferAIO.h>
#include <Common/randomSeed.h>
+#include <filesystem>
#include <fstream>
#include <string>
@@ -14,7 +15,7 @@ namespace
{
std::string createTmpFileForEOFtest()
{
-   char pattern[] = "/tmp/fileXXXXXX";
+   char pattern[] = "./EOFtestFolderXXXXXX";
    if (char * dir = ::mkdtemp(pattern); dir)
    {
        return std::string(dir) + "/foo";
@@ -78,6 +79,13 @@ TEST(ReadBufferAIOTest, TestReadAfterAIO)
    size_t read_after_eof_big = testbuf.read(repeatdata.data(), repeatdata.size());
    EXPECT_EQ(read_after_eof_big, data.length());
    EXPECT_TRUE(testbuf.eof());
+   if (file_path[0] != '/')
+   {
+       const size_t last_slash = file_path.rfind('/');
+       const std::string temp_dir = file_path.substr(0, last_slash);
+       std::filesystem::remove_all(temp_dir);
+   }
}
#endif
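The switch from `/tmp` to the working directory means the test now owns its scratch space and must remove it, which is what the new `remove_all` branch does. A standalone sketch of the same create-use-cleanup pattern, assuming a POSIX system for `mkdtemp`:

```cpp
#include <stdlib.h> // mkdtemp (POSIX)
#include <filesystem>
#include <fstream>
#include <string>

int main()
{
    // mkdtemp replaces the XXXXXX suffix in place, so the template must be a mutable array.
    char pattern[] = "./EOFtestFolderXXXXXX";
    char * dir = ::mkdtemp(pattern);
    if (!dir)
        return 1;
    const std::string file_path = std::string(dir) + "/foo";
    std::ofstream(file_path) << "data";
    // The directory is ours (relative path, created above), so remove it recursively.
    std::filesystem::remove_all(dir);
}
```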


@@ -26,6 +26,7 @@ namespace ErrorCodes
    extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
    extern const int THERE_IS_NO_COLUMN;
    extern const int ILLEGAL_COLUMN;
+   extern const int NOT_FOUND_COLUMN_IN_BLOCK;
}
const char * ActionsDAG::typeToString(ActionsDAG::ActionType type)
@@ -439,6 +440,164 @@ void ActionsDAG::removeUnusedActions(bool allow_remove_inputs)
    inputs.erase(it, inputs.end());
}
static ColumnWithTypeAndName executeActionForHeader(const ActionsDAG::Node * node, ColumnsWithTypeAndName arguments)
{
    ColumnWithTypeAndName res_column;
    res_column.type = node->result_type;
    res_column.name = node->result_name;
    switch (node->type)
    {
        case ActionsDAG::ActionType::FUNCTION:
        {
            // bool all_args_are_const = true;
            // for (const auto & argument : arguments)
            //     if (typeid_cast<const ColumnConst *>(argument.column.get()) == nullptr)
            //         all_args_are_const = false;
            res_column.column = node->function->execute(arguments, res_column.type, 0, true);
            // if (!all_args_are_const)
            //     res_column.column = res_column.column->convertToFullColumnIfConst();
            break;
        }
        case ActionsDAG::ActionType::ARRAY_JOIN:
        {
            auto key = arguments.at(0);
            key.column = key.column->convertToFullColumnIfConst();
            const ColumnArray * array = typeid_cast<const ColumnArray *>(key.column.get());
            if (!array)
                throw Exception(ErrorCodes::TYPE_MISMATCH,
                                "ARRAY JOIN of not array: {}", node->result_name);
            res_column.column = array->getDataPtr()->cloneEmpty();
            break;
        }
        case ActionsDAG::ActionType::COLUMN:
        {
            res_column.column = node->column->cloneResized(0);
            break;
        }
        case ActionsDAG::ActionType::ALIAS:
        {
            res_column.column = arguments.at(0).column;
            break;
        }
        case ActionsDAG::ActionType::INPUT:
        {
            break;
        }
    }
    return res_column;
}
Block ActionsDAG::updateHeader(Block header) const
{
    std::unordered_map<const Node *, ColumnWithTypeAndName> node_to_column;
    std::set<size_t> pos_to_remove;
    {
        std::unordered_map<std::string_view, std::list<size_t>> input_positions;
        for (size_t pos = 0; pos < inputs.size(); ++pos)
            input_positions[inputs[pos]->result_name].emplace_back(pos);
        for (size_t pos = 0; pos < header.columns(); ++pos)
        {
            const auto & col = header.getByPosition(pos);
            auto it = input_positions.find(col.name);
            if (it != input_positions.end() && !it->second.empty())
            {
                auto & list = it->second;
                pos_to_remove.insert(pos);
                node_to_column[inputs[list.front()]] = std::move(col);
                list.pop_front();
            }
        }
    }
    ColumnsWithTypeAndName result_columns;
    result_columns.reserve(index.size());
    struct Frame
    {
        const Node * node;
        size_t next_child = 0;
    };
    {
        for (const auto * output : index)
        {
            if (node_to_column.count(output) == 0)
            {
                std::stack<Frame> stack;
                stack.push({.node = output});
                while (!stack.empty())
                {
                    auto & frame = stack.top();
                    const auto * node = frame.node;
                    while (frame.next_child < node->children.size())
                    {
                        const auto * child = node->children[frame.next_child];
                        if (node_to_column.count(child) == 0)
                        {
                            stack.push({.node = child});
                            break;
                        }
                        ++frame.next_child;
                    }
                    if (frame.next_child < node->children.size())
                        continue;
                    stack.pop();
                    ColumnsWithTypeAndName arguments(node->children.size());
                    for (size_t i = 0; i < arguments.size(); ++i)
                    {
                        arguments[i] = node_to_column[node->children[i]];
                        if (!arguments[i].column)
                            throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK,
                                            "Not found column {} in block", node->children[i]->result_name);
                    }
                    node_to_column[node] = executeActionForHeader(node, std::move(arguments));
                }
            }
            auto & column = node_to_column[output];
            if (column.column)
                result_columns.push_back(node_to_column[output]);
        }
    }
    if (isInputProjected())
        header.clear();
    else
        header.erase(pos_to_remove);
    Block res;
    for (auto & col : result_columns)
        res.insert(std::move(col));
    for (const auto & item : header)
        res.insert(std::move(item));
    return res;
}
NameSet ActionsDAG::foldActionsByProjection( NameSet ActionsDAG::foldActionsByProjection(
const NameSet & required_columns, const Block & projection_block_for_keys, const String & predicate_column_name, bool add_missing_keys) const NameSet & required_columns, const Block & projection_block_for_keys, const String & predicate_column_name, bool add_missing_keys)
{ {
@@ -186,6 +186,14 @@ public:

     ActionsDAGPtr clone() const;

+    /// Execute actions for header. Input block must have empty columns.
+    /// Result should be equal to the execution of ExpressionActions built from this DAG.
+    /// Actions are not changed, no expressions are compiled.
+    ///
+    /// In addition, check that result constants are constants according to DAG.
+    /// If a function returns a constant, but its arguments are not constant, materialize it.
+    Block updateHeader(Block header) const;
+
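A minimal usage sketch for the new method (illustrative only, not part of the commit; the DAG, column name, and types are assumed):

    // Given an ActionsDAG `dag` whose single output computes plus(x, 1)
    // over an input column "x", updateHeader maps a header of empty
    // columns to the header the expression would produce:
    Block in_header{{ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "x"}};
    Block out_header = dag.updateHeader(std::move(in_header)); // one empty column "plus(x, 1)"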
     /// For apply materialize() function for every output.
     /// Also add aliases so the result names remain unchanged.
     void addMaterializingOutputActions();
@@ -4,14 +4,15 @@
 #if USE_ARROW || USE_ORC || USE_PARQUET

 #include <DataTypes/DataTypeFactory.h>
 #include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypeString.h>
 #include <DataTypes/DataTypesDecimal.h>
 #include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/DataTypeArray.h>
 #include <common/DateLUTImpl.h>
 #include <common/types.h>
 #include <Core/Block.h>
 #include <Columns/ColumnString.h>
 #include <Columns/ColumnNullable.h>
+#include <Columns/ColumnArray.h>
 #include <Interpreters/castColumn.h>
 #include <algorithm>
 #include <DataTypes/DataTypeLowCardinality.h>
@@ -59,9 +60,9 @@ namespace DB
     /// Inserts numeric data right into internal column data to reduce an overhead
     template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
-    static void fillColumnWithNumericData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
+    static void fillColumnWithNumericData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
     {
-        auto & column_data = static_cast<VectorType &>(*internal_column).getData();
+        auto & column_data = static_cast<VectorType &>(internal_column).getData();
         column_data.reserve(arrow_column->length());

         for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@@ -78,10 +79,10 @@ namespace DB
     /// Inserts chars and offsets right into internal column data to reduce an overhead.
     /// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars.
     /// Also internal strings are null terminated.
-    static void fillColumnWithStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
+    static void fillColumnWithStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
     {
-        PaddedPODArray<UInt8> & column_chars_t = assert_cast<ColumnString &>(*internal_column).getChars();
-        PaddedPODArray<UInt64> & column_offsets = assert_cast<ColumnString &>(*internal_column).getOffsets();
+        PaddedPODArray<UInt8> & column_chars_t = assert_cast<ColumnString &>(internal_column).getChars();
+        PaddedPODArray<UInt64> & column_offsets = assert_cast<ColumnString &>(internal_column).getOffsets();

         size_t chars_t_size = 0;
         for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
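A worked example of the offset shift described above (illustrative, not part of the commit): for the two strings "ab" and "c",

    Arrow:      chars = "abc"       offsets = [0, 2, 3]
    ClickHouse: chars = "ab\0c\0"   offsets = [3, 5]

i.e. each ClickHouse offset points one past the string's terminating zero, so the last offset equals the total size of the chars buffer.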
@@ -116,9 +117,9 @@ namespace DB
         }
     }

-    static void fillColumnWithBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
+    static void fillColumnWithBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
     {
-        auto & column_data = assert_cast<ColumnVector<UInt8> &>(*internal_column).getData();
+        auto & column_data = assert_cast<ColumnVector<UInt8> &>(internal_column).getData();
         column_data.reserve(arrow_column->length());

         for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@@ -133,9 +134,9 @@ namespace DB
     }

     /// Arrow stores Parquet::DATE in Int32, while ClickHouse stores Date in UInt16. Therefore, it should be checked before saving
-    static void fillColumnWithDate32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
+    static void fillColumnWithDate32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
     {
-        PaddedPODArray<UInt16> & column_data = assert_cast<ColumnVector<UInt16> &>(*internal_column).getData();
+        PaddedPODArray<UInt16> & column_data = assert_cast<ColumnVector<UInt16> &>(internal_column).getData();
         column_data.reserve(arrow_column->length());

         for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@@ -148,7 +149,7 @@ namespace DB
                 if (days_num > DATE_LUT_MAX_DAY_NUM)
                 {
                     // TODO: will it rollback correctly?
-                    throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + internal_column->getName()
+                    throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + internal_column.getName()
                                         + "\" is greater than "
                                           "max allowed Date value, which is "
                                         + std::to_string(DATE_LUT_MAX_DAY_NUM),
@@ -161,9 +162,9 @@ namespace DB
     }

     /// Arrow stores Parquet::DATETIME in Int64, while ClickHouse stores DateTime in UInt32. Therefore, it should be checked before saving
-    static void fillColumnWithDate64Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
+    static void fillColumnWithDate64Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
     {
-        auto & column_data = assert_cast<ColumnVector<UInt32> &>(*internal_column).getData();
+        auto & column_data = assert_cast<ColumnVector<UInt32> &>(internal_column).getData();
         column_data.reserve(arrow_column->length());

         for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@@ -177,9 +178,9 @@ namespace DB
         }
     }

-    static void fillColumnWithTimestampData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
+    static void fillColumnWithTimestampData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
     {
-        auto & column_data = assert_cast<ColumnVector<UInt32> &>(*internal_column).getData();
+        auto & column_data = assert_cast<ColumnVector<UInt32> &>(internal_column).getData();
         column_data.reserve(arrow_column->length());

         for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@@ -213,9 +214,9 @@ namespace DB
         }
     }

-    static void fillColumnWithDecimalData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
+    static void fillColumnWithDecimalData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
     {
-        auto & column = assert_cast<ColumnDecimal<Decimal128> &>(*internal_column);
+        auto & column = assert_cast<ColumnDecimal<Decimal128> &>(internal_column);
         auto & column_data = column.getData();
         column_data.reserve(arrow_column->length());

@@ -230,9 +231,9 @@ namespace DB
     }

     /// Creates a null bytemap from arrow's null bitmap
-    static void fillByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & bytemap)
+    static void fillByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & bytemap)
     {
-        PaddedPODArray<UInt8> & bytemap_data = assert_cast<ColumnVector<UInt8> &>(*bytemap).getData();
+        PaddedPODArray<UInt8> & bytemap_data = assert_cast<ColumnVector<UInt8> &>(bytemap).getData();
         bytemap_data.reserve(arrow_column->length());

         for (size_t chunk_i = 0; chunk_i != static_cast<size_t>(arrow_column->num_chunks()); ++chunk_i)
@@ -244,6 +245,143 @@ namespace DB
         }
     }

+    static void fillOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & offsets)
+    {
+        ColumnArray::Offsets & offsets_data = assert_cast<ColumnVector<UInt64> &>(offsets).getData();
+        offsets_data.reserve(arrow_column->length());
+
+        for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
+        {
+            arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
+            auto arrow_offsets_array = list_chunk.offsets();
+            auto & arrow_offsets = static_cast<arrow::Int32Array &>(*arrow_offsets_array);
+            auto start = offsets_data.back();
+            for (int64_t i = 1; i < arrow_offsets.length(); ++i)
+                offsets_data.emplace_back(start + arrow_offsets.Value(i));
+        }
+    }
+
+    static void readColumnFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column, const std::string & column_name, const std::string format_name, bool is_nullable)
+    {
+        if (internal_column.isNullable())
+        {
+            ColumnNullable & column_nullable = typeid_cast<ColumnNullable &>(internal_column);
+            readColumnFromArrowColumn(arrow_column, column_nullable.getNestedColumn(), column_name, format_name, true);
+            fillByteMapFromArrowColumn(arrow_column, column_nullable.getNullMapColumn());
+            return;
+        }
+
+        // TODO: check if a column is const?
+        if (!is_nullable && !checkColumn<ColumnArray>(internal_column) && arrow_column->null_count())
+        {
+            throw Exception
+            {
+                "Can not insert NULL data into non-nullable column \"" + column_name + "\"",
+                ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN
+            };
+        }
+
+        switch (arrow_column->type()->id())
+        {
+            case arrow::Type::STRING:
+            case arrow::Type::BINARY:
+            //case arrow::Type::FIXED_SIZE_BINARY:
+                fillColumnWithStringData(arrow_column, internal_column);
+                break;
+            case arrow::Type::BOOL:
+                fillColumnWithBooleanData(arrow_column, internal_column);
+                break;
+            case arrow::Type::DATE32:
+                fillColumnWithDate32Data(arrow_column, internal_column);
+                break;
+            case arrow::Type::DATE64:
+                fillColumnWithDate64Data(arrow_column, internal_column);
+                break;
+            case arrow::Type::TIMESTAMP:
+                fillColumnWithTimestampData(arrow_column, internal_column);
+                break;
+            case arrow::Type::DECIMAL:
+                //fillColumnWithNumericData<Decimal128, ColumnDecimal<Decimal128>>(arrow_column, read_column); // Have problems with trash values under NULL, but faster
+                fillColumnWithDecimalData(arrow_column, internal_column /*, internal_nested_type*/);
+                break;
+            case arrow::Type::LIST:
+            {
+                const auto * list_type = static_cast<arrow::ListType *>(arrow_column->type().get());
+                auto list_nested_type = list_type->value_type();
+
+                arrow::ArrayVector array_vector;
+                array_vector.reserve(arrow_column->num_chunks());
+                for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
+                {
+                    arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
+                    std::shared_ptr<arrow::Array> chunk = list_chunk.values();
+                    array_vector.emplace_back(std::move(chunk));
+                }
+                auto arrow_nested_column = std::make_shared<arrow::ChunkedArray>(array_vector);
+
+                ColumnArray & column_array = typeid_cast<ColumnArray &>(internal_column);
+                readColumnFromArrowColumn(arrow_nested_column, column_array.getData(), column_name, format_name, false);
+                fillOffsetsFromArrowListColumn(arrow_column, column_array.getOffsetsColumn());
+                break;
+            }
+# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
+            case ARROW_NUMERIC_TYPE: \
+                fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, internal_column); \
+                break;
+
+            FOR_ARROW_NUMERIC_TYPES(DISPATCH)
+# undef DISPATCH
+            // TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds?
+            // TODO: read JSON as a string?
+            // TODO: read UUID as a string?
+            default:
+                throw Exception
+                {
+                    "Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \""
+                        + column_name + "\"",
+                    ErrorCodes::UNKNOWN_TYPE
+                };
+        }
+    }
+
+    static DataTypePtr getInternalType(std::shared_ptr<arrow::DataType> arrow_type, const DataTypePtr & column_type, const std::string & column_name, const std::string & format_name)
+    {
+        if (column_type->isNullable())
+        {
+            DataTypePtr nested_type = typeid_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
+            return makeNullable(getInternalType(arrow_type, nested_type, column_name, format_name));
+        }
+
+        if (arrow_type->id() == arrow::Type::DECIMAL)
+        {
+            const auto * decimal_type = static_cast<arrow::DecimalType *>(arrow_type.get());
+            return std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(), decimal_type->scale());
+        }
+
+        if (arrow_type->id() == arrow::Type::LIST)
+        {
+            const auto * list_type = static_cast<arrow::ListType *>(arrow_type.get());
+            auto list_nested_type = list_type->value_type();
+
+            const DataTypeArray * array_type = typeid_cast<const DataTypeArray *>(column_type.get());
+            if (!array_type)
+                throw Exception{"Cannot convert arrow LIST type to a not Array ClickHouse type " + column_type->getName(), ErrorCodes::CANNOT_CONVERT_TYPE};
+
+            return std::make_shared<DataTypeArray>(getInternalType(list_nested_type, array_type->getNestedType(), column_name, format_name));
+        }
+
+        if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(),
+                                                         [=](auto && elem) { return elem.first == arrow_type->id(); });
+            internal_type_it != arrow_type_to_internal_type.end())
+        {
+            return DataTypeFactory::instance().get(internal_type_it->second);
+        }
+
+        throw Exception
+        {
+            "The type \"" + arrow_type->name() + "\" of an input column \"" + column_name + "\" is not supported for conversion from a " + format_name + " data format",
+            ErrorCodes::CANNOT_CONVERT_TYPE
+        };
+    }
+
     void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table,
                                                     const Block & header, std::string format_name)
     {
@@ -272,98 +410,15 @@ namespace DB
                                 ErrorCodes::THERE_IS_NO_COLUMN};

             std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[header_column.name];
-
-            arrow::Type::type arrow_type = arrow_column->type()->id();
-
-            // TODO: check if a column is const?
-            if (!column_type->isNullable() && arrow_column->null_count())
-            {
-                throw Exception{"Can not insert NULL data into non-nullable column \"" + header_column.name + "\"",
-                                ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN};
-            }
-
-            const bool target_column_is_nullable = column_type->isNullable() || arrow_column->null_count();
-
-            DataTypePtr internal_nested_type;
-
-            if (arrow_type == arrow::Type::DECIMAL)
-            {
-                const auto * decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
-                internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(),
-                                                                                     decimal_type->scale());
-            }
-            else if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(),
-                                                                  [=](auto && elem) { return elem.first == arrow_type; });
-                     internal_type_it != arrow_type_to_internal_type.end())
-            {
-                internal_nested_type = DataTypeFactory::instance().get(internal_type_it->second);
-            }
-            else
-            {
-                throw Exception{"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + header_column.name
-                                    + "\" is not supported for conversion from a " + format_name + " data format",
-                                ErrorCodes::CANNOT_CONVERT_TYPE};
-            }
-
-            const DataTypePtr internal_type = target_column_is_nullable ? makeNullable(internal_nested_type)
-                                                                        : internal_nested_type;
+            DataTypePtr internal_type = getInternalType(arrow_column->type(), column_type, header_column.name, format_name);
+
+            MutableColumnPtr read_column = internal_type->createColumn();
+            readColumnFromArrowColumn(arrow_column, *read_column, header_column.name, format_name, false);

             ColumnWithTypeAndName column;
             column.name = header_column.name;
             column.type = internal_type;
-
-            /// Data
-            MutableColumnPtr read_column = internal_nested_type->createColumn();
-            switch (arrow_type)
-            {
-                case arrow::Type::STRING:
-                case arrow::Type::BINARY:
-                //case arrow::Type::FIXED_SIZE_BINARY:
-                    fillColumnWithStringData(arrow_column, read_column);
-                    break;
-                case arrow::Type::BOOL:
-                    fillColumnWithBooleanData(arrow_column, read_column);
-                    break;
-                case arrow::Type::DATE32:
-                    fillColumnWithDate32Data(arrow_column, read_column);
-                    break;
-                case arrow::Type::DATE64:
-                    fillColumnWithDate64Data(arrow_column, read_column);
-                    break;
-                case arrow::Type::TIMESTAMP:
-                    fillColumnWithTimestampData(arrow_column, read_column);
-                    break;
-                case arrow::Type::DECIMAL:
-                    //fillColumnWithNumericData<Decimal128, ColumnDecimal<Decimal128>>(arrow_column, read_column); // Have problems with trash values under NULL, but faster
-                    fillColumnWithDecimalData(arrow_column, read_column /*, internal_nested_type*/);
-                    break;
-# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
-                case ARROW_NUMERIC_TYPE: \
-                    fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, read_column); \
-                    break;
-
-                FOR_ARROW_NUMERIC_TYPES(DISPATCH)
-# undef DISPATCH
-                // TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds?
-                // TODO: read JSON as a string?
-                // TODO: read UUID as a string?
-                default:
-                    throw Exception
-                    {
-                        "Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \""
-                            + header_column.name + "\"",
-                        ErrorCodes::UNKNOWN_TYPE
-                    };
-            }
-
-            if (column.type->isNullable())
-            {
-                MutableColumnPtr null_bytemap = DataTypeUInt8().createColumn();
-                fillByteMapFromArrowColumn(arrow_column, null_bytemap);
-                column.column = ColumnNullable::create(std::move(read_column), std::move(null_bytemap));
-            }
-            else
-                column.column = std::move(read_column);
+            column.column = std::move(read_column);

             column.column = castColumn(column, header_column.type);
@@ -5,12 +5,16 @@
 #include <Columns/ColumnFixedString.h>
 #include <Columns/ColumnNullable.h>
 #include <Columns/ColumnString.h>
+#include <Columns/ColumnArray.h>
 #include <Core/callOnTypeIndex.h>
 #include <DataTypes/DataTypeDateTime.h>
 #include <DataTypes/DataTypeNullable.h>
 #include <DataTypes/DataTypesDecimal.h>
+#include <DataTypes/DataTypeArray.h>
 #include <Processors/Formats/IOutputFormat.h>
 #include <arrow/api.h>
+#include <arrow/builder.h>
+#include <arrow/type.h>
 #include <arrow/util/decimal.h>
 #include <DataTypes/DataTypeLowCardinality.h>
@@ -47,12 +51,6 @@ namespace DB
         {"FixedString", arrow::utf8()},
     };

-    static const PaddedPODArray<UInt8> * extractNullBytemapPtr(ColumnPtr column)
-    {
-        ColumnPtr null_column = assert_cast<const ColumnNullable &>(*column).getNullMapColumnPtr();
-        const PaddedPODArray<UInt8> & null_bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
-        return &null_bytemap;
-    }
-
     static void checkStatus(const arrow::Status & status, const String & column_name, const String & format_name)
     {
@@ -63,12 +61,14 @@ namespace DB
     template <typename NumericType, typename ArrowBuilderType>
     static void fillArrowArrayWithNumericColumnData(
         ColumnPtr write_column,
-        std::shared_ptr<arrow::Array> & arrow_array,
         const PaddedPODArray<UInt8> * null_bytemap,
-        const String & format_name)
+        const String & format_name,
+        arrow::ArrayBuilder* array_builder,
+        size_t start,
+        size_t end)
     {
         const PaddedPODArray<NumericType> & internal_data = assert_cast<const ColumnVector<NumericType> &>(*write_column).getData();
-        ArrowBuilderType builder;
+        ArrowBuilderType & builder = assert_cast<ArrowBuilderType &>(*array_builder);
         arrow::Status status;

         const UInt8 * arrow_null_bytemap_raw_ptr = nullptr;
@@ -76,38 +76,75 @@ namespace DB
         if (null_bytemap)
         {
             /// Invert values since Arrow interprets 1 as a non-null value, while CH as a null
-            arrow_null_bytemap.reserve(null_bytemap->size());
-            for (auto is_null : *null_bytemap)
-                arrow_null_bytemap.emplace_back(!is_null);
+            arrow_null_bytemap.reserve(end - start);
+            for (size_t i = start; i < end; ++i)
+                arrow_null_bytemap.template emplace_back(!(*null_bytemap)[i]);

             arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data();
         }

         if constexpr (std::is_same_v<NumericType, UInt8>)
             status = builder.AppendValues(
-                reinterpret_cast<const uint8_t *>(internal_data.data()),
-                internal_data.size(),
+                reinterpret_cast<const uint8_t *>(internal_data.data() + start),
+                end - start,
                 reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
         else
-            status = builder.AppendValues(internal_data.data(), internal_data.size(), reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
+            status = builder.AppendValues(internal_data.data() + start, end - start, reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
         checkStatus(status, write_column->getName(), format_name);
-
-        status = builder.Finish(&arrow_array);
-        checkStatus(status, write_column->getName(), format_name);
     }
+
+    static void fillArrowArray(
+        const String & column_name,
+        ColumnPtr & column,
+        const std::shared_ptr<const IDataType> & column_type,
+        const PaddedPODArray<UInt8> * null_bytemap,
+        arrow::ArrayBuilder * array_builder,
+        String format_name,
+        size_t start,
+        size_t end);
+
+    static void fillArrowArrayWithArrayColumnData(
+        const String & column_name,
+        ColumnPtr & column,
+        const std::shared_ptr<const IDataType> & column_type,
+        const PaddedPODArray<UInt8> * null_bytemap,
+        arrow::ArrayBuilder * array_builder,
+        String format_name,
+        size_t start,
+        size_t end)
+    {
+        const auto * column_array = static_cast<const ColumnArray *>(column.get());
+        ColumnPtr nested_column = column_array->getDataPtr();
+        DataTypePtr nested_type = typeid_cast<const DataTypeArray *>(column_type.get())->getNestedType();
+        const auto & offsets = column_array->getOffsets();
+
+        arrow::ListBuilder & builder = assert_cast<arrow::ListBuilder &>(*array_builder);
+        arrow::ArrayBuilder * value_builder = builder.value_builder();
+        arrow::Status components_status;
+
+        for (size_t array_idx = start; array_idx < end; ++array_idx)
+        {
+            /// Start new array
+            components_status = builder.Append();
+            checkStatus(components_status, nested_column->getName(), format_name);
+            fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx]);
+        }
+    }

     template <typename ColumnType>
     static void fillArrowArrayWithStringColumnData(
         ColumnPtr write_column,
-        std::shared_ptr<arrow::Array> & arrow_array,
         const PaddedPODArray<UInt8> * null_bytemap,
-        const String & format_name)
+        const String & format_name,
+        arrow::ArrayBuilder* array_builder,
+        size_t start,
+        size_t end)
     {
         const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
-        arrow::StringBuilder builder;
+        arrow::StringBuilder & builder = assert_cast<arrow::StringBuilder &>(*array_builder);
         arrow::Status status;

-        for (size_t string_i = 0, size = internal_column.size(); string_i < size; ++string_i)
+        for (size_t string_i = start; string_i < end; ++string_i)
         {
             if (null_bytemap && (*null_bytemap)[string_i])
             {
@@ -121,23 +158,22 @@ namespace DB
             checkStatus(status, write_column->getName(), format_name);
         }
-
-        status = builder.Finish(&arrow_array);
-        checkStatus(status, write_column->getName(), format_name);
     }

     static void fillArrowArrayWithDateColumnData(
         ColumnPtr write_column,
-        std::shared_ptr<arrow::Array> & arrow_array,
         const PaddedPODArray<UInt8> * null_bytemap,
-        const String & format_name)
+        const String & format_name,
+        arrow::ArrayBuilder* array_builder,
+        size_t start,
+        size_t end)
     {
         const PaddedPODArray<UInt16> & internal_data = assert_cast<const ColumnVector<UInt16> &>(*write_column).getData();
         //arrow::Date32Builder date_builder;
-        arrow::UInt16Builder builder;
+        arrow::UInt16Builder & builder = assert_cast<arrow::UInt16Builder &>(*array_builder);
         arrow::Status status;

-        for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i)
+        for (size_t value_i = start; value_i < end; ++value_i)
         {
             if (null_bytemap && (*null_bytemap)[value_i])
                 status = builder.AppendNull();
@@ -146,23 +182,22 @@ namespace DB
                 status = builder.Append(internal_data[value_i]);
             checkStatus(status, write_column->getName(), format_name);
         }
-
-        status = builder.Finish(&arrow_array);
-        checkStatus(status, write_column->getName(), format_name);
     }

     static void fillArrowArrayWithDateTimeColumnData(
         ColumnPtr write_column,
-        std::shared_ptr<arrow::Array> & arrow_array,
         const PaddedPODArray<UInt8> * null_bytemap,
-        const String & format_name)
+        const String & format_name,
+        arrow::ArrayBuilder* array_builder,
+        size_t start,
+        size_t end)
     {
         const auto & internal_data = assert_cast<const ColumnVector<UInt32> &>(*write_column).getData();
         //arrow::Date64Builder builder;
-        arrow::UInt32Builder builder;
+        arrow::UInt32Builder & builder = assert_cast<arrow::UInt32Builder &>(*array_builder);
         arrow::Status status;

-        for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i)
+        for (size_t value_i = start; value_i < end; ++value_i)
         {
             if (null_bytemap && (*null_bytemap)[value_i])
                 status = builder.AppendNull();
@@ -173,24 +208,98 @@ namespace DB
             checkStatus(status, write_column->getName(), format_name);
         }
-
-        status = builder.Finish(&arrow_array);
-        checkStatus(status, write_column->getName(), format_name);
     }
+
+    static void fillArrowArray(
+        const String & column_name,
+        ColumnPtr & column,
+        const std::shared_ptr<const IDataType> & column_type,
+        const PaddedPODArray<UInt8> * null_bytemap,
+        arrow::ArrayBuilder * array_builder,
+        String format_name,
+        size_t start,
+        size_t end)
+    {
+        const String column_type_name = column_type->getFamilyName();
+
+        if ("Nullable" == column_type_name)
+        {
+            const ColumnNullable * column_nullable = checkAndGetColumn<ColumnNullable>(column.get());
+            ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
+            DataTypePtr nested_type = typeid_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
+            const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
+            const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
+            fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end);
+        }
+        else if ("String" == column_type_name)
+        {
+            fillArrowArrayWithStringColumnData<ColumnString>(column, null_bytemap, format_name, array_builder, start, end);
+        }
+        else if ("FixedString" == column_type_name)
+        {
+            fillArrowArrayWithStringColumnData<ColumnFixedString>(column, null_bytemap, format_name, array_builder, start, end);
+        }
+        else if ("Date" == column_type_name)
+        {
+            fillArrowArrayWithDateColumnData(column, null_bytemap, format_name, array_builder, start, end);
+        }
+        else if ("DateTime" == column_type_name)
+        {
+            fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end);
+        }
+        else if ("Array" == column_type_name)
+        {
+            fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end);
+        }
+        else if (isDecimal(column_type))
+        {
+            auto fill_decimal = [&](const auto & types) -> bool
+            {
+                using Types = std::decay_t<decltype(types)>;
+                using ToDataType = typename Types::LeftType;
+                if constexpr (
+                    std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>>
+                    || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
+                    || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
+                {
+                    fillArrowArrayWithDecimalColumnData<ToDataType>(column, null_bytemap, array_builder, format_name, start, end);
+                }
+                return false;
+            };
+            callOnIndexAndDataType<void>(column_type->getTypeId(), fill_decimal);
+        }
+#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
+        else if (#CPP_NUMERIC_TYPE == column_type_name) \
+        { \
+            fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(column, null_bytemap, format_name, array_builder, start, end); \
+        }
+
+        FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
+#undef DISPATCH
+        else
+        {
+            throw Exception
+            {
+                "Internal type \"" + column_type_name + "\" of a column \"" + column_name + "\" is not supported for conversion into a " + format_name + " data format",
+                ErrorCodes::UNKNOWN_TYPE
+            };
+        }
+    }

     template <typename DataType>
     static void fillArrowArrayWithDecimalColumnData(
         ColumnPtr write_column,
-        std::shared_ptr<arrow::Array> & arrow_array,
         const PaddedPODArray<UInt8> * null_bytemap,
-        const DataType * decimal_type,
-        const String & format_name)
+        arrow::ArrayBuilder * array_builder,
+        const String & format_name,
+        size_t start,
+        size_t end)
     {
         const auto & column = static_cast<const typename DataType::ColumnType &>(*write_column);
-        arrow::DecimalBuilder builder(arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()));
+        arrow::DecimalBuilder & builder = assert_cast<arrow::DecimalBuilder &>(*array_builder);
         arrow::Status status;

-        for (size_t value_i = 0, size = column.size(); value_i < size; ++value_i)
+        for (size_t value_i = start; value_i < end; ++value_i)
        {
             if (null_bytemap && (*null_bytemap)[value_i])
                 status = builder.AppendNull();
@@ -200,10 +309,63 @@ namespace DB
             checkStatus(status, write_column->getName(), format_name);
         }
-
-        status = builder.Finish(&arrow_array);
         checkStatus(status, write_column->getName(), format_name);
     }
+
+    static std::shared_ptr<arrow::DataType> getArrowType(DataTypePtr column_type, const std::string & column_name, const std::string & format_name, bool * is_column_nullable)
+    {
+        if (column_type->isNullable())
+        {
+            DataTypePtr nested_type = typeid_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
+            auto arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable);
+            *is_column_nullable = true;
+            return arrow_type;
+        }
+
+        if (isDecimal(column_type))
+        {
+            std::shared_ptr<arrow::DataType> arrow_type;
+            const auto create_arrow_type = [&](const auto & types) -> bool {
+                using Types = std::decay_t<decltype(types)>;
+                using ToDataType = typename Types::LeftType;
+                if constexpr (
+                    std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>>
+                    || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
+                    || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
+                {
+                    const auto & decimal_type = static_cast<const ToDataType *>(column_type.get());
+                    arrow_type = arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale());
+                }
+                return false;
+            };
+            callOnIndexAndDataType<void>(column_type->getTypeId(), create_arrow_type);
+            return arrow_type;
+        }
+
+        if (isArray(column_type))
+        {
+            auto nested_type = typeid_cast<const DataTypeArray *>(column_type.get())->getNestedType();
+            auto nested_arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable);
+            return arrow::list(nested_arrow_type);
+        }
+
+        const std::string type_name = column_type->getFamilyName();
+        if (const auto * arrow_type_it = std::find_if(
+                internal_type_to_arrow_type.begin(),
+                internal_type_to_arrow_type.end(),
+                [=](auto && elem) { return elem.first == type_name; });
+            arrow_type_it != internal_type_to_arrow_type.end())
+        {
+            return arrow_type_it->second;
+        }
+
+        throw Exception{"The type \"" + type_name + "\" of a column \"" + column_name + "\""
+                            " is not supported for conversion into a " + format_name + " data format",
+                        ErrorCodes::UNKNOWN_TYPE};
+    }

     void CHColumnToArrowColumn::chChunkToArrowTable(
         std::shared_ptr<arrow::Table> & res,
         const Block & header,
@@ -224,101 +386,20 @@ namespace DB
             column.column = recursiveRemoveLowCardinality(chunk.getColumns()[column_i]);
             column.type = recursiveRemoveLowCardinality(column.type);

-            const bool is_column_nullable = column.type->isNullable();
-            const auto & column_nested_type
-                = is_column_nullable ? static_cast<const DataTypeNullable *>(column.type.get())->getNestedType() : column.type;
-            const String column_nested_type_name = column_nested_type->getFamilyName();
+            bool is_column_nullable = false;
+            auto arrow_type = getArrowType(column.type, column.name, format_name, &is_column_nullable);
+            arrow_fields.emplace_back(std::make_shared<arrow::Field>(column.name, arrow_type, is_column_nullable));

-            if (isDecimal(column_nested_type))
-            {
-                const auto add_decimal_field = [&](const auto & types) -> bool {
-                    using Types = std::decay_t<decltype(types)>;
-                    using ToDataType = typename Types::LeftType;
-                    if constexpr (
-                        std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>>
-                        || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
-                        || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
-                    {
-                        const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
-                        arrow_fields.emplace_back(std::make_shared<arrow::Field>(
-                            column.name, arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()), is_column_nullable));
-                    }
-                    return false;
-                };
-                callOnIndexAndDataType<void>(column_nested_type->getTypeId(), add_decimal_field);
-            }
-            else
-            {
-                if (const auto * arrow_type_it = std::find_if(internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.end(),
-                                                              [=](auto && elem) { return elem.first == column_nested_type_name; });
-                    arrow_type_it != internal_type_to_arrow_type.end())
-                {
-                    arrow_fields.emplace_back(std::make_shared<arrow::Field>(column.name, arrow_type_it->second, is_column_nullable));
-                }
-                else
-                {
-                    throw Exception{"The type \"" + column_nested_type_name + "\" of a column \"" + column.name + "\""
-                                        " is not supported for conversion into a " + format_name + " data format",
-                                    ErrorCodes::UNKNOWN_TYPE};
-                }
-            }
+            arrow::MemoryPool* pool = arrow::default_memory_pool();
+            std::unique_ptr<arrow::ArrayBuilder> array_builder;
+            arrow::Status status = MakeBuilder(pool, arrow_fields[column_i]->type(), &array_builder);
+            checkStatus(status, column.column->getName(), format_name);

-            ColumnPtr nested_column
-                = is_column_nullable ? assert_cast<const ColumnNullable &>(*column.column).getNestedColumnPtr() : column.column;
-            const PaddedPODArray<UInt8> * null_bytemap = is_column_nullable ? extractNullBytemapPtr(column.column) : nullptr;
+            fillArrowArray(column.name, column.column, column.type, nullptr, array_builder.get(), format_name, 0, column.column->size());

             std::shared_ptr<arrow::Array> arrow_array;
-
-            if ("String" == column_nested_type_name)
-            {
-                fillArrowArrayWithStringColumnData<ColumnString>(nested_column, arrow_array, null_bytemap, format_name);
-            }
-            else if ("FixedString" == column_nested_type_name)
-            {
-                fillArrowArrayWithStringColumnData<ColumnFixedString>(nested_column, arrow_array, null_bytemap, format_name);
-            }
-            else if ("Date" == column_nested_type_name)
-            {
-                fillArrowArrayWithDateColumnData(nested_column, arrow_array, null_bytemap, format_name);
-            }
-            else if ("DateTime" == column_nested_type_name)
-            {
-                fillArrowArrayWithDateTimeColumnData(nested_column, arrow_array, null_bytemap, format_name);
-            }
-            else if (isDecimal(column_nested_type))
-            {
-                auto fill_decimal = [&](const auto & types) -> bool
-                {
-                    using Types = std::decay_t<decltype(types)>;
-                    using ToDataType = typename Types::LeftType;
-                    if constexpr (
-                        std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>>
-                        || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
-                        || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
-                    {
-                        const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
-                        fillArrowArrayWithDecimalColumnData(nested_column, arrow_array, null_bytemap, decimal_type, format_name);
-                    }
-                    return false;
-                };
-                callOnIndexAndDataType<void>(column_nested_type->getTypeId(), fill_decimal);
-            }
-#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
-            else if (#CPP_NUMERIC_TYPE == column_nested_type_name) \
-            { \
-                fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(nested_column, arrow_array, null_bytemap, format_name); \
-            }
-
-            FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
-#undef DISPATCH
-            else
-            {
-                throw Exception{"Internal type \"" + column_nested_type_name + "\" of a column \"" + column.name + "\""
-                                    " is not supported for conversion into a " + format_name + " data format",
-                                ErrorCodes::UNKNOWN_TYPE};
-            }
+            status = array_builder->Finish(&arrow_array);
+            checkStatus(status, column.column->getName(), format_name);

             arrow_arrays.emplace_back(std::move(arrow_array));
         }
@@ -67,6 +67,14 @@ void ORCBlockInputFormat::resetParser()
     stripe_current = 0;
 }

+size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
+{
+    if (type->id() == arrow::Type::LIST)
+        return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + 1;
+
+    return 1;
+}
+
 void ORCBlockInputFormat::prepareReader()
 {
     THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(in), arrow::default_memory_pool(), &file_reader));
@@ -76,11 +84,16 @@ void ORCBlockInputFormat::prepareReader()
     std::shared_ptr<arrow::Schema> schema;
     THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema));

+    int index = 0;
     for (int i = 0; i < schema->num_fields(); ++i)
     {
         if (getPort().getHeader().has(schema->field(i)->name()))
         {
-            include_indices.push_back(i+1);
+            /// LIST type requires 2 indices, so we should recursively
+            /// count the number of indices we need for this type.
+            int indexes_count = countIndicesForType(schema->field(i)->type());
+            for (int j = 0; j != indexes_count; ++j)
+                include_indices.push_back(index++);
         }
     }
 }
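A quick worked example of the recursion (illustrative, not part of the commit):

    countIndicesForType(int32)             == 1
    countIndicesForType(list<int32>)       == 2
    countIndicesForType(list<list<int32>>) == 3

so a single header column of a doubly nested list reserves three consecutive entries in include_indices.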
@@ -834,7 +834,7 @@ void Pipe::transform(const Transformer & transformer)
     if (collected_processors)
     {
-        for (const auto & processor : processors)
+        for (const auto & processor : new_processors)
             collected_processors->emplace_back(processor);
     }
@@ -31,7 +31,7 @@ static ITransformingStep::Traits getTraits(const ActionsDAGPtr & actions)
 ExpressionStep::ExpressionStep(const DataStream & input_stream_, ActionsDAGPtr actions_dag_)
     : ITransformingStep(
         input_stream_,
-        Transform::transformHeader(input_stream_.header, std::make_shared<ExpressionActions>(actions_dag_, ExpressionActionsSettings{})),
+        ExpressionTransform::transformHeader(input_stream_.header, *actions_dag_),
         getTraits(actions_dag_))
     , actions_dag(std::move(actions_dag_))
 {
@@ -42,8 +42,7 @@ ExpressionStep::ExpressionStep(const DataStream & input_stream_, ActionsDAGPtr a
 void ExpressionStep::updateInputStream(DataStream input_stream, bool keep_header)
 {
     Block out_header = keep_header ? std::move(output_stream->header)
-                                   : Transform::transformHeader(input_stream.header,
-                                                                std::make_shared<ExpressionActions>(actions_dag, ExpressionActionsSettings{}));
+                                   : ExpressionTransform::transformHeader(input_stream.header, *actions_dag);

     output_stream = createOutputStream(
         input_stream,
         std::move(out_header),
@@ -58,7 +57,7 @@ void ExpressionStep::transformPipeline(QueryPipeline & pipeline, const BuildQuer
     auto expression = std::make_shared<ExpressionActions>(actions_dag, settings.getActionsSettings());
     pipeline.addSimpleTransform([&](const Block & header)
     {
-        return std::make_shared<Transform>(header, expression);
+        return std::make_shared<ExpressionTransform>(header, expression);
     });

     if (!blocksHaveEqualStructure(pipeline.getHeader(), output_stream->header))
@@ -14,7 +14,6 @@ class JoiningTransform;
 class ExpressionStep : public ITransformingStep
 {
 public:
-    using Transform = ExpressionTransform;
     explicit ExpressionStep(const DataStream & input_stream_, ActionsDAGPtr actions_dag_);
     String getName() const override { return "Expression"; }
@@ -34,7 +34,7 @@ FilterStep::FilterStep(
         input_stream_,
         FilterTransform::transformHeader(
             input_stream_.header,
-            std::make_shared<ExpressionActions>(actions_dag_, ExpressionActionsSettings{}),
+            *actions_dag_,
             filter_column_name_,
             remove_filter_column_),
         getTraits(actions_dag_))
@@ -52,7 +52,7 @@ void FilterStep::updateInputStream(DataStream input_stream, bool keep_header)
     if (keep_header)
         out_header = FilterTransform::transformHeader(
             input_stream.header,
-            std::make_shared<ExpressionActions>(actions_dag, ExpressionActionsSettings{}),
+            *actions_dag,
             filter_column_name,
             remove_filter_column);
@@ -37,7 +37,7 @@ TotalsHavingStep::TotalsHavingStep(
         input_stream_,
         TotalsHavingTransform::transformHeader(
             input_stream_.header,
-            (actions_dag_ ? std::make_shared<ExpressionActions>(actions_dag_, ExpressionActionsSettings{}) : nullptr),
+            actions_dag_.get(),
             final_),
         getTraits(!filter_column_.empty()))
     , overflow_row(overflow_row_)
@@ -3,16 +3,14 @@
 namespace DB
 {

-Block ExpressionTransform::transformHeader(Block header, const ExpressionActionsPtr & expression)
+Block ExpressionTransform::transformHeader(Block header, const ActionsDAG & expression)
 {
-    size_t num_rows = header.rows();
-    expression->execute(header, num_rows, true);
-    return header;
+    return expression.updateHeader(std::move(header));
 }

 ExpressionTransform::ExpressionTransform(const Block & header_, ExpressionActionsPtr expression_)
-    : ISimpleTransform(header_, transformHeader(header_, expression_), false)
+    : ISimpleTransform(header_, transformHeader(header_, expression_->getActionsDAG()), false)
     , expression(std::move(expression_))
 {
 }
@@ -7,6 +7,8 @@ namespace DB
 class ExpressionActions;
 using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;

+class ActionsDAG;
+
 /** Executes a certain expression over the block.
   * The expression consists of column identifiers from the block, constants, common functions.
   * For example: hits * 2 + 3, url LIKE '%yandex%'
@@ -21,7 +23,7 @@ public:
     String getName() const override { return "ExpressionTransform"; }

-    static Block transformHeader(Block header, const ExpressionActionsPtr & expression);
+    static Block transformHeader(Block header, const ActionsDAG & expression);

 protected:
     void transform(Chunk & chunk) override;
@@ -29,12 +29,11 @@ static void replaceFilterToConstant(Block & block, const String & filter_column_

 Block FilterTransform::transformHeader(
     Block header,
-    const ExpressionActionsPtr & expression,
+    const ActionsDAG & expression,
     const String & filter_column_name,
     bool remove_filter_column)
 {
-    size_t num_rows = header.rows();
-    expression->execute(header, num_rows);
+    header = expression.updateHeader(std::move(header));

     if (remove_filter_column)
         header.erase(filter_column_name);
@@ -50,7 +49,10 @@ FilterTransform::FilterTransform(
     String filter_column_name_,
     bool remove_filter_column_,
     bool on_totals_)
-    : ISimpleTransform(header_, transformHeader(header_, expression_, filter_column_name_, remove_filter_column_), true)
+    : ISimpleTransform(
+        header_,
+        transformHeader(header_, expression_->getActionsDAG(), filter_column_name_, remove_filter_column_),
+        true)
     , expression(std::move(expression_))
     , filter_column_name(std::move(filter_column_name_))
     , remove_filter_column(remove_filter_column_)
@@ -8,6 +8,8 @@ namespace DB
 class ExpressionActions;
 using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;

+class ActionsDAG;
+
 /** Implements WHERE, HAVING operations.
   * Takes an expression, which adds to the block one ColumnUInt8 column containing the filtering conditions.
   * The expression is evaluated and result chunks contain only the filtered rows.
@@ -22,7 +24,7 @@ public:
     static Block transformHeader(
         Block header,
-        const ExpressionActionsPtr & expression,
+        const ActionsDAG & expression,
         const String & filter_column_name,
         bool remove_filter_column);
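A concrete illustration of this contract (the query and column names are assumed, not taken from the commit):

    /// For `SELECT x FROM t WHERE x > 1` the filter expression appends a
    /// UInt8 column "greater(x, 1)" to each block; FilterTransform keeps
    /// only the rows where that column is non-zero, and transformHeader()
    /// erases it from the output header when remove_filter_column is true.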
@@ -28,15 +28,13 @@ void finalizeChunk(Chunk & chunk)
     chunk.setColumns(std::move(columns), num_rows);
 }

-Block TotalsHavingTransform::transformHeader(Block block, const ExpressionActionsPtr & expression, bool final)
+Block TotalsHavingTransform::transformHeader(Block block, const ActionsDAG * expression, bool final)
 {
     if (final)
         finalizeBlock(block);

-    size_t num_rows = block.rows();
-
     if (expression)
-        expression->execute(block, num_rows);
+        block = expression->updateHeader(std::move(block));

     return block;
 }
@@ -49,7 +47,7 @@ TotalsHavingTransform::TotalsHavingTransform(
     TotalsMode totals_mode_,
     double auto_include_threshold_,
     bool final_)
-    : ISimpleTransform(header, transformHeader(header, expression_, final_), true)
+    : ISimpleTransform(header, transformHeader(header, expression_ ? &expression_->getActionsDAG() : nullptr, final_), true)
     , overflow_row(overflow_row_)
     , expression(expression_)
     , filter_column_name(filter_column_)
@@ -12,6 +12,8 @@ using ArenaPtr = std::shared_ptr<Arena>;
 class ExpressionActions;
 using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;

+class ActionsDAG;
+
 enum class TotalsMode;

 /** Takes blocks after grouping, with non-finalized aggregate functions.
@@ -37,7 +39,7 @@ public:
     Status prepare() override;
     void work() override;

-    static Block transformHeader(Block block, const ExpressionActionsPtr & expression, bool final);
+    static Block transformHeader(Block block, const ActionsDAG * expression, bool final);

 protected:
     void transform(Chunk & chunk) override;
@@ -9,6 +9,16 @@
<settings>
<max_memory_usage>30000000000</max_memory_usage>
<!--
Because of the random distribution of data between threads, the number
of unique keys per thread might differ. This means that sometimes
we switch to two-level aggregation, and sometimes we don't, based on
the "bytes" threshold. Two-level aggregation turns out to be twice
as fast, because it merges aggregation states in multiple threads.
Lower the threshold here to avoid jitter. It is unclear whether it
would be beneficial to lower the default as well.
-->
<group_by_two_level_threshold_bytes>10000000</group_by_two_level_threshold_bytes>
</settings>
<substitutions>
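The same switch is controllable per query. A sketch of pinning single-level aggregation from SQL by raising both triggers far above the data size (the functional tests below use this exact pair of settings; as the names suggest, one threshold counts unique keys, the other bytes of aggregation state):

SET group_by_two_level_threshold = 1000000;
SET group_by_two_level_threshold_bytes = 100000000;
SELECT number % 100 AS k, count()
FROM numbers(1000000)
GROUP BY k
FORMAT Null;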


@@ -1,6 +1,13 @@
SET max_rows_to_group_by = 100000;
SET group_by_overflow_mode = 'any';
-- The 'any' overflow mode might select different values for two-level and
-- single-level GROUP BY, so we set big enough thresholds here to ensure that
-- the switch doesn't happen; we only use single-level GROUP BY and get a
-- predictable result.
SET group_by_two_level_threshold_bytes = 100000000;
SET group_by_two_level_threshold = 1000000;
SET totals_mode = 'after_having_auto';
SELECT dummy, count() GROUP BY dummy WITH TOTALS;


@@ -4,6 +4,13 @@ SET max_rows_to_group_by = 100000;
SET max_block_size = 100001;
SET group_by_overflow_mode = 'any';
-- The 'any' overflow mode might select different values for two-level and
-- single-level GROUP BY, so we set big enough thresholds here to ensure that
-- the switch doesn't happen; we only use single-level GROUP BY and get a
-- predictable result.
SET group_by_two_level_threshold_bytes = 100000000;
SET group_by_two_level_threshold = 1000000;
SELECT '**** totals_mode = after_having_auto';
SET totals_mode = 'after_having_auto';
SELECT intDiv(number, 2) AS k, count(), argMax(toString(number), number) FROM (SELECT number FROM system.numbers LIMIT 500000) GROUP BY k WITH TOTALS ORDER BY k LIMIT 10;


@@ -60,3 +60,15 @@ dest from null:
-108 108 -1016 1116 -1032 1132 -1064 1164 -1.032 -1.064 string-0 fixedstring\0\0\0\0 2001-02-03 2002-02-03 04:05:06
127 255 32767 65535 2147483647 4294967295 9223372036854775807 9223372036854775807 -1.032 -1.064 string-2 fixedstring-2\0\0 2004-06-07 2004-02-03 04:05:06
\N \N \N \N \N \N \N \N \N \N \N \N \N \N
1 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
1 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
2 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
2 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
1 [1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
1 [1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
2 [NULL] [NULL] [NULL]
2 [NULL] [NULL] [NULL]
3 [] [] []
3 [] [] []
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]


@@ -127,6 +127,7 @@ ${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_types1 ORDER BY int8 FORMAT
echo dest from null:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_types6 ORDER BY int8"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types5"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types6"
@@ -135,3 +136,33 @@ ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types1"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types2"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types3"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types4"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_arrays (id UInt32, a1 Array(Int8), a2 Array(UInt8), a3 Array(Int16), a4 Array(UInt16), a5 Array(Int32), a6 Array(UInt32), a7 Array(Int64), a8 Array(UInt64), a9 Array(String), a10 Array(FixedString(4)), a11 Array(Float32), a12 Array(Float64), a13 Array(Date), a14 Array(Datetime), a15 Array(Decimal(4, 2)), a16 Array(Decimal(10, 2)), a17 Array(Decimal(25, 2))) engine=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays VALUES (1, [1,-2,3], [1,2,3], [100, -200, 300], [100, 200, 300], [10000000, -20000000, 30000000], [10000000, 2000000, 3000000], [100000000000000, -200000000000, 3000000000000], [100000000000000, 20000000000000, 3000000000000], ['Some string', 'Some string', 'Some string'], ['0000', '1111', '2222'], [42.42, 424.2, 0.4242], [424242.424242, 4242042420.242424, 42], ['2000-01-01', '2001-01-01', '2002-01-01'], ['2000-01-01', '2001-01-01', '2002-01-01'], [0.2, 10.003, 4.002], [4.000000001, 10000.10000, 10000.100001], [1000000000.000000001123, 90.0000000010010101, 0101001.0112341001])"
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays VALUES (2, [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_arrays ORDER BY id"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_nullable_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_nullable_arrays (id UInt32, a1 Array(Nullable(UInt32)), a2 Array(Nullable(String)), a3 Array(Nullable(Decimal(4, 2)))) engine=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nullable_arrays VALUES (1, [1, Null, 2], [Null, 'Some string', Null], [0.001, Null, 42.42]), (2, [Null], [Null], [Null]), (3, [], [], [])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nullable_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nullable_arrays FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nullable_arrays ORDER BY id"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_nullable_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_nested_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_nested_arrays (a1 Array(Array(Array(UInt32))), a2 Array(Array(Array(String))), a3 Array(Array(Nullable(UInt32))), a4 Array(Array(Nullable(String)))) engine=Memory() "
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nested_arrays VALUES ([[[1,2,3], [1,2,3]], [[1,2,3]], [[], [1,2,3]]], [[['Some string', 'Some string'], []], [['Some string']], [[]]], [[Null, 1, 2], [Null], [1, 2], []], [['Some string', Null, 'Some string'], [Null], []])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nested_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nested_arrays FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nested_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_nested_arrays"


@@ -1,6 +1,10 @@
=== Try load data from alltypes_dictionary.parquet
0 1 0 0 0 0 0 0 01/01/09 0 1230768000
1 0 1 1 1 10 1.1 10.1 01/01/09 1 1230768060
=== Try load data from alltypes_list.parquet
[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
=== Try load data from alltypes_plain.parquet
4 1 0 0 0 0 0 0 03/01/09 0 1235865600
5 0 1 1 1 10 1.1 10.1 03/01/09 1 1235865660
@@ -13,6 +17,39 @@
=== Try load data from alltypes_plain.snappy.parquet
6 1 0 0 0 0 0 0 04/01/09 0 1238544000
7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060
=== Try load data from array_float.parquet
idx1 []
idx2 [10.2,8.2,7.2]
idx3 [10.2,8.2]
idx4 [10.2]
idx5 [10.2,8.2]
idx6 [10.2]
idx7 [10.2,8.2]
idx8 [10.2,8.2]
idx9 [10.2]
idx10 [10.2,8.2]
=== Try load data from array_int.parquet
idx1 [100,101,102]
idx2 [100,101]
idx3 [100,101,102,101]
idx4 [100]
idx5 [100,101]
idx6 [100,101]
idx7 [100,101]
idx8 [100,101]
idx9 [100,101,102]
idx10 [100,101,102]
=== Try load data from array_string.parquet
idx1 ['This','is','a','test']
idx2 ['cigarette','smoke']
idx3 ['the','grocery','clerks']
idx4 []
idx5 ['wants','to','get','out']
idx6 ['me','up?']
idx7 ['then','I','put','him','back']
idx8 ['make','a','man']
idx9 ['Which','Heaven','to','gaudy','day','denies']
idx10 ['This','is','a','test']
=== Try load data from binary.parquet
\0
 
@@ -225,8 +262,9 @@ Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Not y
23.00
24.00
=== Try load data from list_columns.parquet
-Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
+[1,2,3] ['abc','efg','hij']
[NULL,1] []
[4] ['efg',NULL,'hij','xyz']
=== Try load data from nation.dict-malformed.parquet
0 ALGERIA 0 haggle. carefully final deposits detect slyly agai
1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon
@@ -253,9 +291,12 @@ Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not
22 RUSSIA 3 requests against the platelets use never according to the quickly regular pint
23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull
24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
=== Try load data from nested_lists.parquet
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
=== Try load data from nested_lists.snappy.parquet
-Code: 70. DB::Ex---tion: The type "list" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
+[[['a','b'],['c']],[[],['d']]] 1
[[['a','b'],['c','d']],[[],['e']]] 1
[[['a','b'],['c','d'],['e']],[[],['f']]] 1
=== Try load data from nested_maps.snappy.parquet
Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
@@ -268,6 +309,10 @@ Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nullable.impala.parquet
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nullable_list.parquet
[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []
=== Try load data from nulls.snappy.parquet
Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin


@@ -0,0 +1,4 @@
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []


@@ -0,0 +1,17 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_arrays (arr1 Array(Int8), arr2 Array(UInt8), arr3 Array(Int16), arr4 Array(UInt16), arr5 Array(Int32), arr6 Array(UInt32), arr7 Array(Int64), arr8 Array(UInt64), arr9 Array(String), arr10 Array(FixedString(4)), arr11 Array(Float32), arr12 Array(Float64), arr13 Array(Date), arr14 Array(Datetime), arr15 Array(Decimal(4, 2)), arr16 Array(Decimal(10, 2)), arr17 Array(Decimal(25, 2))) ENGINE=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO orc_arrays VALUES ([1,-2,3],[1,2,3],[100,-200,300],[100,200,300],[10000000,-20000000,30000000],[10000000,2000000,3000000],[100000000000000,-200000000000,3000000000000],[100000000000000,20000000000000,3000000000000],['Some string','Some string','Some string'],['0000','1111','2222'],[42.42,424.2,0.4242],[424242.424242,4242042420.242424,42],['2000-01-01','2001-01-01','2002-01-01'],['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'],[0.20,10.00,4.00],[4.00,10000.10,10000.10],[1000000000.00,90.00,101001.01]),([],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_arrays FORMAT ORC" > "${CLICKHOUSE_TMP}"/arrays.orc
cat "${CLICKHOUSE_TMP}"/arrays.orc | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_arrays FORMAT ORC"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE orc_arrays"


@@ -0,0 +1,2 @@
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]


@@ -0,0 +1,17 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_nested_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_nested_arrays (arr1 Array(Array(Array(UInt32))), arr2 Array(Array(Array(String))), arr3 Array(Array(Nullable(UInt32))), arr4 Array(Array(Nullable(String)))) engine=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO orc_nested_arrays VALUES ([[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]],[[['Some string','Some string'],[]],[['Some string']],[[]]],[[NULL,1,2],[NULL],[1,2],[]],[['Some string',NULL,'Some string'],[NULL],[]])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_nested_arrays FORMAT ORC" > "${CLICKHOUSE_TMP}"/nested_arrays.orc
cat "${CLICKHOUSE_TMP}"/nested_arrays.orc | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_nested_arrays FORMAT ORC"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_nested_arrays"
${CLICKHOUSE_CLIENT} --query="DROP table orc_nested_arrays"


@@ -0,0 +1,6 @@
[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []
[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []


@@ -0,0 +1,15 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_nullable_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_nullable_arrays (arr1 Array(Nullable(UInt32)), arr2 Array(Nullable(String)), arr3 Array(Nullable(Decimal(4, 2)))) ENGINE=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO orc_nullable_arrays VALUES ([1,NULL,2],[NULL,'Some string',NULL],[0.00,NULL,42.42]), ([NULL],[NULL],[NULL]), ([],[],[])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_nullable_arrays FORMAT ORC" > "${CLICKHOUSE_TMP}"/nullable_arrays.orc
cat "${CLICKHOUSE_TMP}"/nullable_arrays.orc | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_nullable_arrays FORMAT ORC"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_nullable_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE orc_nullable_arrays"


@@ -0,0 +1,4 @@
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']
[] [] [] [] [] [] [] [] [] [] [] [] [] []
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']
[] [] [] [] [] [] [] [] [] [] [] [] [] []


@@ -0,0 +1,17 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_arrays (arr1 Array(Int8), arr2 Array(UInt8), arr3 Array(Int16), arr4 Array(UInt16), arr5 Array(Int32), arr6 Array(UInt32), arr7 Array(Int64), arr8 Array(UInt64), arr9 Array(String), arr10 Array(FixedString(4)), arr11 Array(Float32), arr12 Array(Float64), arr13 Array(Date), arr14 Array(Datetime)) ENGINE=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_arrays VALUES ([1,-2,3],[1,2,3],[100,-200,300],[100,200,300],[10000000,-20000000,30000000],[10000000,2000000,3000000],[100000000000000,-200000000000,3000000000000],[100000000000000,20000000000000,3000000000000],['Some string','Some string','Some string'],['0000','1111','2222'],[42.42,424.2,0.4242],[424242.424242,4242042420.242424,42],['2000-01-01','2001-01-01','2002-01-01'],['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']),([],[],[],[],[],[],[],[],[],[],[],[],[],[])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_arrays FORMAT Arrow" > "${CLICKHOUSE_TMP}"/arrays.arrow
cat "${CLICKHOUSE_TMP}"/arrays.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_arrays FORMAT Arrow"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_arrays"


@@ -0,0 +1,2 @@
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]


@@ -0,0 +1,17 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_nested_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_nested_arrays (arr1 Array(Array(Array(UInt32))), arr2 Array(Array(Array(String))), arr3 Array(Array(Nullable(UInt32))), arr4 Array(Array(Nullable(String)))) engine=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_nested_arrays VALUES ([[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]],[[['Some string','Some string'],[]],[['Some string']],[[]]],[[NULL,1,2],[NULL],[1,2],[]],[['Some string',NULL,'Some string'],[NULL],[]])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nested_arrays FORMAT Arrow" > "${CLICKHOUSE_TMP}"/nested_arrays.arrow
cat "${CLICKHOUSE_TMP}"/nested_arrays.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_nested_arrays FORMAT Arrow"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nested_arrays"
${CLICKHOUSE_CLIENT} --query="DROP table arrow_nested_arrays"


@@ -0,0 +1,6 @@
[1,NULL,2] [NULL,'Some string',NULL] [0,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []
[1,NULL,2] [NULL,'Some string',NULL] [0,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []


@@ -0,0 +1,15 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_nullable_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_nullable_arrays (arr1 Array(Nullable(UInt32)), arr2 Array(Nullable(String)), arr3 Array(Nullable(Float32))) ENGINE=Memory()"
${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_nullable_arrays VALUES ([1,NULL,2],[NULL,'Some string',NULL],[0.00,NULL,42.42]), ([NULL],[NULL],[NULL]), ([],[],[])"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nullable_arrays FORMAT Arrow" > "${CLICKHOUSE_TMP}"/nullable_arrays.arrow
cat "${CLICKHOUSE_TMP}"/nullable_arrays.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_nullable_arrays FORMAT Arrow"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nullable_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_nullable_arrays"


@@ -0,0 +1,31 @@
(Expression)
ExpressionTransform
(SettingQuotaAndLimits)
(Expression)
ExpressionTransform
(MergingFinal)
ReplacingSorted 2 → 1
(Expression)
ExpressionTransform × 2
(ReadFromMergeTree)
MergeTree × 2 0 → 1
0 0
1 1
2 2
3 3
4 4
5 5
6 6
(Expression)
ExpressionTransform × 2
(SettingQuotaAndLimits)
(Expression)
ExpressionTransform × 2
(MergingFinal)
ReplacingSorted × 2 2 → 1
Copy × 2 1 → 2
AddingSelector × 2
(Expression)
ExpressionTransform × 2
(ReadFromMergeTree)
MergeTree × 2 0 → 1


@@ -0,0 +1,10 @@
DROP TABLE IF EXISTS test;
CREATE TABLE test(a Int, b Int) Engine=ReplacingMergeTree order by a;
INSERT INTO test select number, number from numbers(5);
INSERT INTO test select number, number from numbers(5,2);
set max_threads = 1;
explain pipeline select * from test final;
select * from test final;
set max_threads = 2;
explain pipeline select * from test final;
DROP TABLE test;


@@ -224,6 +224,7 @@
01305_polygons_union
01306_polygons_intersection
01702_system_query_log
01710_projection_fetch
01759_optimize_skip_unused_shards_zero_shards
01780_clickhouse_dictionary_source_loop
01790_dist_INSERT_block_structure_mismatch_types_and_names


@@ -0,0 +1 @@
`a1` Array(Int8), `a2` Array(UInt8), `a3` Array(Int16), `a4` Array(UInt16), `a5` Array(Int32), `a6` Array(UInt32), `a7` Array(Int64), `a8` Array(UInt64), `a9` Array(String), `a10` Array(FixedString(4)), `a11` Array(Float32), `a12` Array(Float64), `a13` Array(Date), `a14` Array(Datetime), `a15` Array(Decimal(4, 2)), `a16` Array(Decimal(10, 2)), `a17` Array(Decimal(25, 2))

Binary file not shown.


@@ -0,0 +1 @@
idx String, lst Array(Float32)

Binary file not shown.


@@ -0,0 +1 @@
`idx` String, `lst` Array(Int32)

Binary file not shown.


@@ -0,0 +1 @@
`idx` String, `lst` Array(String)


@@ -1 +1 @@
-`int64_list` Nullable(Int64), `utf8_list` Nullable(String)
+`int64_list` Array(Nullable(Int64)), `utf8_list` Array(Nullable(String))


@@ -0,0 +1 @@
`a1` Array(Array(Array(UInt32))), `a2` Array(Array(Array(String))), `a3` Array(Array(Nullable(UInt32))), `a4` Array(Array(Nullable(String)))


@@ -1 +1 @@
-`a` Nullable(String), `b` Nullable(Int32)
+`a` Array(Array(Array(Nullable(String)))), `b` Nullable(Int32)


@@ -0,0 +1 @@
`a1` Array(Nullable(UInt32)), `a2` Array(Nullable(String)), `a3` Array(Nullable(Decimal(4, 2)))