diff --git a/.gitmodules b/.gitmodules index be44e3268e3..0a7a6b4a3f9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -103,7 +103,7 @@ url = https://github.com/ClickHouse-Extras/fastops [submodule "contrib/orc"] path = contrib/orc - url = https://github.com/apache/orc + url = https://github.com/ClickHouse-Extras/orc [submodule "contrib/sparsehash-c11"] path = contrib/sparsehash-c11 url = https://github.com/sparsehash/sparsehash-c11.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d37fe182f9..8987082db30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,6 @@ #### Upgrade Notes -* One bug has been found after release: [#25187](https://github.com/ClickHouse/ClickHouse/issues/25187). -* Do not upgrade if you have partition key with `UUID`. * `zstd` compression library is updated to v1.5.0. You may get messages about "checksum does not match" in replication. These messages are expected due to update of compression algorithm and you can ignore them. These messages are informational and do not indicate any kinds of undesired behaviour. * The setting `compile_expressions` is enabled by default. Although it has been heavily tested on variety of scenarios, if you find some undesired behaviour on your servers, you can try turning this setting off. * Values of `UUID` type cannot be compared with integer. For example, instead of writing `uuid != 0` type `uuid != '00000000-0000-0000-0000-000000000000'`. diff --git a/README.md b/README.md index 5677837815c..21eda470f49 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,4 @@ ClickHouse® is an open-source column-oriented database management system that a * You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person. ## Upcoming Events -* [SF Bay Area ClickHouse Community Meetup (online)](https://www.meetup.com/San-Francisco-Bay-Area-ClickHouse-Meetup/events/278144089/) on 16 June 2021. +* [China ClickHouse Community Meetup (online)](http://hdxu.cn/rhbfZ) on 26 June 2021. diff --git a/base/glibc-compatibility/glibc-compatibility.c b/base/glibc-compatibility/glibc-compatibility.c index d4bb739a72c..e3f62b7948a 100644 --- a/base/glibc-compatibility/glibc-compatibility.c +++ b/base/glibc-compatibility/glibc-compatibility.c @@ -8,13 +8,6 @@ extern "C" { #endif -#include - -size_t __pthread_get_minstack(const pthread_attr_t * attr) -{ - return 1048576; /// This is a guess. Don't sure it is correct. -} - #include #include #include @@ -141,6 +134,8 @@ int __open_2(const char *path, int oflag) } +#include + /// No-ops. int pthread_setname_np(pthread_t thread, const char *name) { return 0; } int pthread_getname_np(pthread_t thread, char *name, size_t len) { name[0] = '\0'; return 0; }; diff --git a/base/mysqlxx/Query.cpp b/base/mysqlxx/Query.cpp index d4514c3e988..c0d5c20fdfd 100644 --- a/base/mysqlxx/Query.cpp +++ b/base/mysqlxx/Query.cpp @@ -2,7 +2,7 @@ #include #include #else -#include +#include //Y_IGNORE #include #endif diff --git a/base/mysqlxx/ya.make b/base/mysqlxx/ya.make new file mode 100644 index 00000000000..aabc9922e72 --- /dev/null +++ b/base/mysqlxx/ya.make @@ -0,0 +1,39 @@ +# This file is generated automatically, do not edit. See 'ya.make.in' and use 'utils/generate-ya-make' to regenerate it. 
+LIBRARY() + +OWNER(g:clickhouse) + +CFLAGS(-g0) + +PEERDIR( + contrib/restricted/boost/libs + contrib/libs/libmysql_r + contrib/libs/poco/Foundation + contrib/libs/poco/Util +) + +ADDINCL( + GLOBAL clickhouse/base + clickhouse/base + contrib/libs/libmysql_r +) + +NO_COMPILER_WARNINGS() + +NO_UTIL() + +SRCS( + Connection.cpp + Exception.cpp + Pool.cpp + PoolFactory.cpp + PoolWithFailover.cpp + Query.cpp + ResultBase.cpp + Row.cpp + UseQueryResult.cpp + Value.cpp + +) + +END() diff --git a/base/mysqlxx/ya.make.in b/base/mysqlxx/ya.make.in new file mode 100644 index 00000000000..10755078e20 --- /dev/null +++ b/base/mysqlxx/ya.make.in @@ -0,0 +1,28 @@ +LIBRARY() + +OWNER(g:clickhouse) + +CFLAGS(-g0) + +PEERDIR( + contrib/restricted/boost/libs + contrib/libs/libmysql_r + contrib/libs/poco/Foundation + contrib/libs/poco/Util +) + +ADDINCL( + GLOBAL clickhouse/base + clickhouse/base + contrib/libs/libmysql_r +) + +NO_COMPILER_WARNINGS() + +NO_UTIL() + +SRCS( + +) + +END() diff --git a/base/ya.make b/base/ya.make index 9f4cf0fd4a7..19a16044280 100644 --- a/base/ya.make +++ b/base/ya.make @@ -4,6 +4,7 @@ RECURSE( common daemon loggers + mysqlxx pcg-random widechar_width readpassphrase diff --git a/contrib/NuRaft b/contrib/NuRaft index 2a1bf7d87b4..976874b7aa7 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 2a1bf7d87b4a03561fc66fbb49cee8a288983c5d +Subproject commit 976874b7aa7f422bf4ea595bb7d1166c617b1c26 diff --git a/contrib/arrow b/contrib/arrow index 616b3dc76a0..debf751a129 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 616b3dc76a0c8450b4027ded8a78e9619d7c845f +Subproject commit debf751a129bdda9ff4d1e895e08957ff77000a1 diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index deefb244beb..2237be9913a 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -188,6 +188,7 @@ set(ARROW_SRCS "${LIBRARY_DIR}/array/util.cc" "${LIBRARY_DIR}/array/validate.cc" + "${LIBRARY_DIR}/compute/api_aggregate.cc" "${LIBRARY_DIR}/compute/api_scalar.cc" "${LIBRARY_DIR}/compute/api_vector.cc" "${LIBRARY_DIR}/compute/cast.cc" @@ -198,8 +199,11 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc" "${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc" + "${LIBRARY_DIR}/compute/kernels/aggregate_quantile.cc" + "${LIBRARY_DIR}/compute/kernels/aggregate_tdigest.cc" "${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc" "${LIBRARY_DIR}/compute/kernels/codegen_internal.cc" + "${LIBRARY_DIR}/compute/kernels/hash_aggregate.cc" "${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc" "${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc" @@ -243,6 +247,7 @@ set(ARROW_SRCS "${LIBRARY_DIR}/io/interfaces.cc" "${LIBRARY_DIR}/io/memory.cc" "${LIBRARY_DIR}/io/slow.cc" + "${LIBRARY_DIR}/io/transform.cc" "${LIBRARY_DIR}/tensor/coo_converter.cc" "${LIBRARY_DIR}/tensor/csf_converter.cc" @@ -256,11 +261,8 @@ set(ARROW_SRCS "${LIBRARY_DIR}/util/bitmap_builders.cc" "${LIBRARY_DIR}/util/bitmap_ops.cc" "${LIBRARY_DIR}/util/bpacking.cc" + "${LIBRARY_DIR}/util/cancel.cc" "${LIBRARY_DIR}/util/compression.cc" - "${LIBRARY_DIR}/util/compression_lz4.cc" - "${LIBRARY_DIR}/util/compression_snappy.cc" - "${LIBRARY_DIR}/util/compression_zlib.cc" - "${LIBRARY_DIR}/util/compression_zstd.cc" "${LIBRARY_DIR}/util/cpu_info.cc" "${LIBRARY_DIR}/util/decimal.cc" "${LIBRARY_DIR}/util/delimiting.cc" @@ -268,13 +270,14 @@ set(ARROW_SRCS "${LIBRARY_DIR}/util/future.cc" 
"${LIBRARY_DIR}/util/int_util.cc" "${LIBRARY_DIR}/util/io_util.cc" - "${LIBRARY_DIR}/util/iterator.cc" "${LIBRARY_DIR}/util/key_value_metadata.cc" "${LIBRARY_DIR}/util/logging.cc" "${LIBRARY_DIR}/util/memory.cc" + "${LIBRARY_DIR}/util/mutex.cc" "${LIBRARY_DIR}/util/string_builder.cc" "${LIBRARY_DIR}/util/string.cc" "${LIBRARY_DIR}/util/task_group.cc" + "${LIBRARY_DIR}/util/tdigest.cc" "${LIBRARY_DIR}/util/thread_pool.cc" "${LIBRARY_DIR}/util/time.cc" "${LIBRARY_DIR}/util/trie.cc" @@ -368,14 +371,14 @@ set(PARQUET_SRCS "${LIBRARY_DIR}/column_reader.cc" "${LIBRARY_DIR}/column_scanner.cc" "${LIBRARY_DIR}/column_writer.cc" - "${LIBRARY_DIR}/deprecated_io.cc" "${LIBRARY_DIR}/encoding.cc" - "${LIBRARY_DIR}/encryption.cc" - "${LIBRARY_DIR}/encryption_internal.cc" + "${LIBRARY_DIR}/encryption/encryption.cc" + "${LIBRARY_DIR}/encryption/encryption_internal.cc" + "${LIBRARY_DIR}/encryption/internal_file_decryptor.cc" + "${LIBRARY_DIR}/encryption/internal_file_encryptor.cc" + "${LIBRARY_DIR}/exception.cc" "${LIBRARY_DIR}/file_reader.cc" "${LIBRARY_DIR}/file_writer.cc" - "${LIBRARY_DIR}/internal_file_decryptor.cc" - "${LIBRARY_DIR}/internal_file_encryptor.cc" "${LIBRARY_DIR}/level_conversion.cc" "${LIBRARY_DIR}/level_comparison.cc" "${LIBRARY_DIR}/metadata.cc" @@ -385,6 +388,8 @@ set(PARQUET_SRCS "${LIBRARY_DIR}/properties.cc" "${LIBRARY_DIR}/schema.cc" "${LIBRARY_DIR}/statistics.cc" + "${LIBRARY_DIR}/stream_reader.cc" + "${LIBRARY_DIR}/stream_writer.cc" "${LIBRARY_DIR}/types.cc" "${GEN_LIBRARY_DIR}/parquet_constants.cpp" diff --git a/contrib/flatbuffers b/contrib/flatbuffers index 22e3ffc66d2..eb3f8279482 160000 --- a/contrib/flatbuffers +++ b/contrib/flatbuffers @@ -1 +1 @@ -Subproject commit 22e3ffc66d2d7d72d1414390aa0f04ffd114a5a1 +Subproject commit eb3f827948241ce0e701516f16cd67324802bce9 diff --git a/contrib/orc b/contrib/orc index 5981208e394..0a936f6bbdb 160000 --- a/contrib/orc +++ b/contrib/orc @@ -1 +1 @@ -Subproject commit 5981208e39447df84827f6a961d1da76bacb6078 +Subproject commit 0a936f6bbdb9303308973073f8623b5a8d82eae1 diff --git a/contrib/replxx b/contrib/replxx index 2b24f14594d..c81be6c68b1 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 2b24f14594d7606792b92544bb112a6322ba34d7 +Subproject commit c81be6c68b146f15f2096b7ef80e3f21fe27004c diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 2ebb9e0e0a5..1abfb0edbb3 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1591,6 +1591,18 @@ FORMAT PrettyCompactMonoBlock Default value: 0 +## distributed_push_down_limit (#distributed-push-down-limit} + +LIMIT will be applied on each shard separatelly. Usually you don't need to use it, since this will be done automatically if it is possible, i.e. for simple query SELECT FROM LIMIT. + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +!!! note "Note" + That with this setting the result of the query may be inaccurate. + ## optimize_skip_unused_shards_limit {#optimize-skip-unused-shards-limit} Limit for number of sharding key values, turns off `optimize_skip_unused_shards` if the limit is reached. diff --git a/docs/en/sql-reference/statements/attach.md b/docs/en/sql-reference/statements/attach.md index 01783e9cb2f..bebba01980e 100644 --- a/docs/en/sql-reference/statements/attach.md +++ b/docs/en/sql-reference/statements/attach.md @@ -56,4 +56,4 @@ Result: ATTACH TABLE name UUID '' (col1 Type1, ...) 
```
-It creates new table with provided structure and attaches data from table with the specified UUID.
\ No newline at end of file
+It creates new table with provided structure and attaches data from table with the specified UUID.
diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md
index a17c87d2326..1708d594641 100644
--- a/docs/en/sql-reference/statements/system.md
+++ b/docs/en/sql-reference/statements/system.md
@@ -38,6 +38,7 @@ The list of available `SYSTEM` statements:
- [START REPLICATION QUEUES](#query_language-system-start-replication-queues)
- [SYNC REPLICA](#query_language-system-sync-replica)
- [RESTART REPLICA](#query_language-system-restart-replica)
+- [RESTORE REPLICA](#query_language-system-restore-replica)
- [RESTART REPLICAS](#query_language-system-restart-replicas)

## RELOAD EMBEDDED DICTIONARIES {#query_language-system-reload-emdedded-dictionaries}
@@ -290,13 +291,60 @@ After running this statement the `[db.]replicated_merge_tree_family_table_name`

### RESTART REPLICA {#query_language-system-restart-replica}

-Provides possibility to reinitialize Zookeeper sessions state for `ReplicatedMergeTree` table, will compare current state with Zookeeper as source of true and add tasks to Zookeeper queue if needed
-Initialization replication quene based on ZooKeeper date happens in the same way as `ATTACH TABLE` statement. For a short time the table will be unavailable for any operations.
+Reinitializes the ZooKeeper session state for a `ReplicatedMergeTree` table, compares the current state with ZooKeeper as the source of truth, and adds tasks to the ZooKeeper queue if needed.
+Initialization of the replication queue based on ZooKeeper data happens in the same way as for the `ATTACH TABLE` statement. For a short time the table will be unavailable for any operations.

``` sql
SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name
```

+### RESTORE REPLICA {#query_language-system-restore-replica}
+
+Restores a replica if data is [possibly] present but ZooKeeper metadata is lost.
+
+Works only on readonly `ReplicatedMergeTree` tables.
+
+The query can be executed after:
+
+ - ZooKeeper root `/` loss.
+ - Replicas path `/replicas` loss.
+ - Individual replica path `/replicas/replica_name/` loss.
+
+The replica attaches locally found parts and sends info about them to ZooKeeper.
+Parts present on the replica before the metadata loss are not re-fetched from other replicas unless they are outdated
+(so restoring a replica does not mean re-downloading all data over the network).
+
+Caveat: parts in all states are moved to the `detached/` folder. Parts that were active (Committed) before the data loss are attached.
+
+#### Syntax
+
+```sql
+SYSTEM RESTORE REPLICA [db.]replicated_merge_tree_family_table_name [ON CLUSTER cluster_name]
+```
+
+Alternative syntax:
+
+```sql
+SYSTEM RESTORE REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name
+```
+
+#### Example
+
+```sql
+-- Creating table on multiple servers
+
+CREATE TABLE test(n UInt32)
+ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/', '{replica}')
+ORDER BY n PARTITION BY n % 10;
+
+INSERT INTO test SELECT * FROM numbers(1000);
+
+-- zookeeper_delete_path("/clickhouse/tables/test", recursive=True) <- root loss.
+
+SYSTEM RESTART REPLICA test; -- Table will attach as readonly because metadata is missing.
+SYSTEM RESTORE REPLICA test; -- Need to execute on every replica, another way: RESTORE REPLICA test ON CLUSTER cluster +``` + ### RESTART REPLICAS {#query_language-system-restart-replicas} Provides possibility to reinitialize Zookeeper sessions state for all `ReplicatedMergeTree` tables, will compare current state with Zookeeper as source of true and add tasks to Zookeeper queue if needed diff --git a/docs/zh/interfaces/formats.md b/docs/zh/interfaces/formats.md index 7537166a0bd..ef18679af12 100644 --- a/docs/zh/interfaces/formats.md +++ b/docs/zh/interfaces/formats.md @@ -5,11 +5,12 @@ toc_title: 输入/输出格式 # 输入/输出格式 {#formats} -ClickHouse可以接受和返回各种格式的数据。输入支持的格式可以用来解析提供给`INSERT`的数据,可以从文件备份表(如File, URL或HDFS)执行`SELECT`,或者读取外部字典。输出支持的格式可用于获取`SELECT`的结果,并支持执行`INSERT`文件的表中。 +ClickHouse可以接受和返回各种格式的数据。受支持的输入格式可用于提交给`INSERT`语句、从文件表(File,URL,HDFS或者外部目录)执行`SELECT`语句,受支持的输出格式可用于格式化`SELECT`语句的返回结果,或者通过`INSERT`写入到文件表。 + 以下是支持的格式: -| 格式 | 输入 | 输出 | +| 格式 | 输入 | 输出 | |-----------------------------------------------------------------------------------------|-------|--------| | [TabSeparated](#tabseparated) | ✔ | ✔ | | [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ | @@ -30,8 +31,8 @@ ClickHouse可以接受和返回各种格式的数据。输入支持的格式可 | [JSONCompactString](#jsoncompactstring) | ✗ | ✔ | | [JSONEachRow](#jsoneachrow) | ✔ | ✔ | | [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ | -| [JSONStringEachRow](#jsonstringeachrow) | ✔ | ✔ | -| [JSONStringEachRowWithProgress](#jsonstringeachrowwithprogress) | ✗ | ✔ | +| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ | +| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ | | [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ | | [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ | | [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ | @@ -49,7 +50,7 @@ ClickHouse可以接受和返回各种格式的数据。输入支持的格式可 | [Parquet](#data-format-parquet) | ✔ | ✔ | | [Arrow](#data-format-arrow) | ✔ | ✔ | | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | +| [ORC](#data-format-orc) | ✔ | ✔ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [Native](#native) | ✔ | ✔ | @@ -57,21 +58,25 @@ ClickHouse可以接受和返回各种格式的数据。输入支持的格式可 | [XML](#xml) | ✗ | ✔ | | [CapnProto](#capnproto) | ✔ | ✗ | | [LineAsString](#lineasstring) | ✔ | ✗ | +| [Regexp](#data-format-regexp) | ✔ | ✗ | +| [RawBLOB](#rawblob) | ✔ | ✔ | -您可以使用ClickHouse设置控制一些格式处理参数。更多详情设置请参考[设置](../operations/settings/settings.md) + +您可以使用ClickHouse设置一些格式化参数。更多详情设置请参考[设置](../operations/settings/settings.md) ## TabSeparated {#tabseparated} -在TabSeparated分隔格式中,数据按行写入。每行包含由制表符分隔的值。每个值后跟一个制表符,除了行中最后一个值后跟换行。在任何地方都采用严格的Unix换行。最后一行还必须在末尾包含换行。值以文本格式编写,不包含引号,并使用转义的特殊字符。 +在TabSeparated分隔格式中,数据按行写入。每行包含由制表符分隔的值,每个值后跟一个制表符,除了行中最后一个值,最后的值后面是一个换行符。在任何地方都采用严格的Unix换行(\n)。最后一行结束后必须再插入一个换行符。值以文本格式编写,不包含引号,并使用转义的特殊字符。 -这种格式也可以用`TSV`来表示。 +这种格式也被称为`TSV`。 -`TabSeparated`格式便于使用自定义程序和脚本处理数据。默认情况下,它在HTTP接口和命令行客户端的批处理模式中使用。这种格式还允许在不同dbms之间传输数据。例如,您可以从MySQL获取转储并将其上传到ClickHouse,反之亦然。 +`TabSeparated`格式便于其他的程序和脚本处理数据。默认情况下,HTTP接口和命令行客户端的批处理模式中会使用这个格式。这种格式还允许在不同dbms之间传输数据。例如,您可以从MySQL获取转储并将其上传到ClickHouse,反之亦然。 -`TabSeparated`格式支持输出total值(与TOTALS一起使用时)和extreme值(当`extreme`被设置为1时)。在这种情况下,total值和extreme值会在主数据后输出。主要结果、总值和极值之间用空行分隔。示例: +`TabSeparated`格式支持输出总计的结果(当SQL语句包含`WITH TOTALS`)和极值(当`extremes`被设置为1时)。在这种情况下,总计值和极值会在主数据后输出。主要结果、总值和极值之间用空行分隔。示例: ``` sql -SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY 
EventDate FORMAT TabSeparated`` +set extremes=1; +SELECT EventDate, count() AS c FROM test.hits_v1 GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated; ``` ``` text @@ -83,7 +88,7 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD 2014-03-22 1031592 2014-03-23 1046491 -1970-01-01 8873898 +0000-00-00 8873898 2014-03-17 1031592 2014-03-23 1406958 @@ -91,39 +96,41 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD ### 数据格式化 {#data-formatting} -整数是用十进制形式写的。数字可以在开头包含一个额外的`+`字符(解析时忽略,格式化时不记录)。非负数不能包含负号。在读取时,允许将空字符串解析为零,或者(对于有符号类型)将仅由一个负号组成的字符串解析为零。不符合相应数据类型的数字可以被解析为不同的数字,而不会出现错误消息。 +整数是用十进制形式写的。数字可以在开头包含一个额外的`+`字符(解析时忽略该符号,格式化时不记录该符号)。非负数不能包含负号。在读取时,允许将空字符串解析为零,或者(对于有符号类型)将'-'(仅有减号的字符串)解析为零。不符合相应数据类型的数字可能被解析为数值,而不会出现错误消息。 -浮点数以十进制形式书写。`.`号用作十进制分隔符。支持指数符号,如`inf`、`+inf`、`-inf`和`nan`。浮点数的条目可以以小数点开始或结束。 -在格式化期间,浮点数可能会丢失准确性。 +浮点数以十进制形式书写。用`.`作为小数点的符号。支持指数符号,如`inf`、`+inf`、`-inf`和`nan`。小数点前或后可以不出现数字(如123.或.123)。 +在格式化期间,浮点数精度可能会丢失。 在解析期间,并不严格要求读取与机器可以表示的最接近的数值。 -日期以YYYY-MM-DD格式编写,并以相同的格式解析,但使用任何字符作为分隔符。 -日期和时间以`YYYY-MM-DD hh:mm:ss`的格式书写,并以相同的格式解析,但使用任何字符作为分隔符。 -这一切都发生在客户端或服务器启动时的系统时区(取决于它们对数据的格式)。对于带有时间的日期,夏时制时间未指定。因此,如果转储在夏令时有时间,则转储不会明确地与数据匹配,解析将选择这两次中的一次。 -在读取操作期间,不正确的日期和具有时间的日期可以使用自然溢出或null日期和时间进行分析,而不会出现错误消息。 +日期以`YYYY-MM-DD`格式编写,并以相同的格式解析,但允许使用任何字符作为分隔符。 +日期和时间以`YYYY-MM-DD hh:mm:ss`的格式书写,并以相同的格式解析,但允许使用任何字符作为分隔符。 +时区采用客户端或服务器端时区(取决于谁对数据进行格式化)。对于带有时间的日期,没有是哦用夏时制时间。因此,如果导出的数据采用了夏时制,则实际入库的时间不一定与预期的时间对应,解析将根据解析动作发起方选择时间。 +在读取操作期间,不正确的日期和具有时间的日期可以自然溢出(如2021-01-32)或设置成空日期和时间,而不会出现错误消息。 -有个例外情况,Unix时间戳格式也支持用时间解析日期(如果它恰好由10个十进制数字组成)。其结果与时间区域无关。格式`YYYY-MM-DD hh:mm:ss`和`NNNNNNNNNN`是自动区分的。 +有个例外情况,时间解析也支持Unix时间戳(如果它恰好由10个十进制数字组成)。其结果与时区无关。格式`YYYY-MM-DD hh:mm:ss`和`NNNNNNNNNN`这两种格式会自动转换。 -字符串以反斜杠转义的特殊字符输出。下面的转义序列用于输出:`\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`。解析还支持`\a`、`\v`和`\xHH`(十六进制转义字符)和任何`\c`字符,其中`c`是任何字符(这些序列被转换为`c`)。因此,读取数据支持这样一种格式,即可以将换行符写成`\n`或`\`,或者写成换行符。例如,字符串`Hello world`在单词之间有换行符,而不是空格,可以用以下语法进行解析: +字符串输出时,特殊字符会自动转义。以下转义序列用于输出:`\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`。解析还支持`\a`、`\v`和`\xHH`(HH代表十六进制编码)和`\c`,其中`c`是任何字符(这些序列被转换为`c`)。因此,读取数据时,换行符可以写成`\n`或`\`。例如,如果要表示字符串`Hello world`中'Hello'与'world'中间的空格实际上是个换行符,可以写成下面的形式: ``` text Hello\nworld - +``` +等同于 +``` text Hello\ world ``` -第二种形式是支持的,因为MySQL读取tab-separated格式数据集的时候也会使用它。 +第二种形式也受支持,因为MySQL导出tab-separated格式的数据时使用这种格式。 -在TabSeparated分隔格式传递数据时需要转义的最小字符集:`Tab`、换行符(LF)和反斜杠。 +使用TabSeparated格式传递数据时至少需要转义以下特殊字符:制表符(\t)、换行符(\n)和反斜杠(\\)。 -只有一小部分符号被转义。您可以很容易地找到一个字符串值,而您的终端将在输出中不显示它。 +只有一小部分符号被转义。您可以很容易地找到一个能够破坏命令行终端输出的特殊字符。 -数组写在方括号内的逗号分隔值列表中。数组中的数字项按正常格式进行格式化。`Date`和`DateTime`类型用单引号表示。字符串使用与上面相同的转义规则在单引号中编写。 +数组用方括号包裹、逗号分隔的形式表示(例如`[11,22,33]`)。数组中的数字项按上述规则进行格式化。`日期`和`日期时间`类型用单引号包裹。字符串用单引号包裹,遵循上述转义规则。 -[NULL](../sql-reference/syntax.md)将输出为`\N`。 +[NULL](https://clickhouse.tech/docs/zh/sql-reference/syntax/)将输出为`\N`。 -[Nested](../sql-reference/data-types/nested-data-structures/nested.md)结构的每个元素都表示为数组。 +[Nested](https://clickhouse.tech/docs/zh/sql-reference/data-types/nested-data-structures/nested/)结构的每个元素都表示为数组。 示例: @@ -153,45 +160,45 @@ SELECT * FROM nestedt FORMAT TSV ## TabSeparatedRaw {#tabseparatedraw} -与`TabSeparated`格式的不同之处在于,写入的行没有转义。 -使用这种格式解析时,每个字段中不允许使用制表符或换行符。 +与`TabSeparated`格式的不同之处在于,写入的数据不会进行转义处理。 +使用这种格式解析时,每个字段中不允许出现制表符或换行符。 -这种格式也可以使用名称`TSVRaw`来表示。 +这种格式也被称为`TSVRaw`。 ## TabSeparatedWithNames {#tabseparatedwithnames} -与`TabSeparated`格式不同的是列名写在第一行。 -在解析过程中,第一行被完全忽略。不能使用列名来确定它们的位置或检查它们的正确性。 +不同于`TabSeparated`,列名会写在第一行。 
+在解析过程中,第一行被完全忽略。您不能依赖列名来确定它们的位置或检查它们的正确性。 (将来可能会添加对头行解析的支持。) -这种格式也可以使用名称`TSVWithNames`来表示。 +这种格式也被称为`TSVWithNames`。 ## TabSeparatedWithNamesAndTypes {#tabseparatedwithnamesandtypes} 与`TabSeparated`格式不同的是列名写在第一行,而列类型写在第二行。 在解析过程中,将完全忽略第一行和第二行。 -这种格式也可以使用名称`TSVWithNamesAndTypes`来表示。 +这种格式也被称为`TSVWithNamesAndTypes`。 ## Template {#format-template} 此格式允许指定带有占位符的自定义格式字符串,这些占位符用于指定转义规则。 -它使用设置`format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter`以及其他格式的一些设置(例如转义`JSON`时使用`output_format_json_quote_64bit_integers`) +它使用`format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter`以及其他格式的一些设置(例如转义`JSON`时使用`output_format_json_quote_64bit_integers`,具体请向下阅读) -设置`format_template_row`指定文件的路径,该文件包含以下语法的行格式字符串: +设置`format_template_row`用于指定行格式文件的路径,该格式文件包含行格式字符串,语法如下: -`delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, +`delimiter_i${column_i:serializeAs_i}delimiter_i${column_i:serializeAs_i} ... delimiter_i`, -其中,`delimiter_i`是值之间的分隔符(`$`符号可以转义为`$$`), -`column_i`是要选择或插入其值的列的名称或索引(如果为空,则跳过该列), +其中,`delimiter_i`是各值之间的分隔符(`$`符号可以转义为`$$`), +`column_i`是选择或插入值的列的名称或索引(如果为空,则跳过该列), `serializeAs_i`是列值的转义规则。支持以下转义规则: - `CSV`, `JSON`, `XML` (类似于相同名称的格式) - `Escaped` (类似于`TSV`) - `Quoted` (类似于`Values`) -- `Raw` (类似于`TSVRaw`) -- `None` +- `Raw` (不转义,类似于`TSVRaw`) +- `None` (不转义,具体请向下阅读) 如果省略了转义规则,那么将使用`None`。`XML`和`Raw`只适用于输出。 @@ -199,26 +206,26 @@ SELECT * FROM nestedt FORMAT TSV `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` -`SearchPhrase`、`c`和`price`列的值被转义为`quotation`、`Escaped`和`JSON`将分别在`Search phrase:`, `, count: `, `, ad price: $`和`;`分隔符之间打印(用于选择)或expected(用于插入)。例如: +`SearchPhrase`、`c`和`price`列的值遵循`Quoted`、`Escaped`和`JSON`转义规则,将分别在`Search phrase:`, `, count: `, `, ad price: $`和`;`分隔符之间打印(用于`SELECT`)或输入期望的值(用于`INSERT`)。例如: `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` -`format_template_rows_between_delimiter`设置指定行之间的分隔符,它将打印(或expected)在每一行之后,最后一行除外(默认为`\n`)。 +`format_template_rows_between_delimiter`设置指定行之间的分隔符,它将打印(或输入期望值)在每一行之后,最后一行除外(该设置默认值为`\n`)。 -设置`format_template_resultset`指定文件路径,该文件包含resultset的格式字符串。resultset的格式字符串与row的格式字符串具有相同的语法,允许指定前缀、后缀和打印一些附加信息的方法。它包含以下占位符而不是列名: +设置`format_template_resultset`指定结果集格式文件路径,该文件包含结果集的格式字符串。结果集的格式字符串与上述的行格式字符串具有相同的语法,并允许指定前缀、后缀,还提供打印一些附加信息的方法。该文件使用如下占位符,用于取代行格式字符串的列名的位置(即`column_i`): -- `data` `format_template_row`格式的数据行,由`format_template_rows_between_delimiter`分隔。此占位符必须是格式字符串中的第一个占位符。 -- `totals` `format_template_row`格式的总值(和WITH TOTALS一起使用) -- `min` `format_template_row`格式的最小值(当极值设置为1时) -- `max` `format_template_row`格式的最大值(当极值设置为1时) -- `rows` 输出行的总数 -- `rows_before_limit` 没有LIMIT的最小行数。仅当查询包含LIMIT时输出。如果查询包含GROUP BY,那么rows_before_limit_at_least就是没有LIMIT的确切行数。 -- `time` 请求执行时间(秒) -- `rows_read` 已读取的行数 -- `bytes_read` 已读取(未压缩)的字节数 +- `data` 代表遵循`format_template_row`格式的数据行,由`format_template_rows_between_delimiter`设置制定的字符分隔。此占位符必须是格式字符串中的第一个占位符。 +- `totals` 代表遵循`format_template_row`格式的数据行,该行用于代表结果的总计值(当SQL语句包含了`WITH TOTALS`) +- `min` 代表遵循`format_template_row`格式的数据行,该行用于代表结果的最小值(当`extremes`设置为1时) +- `max` 代表遵循`format_template_row`格式的数据行,该行用于代表结果的最大值(当`extremes`设置为1时) +- `rows` 代表输出行的总数 +- `rows_before_limit` 代表没有LIMIT限制的结果最小行数。仅当查询包含LIMIT时才输出此值。如果查询包含GROUP BY,那么`rows_before_limit_at_least`就是没有LIMIT的确切行数。 +- `time` 代表请求执行时间(秒) +- `rows_read` 代表已读取的行数 +- `bytes_read` 代表已读取(未压缩)的字节数 -占位符`data`、`totals`、`min`和`max`必须没有指定转义规则(或者必须显式指定`None`)。其余占位符可以指定任何转义规则。 -如果`format_template_resultset`设置为空字符串,则使用`${data}`作为默认值。 
+占位符`data`、`totals`、`min`和`max`不允许指定转义规则(允许显式指定`None`)。其余占位符可以指定任何转义规则。 +如果`format_template_resultset`设置为空,则使用`${data}`作为默认值。 对于insert查询,格式允许跳过某些列或某些字段的前缀或后缀(参见示例)。 Select示例: @@ -300,13 +307,13 @@ Some header\n${data}\nTotal rows: ${:CSV}\n Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} ``` -`PageViews`, `UserID`, `Duration`和`Sign` 内部占位符是表中列的名称。将忽略行中`Useless field`后面和后缀中`\nTotal rows:`之后的值。 +`PageViews`, `UserID`, `Duration`和`Sign` 占位符是表中列的名称。将忽略行中`Useless field`后面和后缀中`\nTotal rows:`之后的值。 输入数据中的所有分隔符必须严格等于指定格式字符串中的分隔符。 ## TemplateIgnoreSpaces {#templateignorespaces} 这种格式只适用于输入。 -类似于`Template`,但跳过输入流中分隔符和值之间的空白字符。但是,如果格式字符串包含空格字符,这些字符将会出现在输入流中。还允许指定空占位符(`${}`或`${:None}`)来将一些分隔符分割为单独的部分,以忽略它们之间的空格。这种占位符仅用于跳过空白字符。 +类似于`Template`,但跳过输入流中分隔符和值之间的空白字符。但是,如果格式字符串包含空格字符,这些字符将会出现在输入流中。也允许指定空占位符(`${}`或`${:None}`)来将一些分隔符分割为单独的部分,以忽略它们之间的空格。这种占位符仅用于跳过空白字符。 如果列的值在所有行的顺序相同,那么可以使用这种格式读取`JSON`。可以使用以下请求从格式为[JSON](#json)的输出示例中插入数据: ``` sql @@ -328,7 +335,7 @@ format_template_resultset = '/some/path/resultset.format', format_template_row = ## TSKV {#tskv} -类似于TabSeparated,但是输出的值是name=value格式。名称的转义方式与TabSeparated格式相同,=符号也是转义的。 +类似于TabSeparated,但是输出的值是name=value格式。名称的转义方式与TabSeparated格式相同,=符号也会被转义。 ``` text SearchPhrase= count()=8267016 @@ -343,7 +350,7 @@ SearchPhrase=curtain designs count()=1064 SearchPhrase=baku count()=1000 ``` -[NULL](../sql-reference/syntax.md)格式为`\N`。 +[NULL](../sql-reference/syntax.md)转化为`\N`。 ``` sql SELECT * FROM t_null FORMAT TSKV @@ -353,28 +360,29 @@ SELECT * FROM t_null FORMAT TSKV x=1 y=\N ``` -当有大量的小列时,这种格式是无效的,并且通常没有理由使用它。不过,就效率而言,它并不比JSONEachRow差。 -这种格式支持数据输出和解析。对于解析,不同列的值支持任何顺序。省略某些值是可以接受的——它们被视为与其默认值相等。在这种情况下,0和空白行被用作默认值。不支持在表中指定的复杂值作为缺省值。 +当有大量的小列时,这种格式效率十分低下,并且通常没有理由使用它。不过,就效率而言,它并不比JSONEachRow差。 +这种格式支持数据输出和解析。用于解析时,可以任意指定列的顺序,也可以省略某些列,那些列的值为该列的默认值,一般情况下为0或空白。不支持将可在表中可指定的复杂值设为默认值。 -解析允许存在不带等号或值的附加字段`tskv`。此字段被忽略。 +解析时允许出现后没有=的字段`tskv`。此字段会被忽略。 ## CSV {#csv} 按`,`分隔的数据格式([RFC](https://tools.ietf.org/html/rfc4180))。 -格式化时,行是用双引号括起来的。字符串中的双引号会以两个双引号输出,除此之外没有其他规则来做字符转义了。日期和时间也会以双引号包括。数字的输出不带引号。值由一个单独的字符隔开,这个字符默认是`,`。行使用Unix换行符(LF)分隔。数组序列化成CSV规则如下:首先将数组序列化为`TabSeparated`格式的字符串,然后将结果字符串用双引号包括输出到`CSV`。`CSV`格式的元组被序列化为单独的列(即它们在元组中的嵌套关系会丢失)。 +格式化时,每一行的值会用双引号括起,日期和时间也会以双引号包括。数字不用双引号括起,字符串中的双引号会以两个双引号输出,除此之外没有其他规则来做字符转义了。值由分隔符隔开,这个分隔符默认是`,`。每一行使用Unix换行符(LF,\n)分隔。 +数组序列化成CSV规则如下:首先将数组序列化为`TabSeparated`格式的字符串,然后将结果字符串用双引号括起后输出到`CSV`。`CSV`格式的元组被序列化为单独的列(即它们在元组中的嵌套关系会丢失)。 ``` bash $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv ``` -\* 默认情况下间隔符是`,` ,在[format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter)中可以了解更多分隔符配置。 +\* 默认情况下分隔符是`,` ,在[format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter)中可以了解更多分隔符配置。 -解析的时候,可以使用或不使用引号来解析所有值。支持双引号和单引号。行也可以不用引号排列。在这种情况下,它们被解析为逗号或换行符(`CR或`LF`)。在解析不带引号的行时,若违反`RFC`规则,会忽略前缀和结尾的空格和制表符。对于换行,全部支持Unix(LF),Windows(CR LF)和Mac OS Classic(CR LF)。 +解析的时候,值可以使用或不使用双引号或者单引号括起来。在这种情况下,每行通过分隔符或换行符(`CR`或`LF`)区分。违反`RFC`规则的是,在解析未用引号括起的行时,会忽略前缀和结尾的空格和制表符。对于换行符,Unix(LF,\n),Windows(CR LF\r\n)和Mac OS Classic(CR LF\t\n)都受支持。 -如果启用[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields),空的末尾加引号的输入值将替换为相应列的默认值。 +如果启用[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields),对应列如果存在未输入的空白,且没有用双引号括起,将用默认值替换。 
-`NULL`被格式化为`\N`或`NULL`或一个空的非引号字符串(详见配置[input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null)或[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields))。 +`NULL`被格式化为`\N`或`NULL`或一个不是引号的其他字符串(详见配置[input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null)或[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields))。 `CSV`格式支持输出总数和极值的方式与`TabSeparated`相同。 @@ -384,8 +392,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR ## CustomSeparated {#format-customseparated} -类似于[Template](#format-template), 但它打印或读取所有列和使用转义规则在设置`format_custom_escaping_rule`和分隔符设置`format_custom_field_delimiter`,`format_custom_row_before_delimiter`,`format_custom_row_after_delimiter`,`format_custom_row_between_delimiter`,`format_custom_result_before_delimiter`,`format_custom_result_after_delimiter`中,而不是从格式字符串。 -也有`CustomSeparatedIgnoreSpaces`格式,这是类似于`TemplateIgnoreSpaces`。 +类似于[Template](#format-template), 但它打印或读取所有列,并使用设置`format_custom_escaping_rule`和分隔符设置`format_custom_field_delimiter`,`format_custom_row_before_delimiter`,`format_custom_row_after_delimiter`,`format_custom_row_between_delimiter`,`format_custom_result_before_delimiter`,`format_custom_result_after_delimiter`的转义规则,而不是从格式字符串。 +也有`CustomSeparatedIgnoreSpaces`格式,这个类似于`TemplateIgnoreSpaces`。 ## JSON {#json} @@ -438,18 +446,17 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA } ``` -JSON与JavaScript兼容。为了确保这一点,一些字符被另外转义:斜线`/`被转义为`\/`; 替代的换行符`U+2028`和`U+2029`会打断一些浏览器解析,它们会被转义为`\uXXXX`。 ASCII控制字符被转义:退格,换页,换行,回车和水平制表符被替换为`\b`,`\f`,`\n`,`\r`,`\t` 作为使用`\uXXXX`序列的00-1F范围内的剩余字节。 无效的UTF-8序列更改为替换字符,因此输出文本将包含有效的UTF-8序列。 为了与JavaScript兼容,默认情况下,Int64和UInt64整数用双引号引起来。要除去引号,可以将配置参数`output_format_json_quote_64bit_integers`设置为0。 +JSON与JavaScript兼容。为了确保这一点,一些字符被另外转义:斜线`/`被转义为`\/`; 换行符`U+2028`和`U+2029`会打断一些浏览器的解析,它们会被转义为`\uXXXX`。 ASCII控制字符被转义:退格,换页,换行,回车和制表符被转义为`\b`,`\f`,`\n`,`\r`,`\t`。剩下的0x00-0x1F被转义成相应的`\uXXXX`序列。 无效的UTF-8序列替换为字符�,使得输出文本包含有效的UTF-8序列。 为了与JavaScript兼容,默认情况下,Int64和UInt64整数用双引号引起来。要除去引号,可以将配置参数`output_format_json_quote_64bit_integers`设置为0。 -`rows` – 结果输出的行数。 +`rows`代表结果输出的行数。 -`rows_before_limit_at_least`去掉 LIMIT过滤后的最小行总数。 只会在查询包含LIMIT条件时输出。 -若查询包含 GROUP BY,`rows_before_limit_at_least`就是去掉LIMIT后过滤后的准确行数。 +`rows_before_limit_at_least`代表去掉LIMIT过滤后的最小行总数。只会在查询包含LIMIT条件时输出。若查询包含 GROUP BY,`rows_before_limit_at_least`就是去掉LIMIT后过滤后的准确行数。 -`totals` – 总值 (当使用TOTALS条件时)。 +`totals` – 总值 (当指定`WITH TOTALS`时)。 -`extremes` – 极值(当extremes设置为1时)。 +`extremes` – 极值(当extremes设置为1时)。 -该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 +该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 ClickHouse支持[NULL](../sql-reference/syntax.md), 在JSON输出中显示为`null`。若要在输出中启用`+nan`、`-nan`、`+inf`、`-inf`值,请设置[output_format_json_quote_denormals](../operations/settings/settings.md#settings-output_format_json_quote_denormals)为1。 @@ -506,12 +513,13 @@ ClickHouse支持[NULL](../sql-reference/syntax.md), 在JSON输出中显示为`nu "rows_before_limit_at_least": 3 } ``` +注意range(5)的值。 ## JSONAsString {#jsonasstring} 在这种格式中,一个JSON对象被解释为一个值。如果输入有几个JSON对象(逗号分隔),它们将被解释为独立的行。 
-这种格式只能对具有单个字段类型的表进行解析[String](../sql-reference/data-types/string.md)。其余的列必须设置为[DEFAULT](../sql-reference/statements/create.md)或[MATERIALIZED](../sql-reference/statements/create.md),或者忽略。一旦将整个JSON对象收集为字符串,就可以使用[JSON函数](../sql-reference/functions/json-functions.md)运行它。 +这种格式只能针对有一列类型为[String](../sql-reference/data-types/string.md)的表。表中其余的列必须设置为[DEFAULT](../sql-reference/statements/create.md)或[MATERIALIZED](../sql-reference/statements/create.md),或者忽略。一旦将整个JSON对象收集为字符串,就可以使用[JSON函数](../sql-reference/functions/json-functions.md)运行它。 **示例** @@ -610,7 +618,7 @@ SELECT * FROM json_as_string; ## JSONCompactEachRow {#jsoncompacteachrow} ## JSONCompactStringEachRow {#jsoncompactstringeachrow} -使用这些格式时,ClickHouse会将行输出为分隔的、换行分隔的JSON值,但数据作为一个整体不是有效的JSON。 +使用这些格式时,ClickHouse会将行输出为用换行符分隔的JSON值,这些输出数据作为一个整体时,由于没有分隔符(,)因而不是有效的JSON文档。 ``` json {"some_int":42,"some_str":"hello","some_tuple":[1,"a"]} // JSONEachRow @@ -635,7 +643,7 @@ SELECT * FROM json_as_string; ## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes} ## JSONCompactStringEachRowWithNamesAndTypes {#jsoncompactstringeachrowwithnamesandtypes} -与`JSONCompactEachRow`/`JSONCompactStringEachRow`不同的是,其中列名和类型被写入前两行。 +与`JSONCompactEachRow`/`JSONCompactStringEachRow`不同的是,列名和类型被写入前两行。 ```json ["'hello'", "multiply(42, number)", "range(5)"] @@ -653,18 +661,18 @@ INSERT INTO UserActivity FORMAT JSONEachRow {"PageViews":5, "UserID":"4324182021 ClickHouse允许: -- 对象中key-value的任何顺序。 +- 以任意顺序排列列名,后跟对应的值。 - 省略一些值。 -ClickHouse忽略元素之间的空格和对象后面的逗号。您可以在一行中传递所有对象。你不需要用换行符把它们分开。 +ClickHouse忽略元素之间的空格和对象后面的逗号。您可以在一行中传递所有对象,不需要用换行符把它们分开。 **省略值处理** -ClickHouse将省略的值替换为对应的[data types](../sql-reference/data-types/index.md)默认值。 +ClickHouse将省略的值替换为对应的[数据类型](../sql-reference/data-types/index.md)默认值。 如果指定了`DEFAULT expr`,则ClickHouse根据属性使用不同的替换规则,详看[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)设置。 -参考下表: +参考下面的例子: ``` sql CREATE TABLE IF NOT EXISTS example_table @@ -678,7 +686,7 @@ CREATE TABLE IF NOT EXISTS example_table - 如果`input_format_defaults_for_omitted_fields = 1`, 那么`x`的默认值为`0`,但`a`的默认值为`x * 2`。 !!! 
note "注意" -当使用`insert_sample_with_metadata = 1`插入数据时,与使用`insert_sample_with_metadata = 0`插入数据相比,ClickHouse消耗更多的计算资源。 +当使用`insert_sample_with_metadata = 1`插入数据时,与使用`insert_sample_with_metadata = 0`相比,ClickHouse消耗更多的计算资源。 ### Selecting Data {#selecting-data} @@ -713,13 +721,13 @@ CREATE TABLE IF NOT EXISTS example_table CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory ``` -正如您在`Nested`数据类型描述中看到的,ClickHouse将嵌套结构的每个组件作为一个单独的列(`n.s`和`n.i`是我们的表)。您可以通过以下方式插入数据: +正如您在`Nested`数据类型描述中看到的,ClickHouse将嵌套结构的每个部分作为一个单独的列(`n.s`和`n.i`)。您可以通过以下方式插入数据: ``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} ``` -将数据作为分层JSON对象集插入[input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json)。 +将数据作为分层JSON对象集插入,需要设置[input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json)。 ``` json { @@ -764,7 +772,7 @@ SELECT * FROM json_each_row_nested ## Native {#native} -最高性能的格式。通过二进制格式的块进行写入和读取。对于每个块,该中的行数,列数,列名称和类型以及列的部分将被相继记录。 换句话说,这种格式是`columnar`的 - 它不会将列转换为行。这是用于在服务器之间进行交互的本地界面中使用的格式,用于使用命令行客户端和C++客户端。 +最高性能的格式。通过二进制格式的块进行写入和读取。对于每个块,该中的行数,列数,列名称和类型以及列的部分将被相继记录。换句话说,这种格式是`columnar`的 - 它不会将列转换为行。这种格式用于服务器间交互、命令行客户端和C++客户端与服务器交互。 您可以使用此格式快速生成只能由ClickHouse DBMS读取的格式。但自己处理这种格式是没有意义的。 @@ -777,23 +785,27 @@ SELECT * FROM json_each_row_nested 将数据以表格形式输出,也可以使用ANSI转义字符在终端中设置颜色。 它会绘制一个完整的表格,每行数据在终端中占用两行。 -每个结果块作为一个单独的表输出。这是必要的,以便在输出块时不需要缓冲结果(为了预先计算所有值的可见宽度,缓冲是必要的)。 +每个结果块作为一个单独的表输出。这是必要的,以便在输出块时不缓冲结果(为了预先计算所有值的可见宽度,缓冲是必要的)。 [NULL](../sql-reference/syntax.md)输出为`ᴺᵁᴸᴸ`。 -示例(显示[PrettyCompact](#prettycompact)格式) +示例 ``` sql -SELECT * FROM t_null +SELECT * FROM system.numbers limit 2 format Pretty; ``` ``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -└───┴──────┘ +┏━━━━━━━━┓ +┃ number ┃ +┡━━━━━━━━┩ +│ 0 │ +├────────┤ +│ 1 │ +└────────┘ ``` -行没有转义为Pretty\* 格式。示例显示了[PrettyCompact](#prettycompact)格式: +Pretty的所有格式不进行字符转义。示例显示了[PrettyCompact](#prettycompact)格式: ``` sql SELECT 'String with \'quotes\' and \t character' AS Escaping_test @@ -806,7 +818,7 @@ SELECT 'String with \'quotes\' and \t character' AS Escaping_test ``` 为避免将太多数据传输到终端,只打印前10,000行。 如果行数大于或等于10,000,则会显示消息`Showed first 10 000`。 -该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 +该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 Pretty格式支持输出合计值(当使用WITH TOTALS时)和极值(当`extremes`设置为1时)。在这些情况下,合计值和极值将输出在主要数据之后,在单独的表中。示例(显示为[PrettyCompact](#prettycompact)格式): @@ -841,10 +853,20 @@ Extremes: 与[Pretty](#pretty)格式不一样的是`PrettyCompact`去掉了行之间的表格分割线,这样使得结果更加紧凑。 这种格式会在交互命令行客户端下默认使用。 +``` sql +select * from system.numbers limit 2 format PrettyCompact; +``` +``` text +┌─number─┐ +│ 0 │ +│ 1 │ +└────────┘ + +``` ## PrettyCompactMonoBlock {#prettycompactmonoblock} -与[PrettyCompact](#prettycompact)格式不一样的是,它支持10,000行数据缓冲,然后输出在一个表格中,不会按照块来区分。 +与[PrettyCompact](#prettycompact)格式不一样的是,它支持10,000行数据缓冲,然后输出在一个表格中,而不分块。 ## PrettyNoEscapes {#prettynoescapes} @@ -866,17 +888,18 @@ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FOR 用法类似上述。 -### PrettyCompactNoEscapes {#prettycompactnoescapes} - -与前面的设置相同。 - -### PrettySpaceNoEscapes {#prettyspacenoescapes} - -与前面的设置相同。 - ## PrettySpace {#prettyspace} 与[PrettyCompact](#prettycompact)格式不一样的是,它使用空格来代替网格来显示数据。 +``` sql +select * from system.numbers limit 2 format PrettySpace; +``` +``` text +number + + 0 + 1 +``` ## RowBinary {#rowbinary} @@ -885,35 +908,35 @@ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FOR 整数使用固定长度的小端表示法。 例如,UInt64 使用8个字节。 DateTime 被表示为 
UInt32 类型的Unix 时间戳值。 -Date 被表示为 UInt16 对象,它的值为 1970-01-01以来的天数。 -字符串表示为 varint 长度(无符号 [LEB128](https://en.wikipedia.org/wiki/LEB128)),后跟字符串的字节数。 -FixedString 被简单地表示为一个字节序列。 +Date 被表示为 UInt16 对象,它的值为自1970-01-01以来经过的天数。 +字符串表示为 varint 长度(无符号 [LEB128](https://en.wikipedia.org/wiki/LEB128)),后跟字符串的字节数。 +FixedString 被简单地表示为字节序列。 -数组表示为 varint 长度(无符号 [LEB128](https://en.wikipedia.org/wiki/LEB128)),后跟有序的数组元素。 +数组表示为 varint 长度(无符号 [LEB128](https://en.wikipedia.org/wiki/LEB128)),后跟有序的数组元素。 对于 [NULL](../sql-reference/syntax.md#null-literal) 的支持, 一个为 1 或 0 的字节会加在每个 [可为空](../sql-reference/data-types/nullable.md) 值前面。如果为 1, 那么该值就是 `NULL`。 如果为 0,则不为 `NULL`。 ## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes} -类似于 [RowBinary](#rowbinary),但添加了标题: +类似于 [RowBinary](#rowbinary),但添加了头部信息: -- [LEB128](https://en.wikipedia.org/wiki/LEB128)-编码列数(N) +- [LEB128](https://en.wikipedia.org/wiki/LEB128)-编码列数(N) - N `String`s指定列名 - N `String`s指定列类型 -## 值 {#data-format-values} +## Values {#data-format-values} -在括号中打印每一行。行由逗号分隔。最后一行之后没有逗号。括号内的值也用逗号分隔。数字以十进制格式输出,不含引号。 数组以方括号输出。带有时间的字符串,日期和时间用引号包围输出。转义字符的解析规则与 [TabSeparated](#tabseparated) 格式类似。 在格式化过程中,不插入额外的空格,但在解析过程中,空格是被允许并跳过的(除了数组值之外的空格,这是不允许的)。[NULL](../sql-reference/syntax.md) 为 `NULL`。 +在括号中打印每一行。行由逗号分隔。最后一行之后没有逗号。括号内的值也用逗号分隔。数字以十进制格式输出,不含引号。 数组以方括号输出。字符串、日期、日期时间用引号包围输出。转义字符的解析规则与 [TabSeparated](#tabseparated) 格式类似。 在格式化过程中,不插入额外的空格,但在解析过程中,空格是被允许并跳过的(除了数组值之外的空格,这是不允许的)。[NULL](../sql-reference/syntax.md) 为 `NULL`。 以 Values 格式传递数据时需要转义的最小字符集是:单引号和反斜线。 这是 `INSERT INTO t VALUES ...` 中可以使用的格式,但您也可以将其用于查询结果。 -## 垂直 {#vertical} +另见:[input_format_values_interpret_expressions](https://clickhouse.tech/docs/en/operations/settings/settings/#settings-input_format_values_interpret_expressions)和[input_format_values_deduce_templates_of_expressions](https://clickhouse.tech/docs/en/operations/settings/settings/#settings-input_format_values_deduce_templates_of_expressions)。 -使用指定的列名在单独的行上打印每个值。如果每行都包含大量列,则此格式便于打印一行或几行。 +## Vertical {#vertical} -[NULL](../sql-reference/syntax.md) 输出为 `ᴺᵁᴸᴸ`。 +根据指定的列名,打印出每一行的值。这种格式适用于具有大量的列时,显示几个列。[NULL](../sql-reference/syntax.md) 输出为 `ᴺᵁᴸᴸ`。 示例: @@ -926,12 +949,12 @@ SELECT * FROM t_null FORMAT Vertical x: 1 y: ᴺᵁᴸᴸ -该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 +该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 ## VerticalRaw {#verticalraw} 和 `Vertical` 格式不同点在于,行是不会被转义的。 -这种格式仅仅适用于输出,但不适用于解析输入(将数据插入到表中)。 +这种格式仅仅适用于输出,但不适用于解析输入(将数据插入到表中)。 示例: @@ -1020,24 +1043,24 @@ SELECT * FROM t_null FORMAT Vertical ``` 如果列名称没有可接受的格式,则仅使用 `field` 作为元素名称。 通常,XML 结构遵循 JSON 结构。 -就像JSON一样,将无效的 UTF-8 字符都作替换,以便输出文本将包含有效的 UTF-8 字符序列。 +就像JSON一样,将无效的 UTF-8 字符都替换成字符�,以便输出文本将包含有效的 UTF-8 字符序列。 -在字符串值中,字符 `<` 和 `&` 被转义为 `<` 和 `&`。 +在字符串值中,字符 `<` 和 `&` 被转义为 `<` 和 `&`。 -数组输出为 ` Hello World ... `,元组输出为 ` Hello World ... ` 。 +数组输出类似于 ` Hello World ... `,元组输出类似于 ` Hello World ... 
` 。 ## CapnProto {#capnproto} -Cap’n Proto 是一种二进制消息格式,类似 Protocol Buffers 和 Thriftis,但与 JSON 或 MessagePack 格式不一样。 +Cap’n Proto 是一种二进制消息格式,类似Protobuf和Thriftis,但与 JSON 或 MessagePack 格式不一样。 -Cap’n Proto 消息格式是严格类型的,而不是自我描述,这意味着它们不需要外部的描述。这种格式可以实时地应用,并针对每个查询进行缓存。 +Cap’n Proto 消息格式是严格类型的,而不是自我描述,这意味着它们需要架构描述。架构描述可以实时地应用,并针对每个查询进行缓存。 ``` sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase FORMAT CapnProto SETTINGS schema = 'schema:Message' ``` -其中 `schema.capnp` 描述如下: +其中 `schema.capnp` 描述如下:6y2 struct Message { SearchPhrase @0 :Text; @@ -1050,10 +1073,10 @@ Cap’n Proto 反序列化是很高效的,通常不会增加系统的负载。 ## Protobuf {#protobuf} -Protobuf-是一个 [协议缓冲区](https://developers.google.com/protocol-buffers/) 格式。 +Protobuf-是一个 [Protocol Buffers](https://developers.google.com/protocol-buffers/) 格式。 -此格式需要外部格式架构。 在查询之间缓存架构。 -ClickHouse支持 `proto2` 和 `proto3` 语法 支持重复/可选/必填字段。 +此格式需要外部格式描述文件(proto文件)。 该描述文件会进行缓存,以备后续查询。 +ClickHouse支持 `proto2` 和 `proto3` 语法的proto文件,支持重复/可选/必填字段。 使用示例: @@ -1065,7 +1088,7 @@ SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:Me cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" ``` -哪里的文件 `schemafile.proto` 看起来像这样: +proto文件 `schemafile.proto` 看起来像这样: ``` capnp syntax = "proto3"; @@ -1077,12 +1100,9 @@ message MessageType { repeated string phoneNumbers = 4; }; ``` +Clickhouse通过字段名称来对应列名称。字段名称不区分大小写,`_`与`.`视为相同符号。如果Proto文件指定的字段类型与列类型不相符,会进行转换。 -要查找协议缓冲区的消息类型的表列和字段之间的对应关系,ClickHouse比较它们的名称。 -这种比较是不区分大小写和字符 `_` (下划线)和 `.` (点)被认为是相等的。 -如果协议缓冲区消息的列和字段的类型不同,则应用必要的转换。 - -支持嵌套消息。 例如,对于字段 `z` 在下面的消息类型 +支持Protobuf嵌套消息。 例如,对于下面Proto文件中的z字段: ``` capnp message MessageType { @@ -1096,10 +1116,10 @@ message MessageType { }; ``` -ClickHouse尝试找到一个名为 `x.y.z` (或 `x_y_z` 或 `X.y_Z` 等)。 +ClickHouse会试图找到一个名为 `x.y.z` (或 `x_y_z` 或 `X.y_Z` 等)的列。 嵌套消息适用于输入或输出一个 [嵌套数据结构](../sql-reference/data-types/nested-data-structures/nested.md). -在protobuf模式中定义的默认值,如下所示 +在protobuf模式中定义的默认值,如下: ``` capnp syntax = "proto2"; @@ -1109,9 +1129,9 @@ message MessageType { } ``` -不应用;该 [表默认值](../sql-reference/statements/create.md#create-default-values) 用来代替它们。 +该默认值会被忽略,Clickhouse会使用 [表默认值](../sql-reference/statements/create.md#create-default-values)作为默认值。 -ClickHouse在输入和输出protobuf消息 `length-delimited` 格式。 +ClickHouse在输入和输出protobuf消息采用`length-delimited` 格式。 这意味着每个消息之前,应该写它的长度作为一个 [varint](https://developers.google.com/protocol-buffers/docs/encoding#varints). 另请参阅 [如何在流行语言中读取/写入长度分隔的protobuf消息](https://cwiki.apache.org/confluence/display/GEODE/Delimiting+Protobuf+Messages). 
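A minimal sketch (an editorial illustration, not part of this patch) of producing the length-delimited stream that `INSERT ... FORMAT Protobuf` expects: each message is preceded by its length as a varint. It assumes the `protobuf` Python package and a `MessageType` class generated from the `schemafile.proto` above with `protoc --python_out=. schemafile.proto`; the generated module name `schemafile_pb2` is an assumption.

```python
import sys
from schemafile_pb2 import MessageType  # hypothetical module generated by protoc

def write_varint(out, value: int) -> None:
    # Unsigned LEB128 varint: the length prefix ClickHouse expects before each message.
    while True:
        byte = value & 0x7F
        value >>= 7
        if value:
            out.write(bytes([byte | 0x80]))
        else:
            out.write(bytes([byte]))
            return

msg = MessageType(name="Alice", surname="Smith", birthDate=18300,
                  phoneNumbers=["+1-555-0100"])
payload = msg.SerializeToString()

out = sys.stdout.buffer
write_varint(out, len(payload))  # length prefix (varint)
out.write(payload)               # the serialized message itself
```

The output can be piped straight into the client, e.g. `python3 gen_protobuf.py | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'"`, mirroring the bash example above.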
@@ -1131,21 +1151,21 @@ ClickHouse Avro格式支持读取和写入 [Avro数据文件](http://avro.apache | `boolean`, `int`, `long`, `float`, `double` | [Int64](../sql-reference/data-types/int-uint.md), [UInt64](../sql-reference/data-types/int-uint.md) | `long` | | `boolean`, `int`, `long`, `float`, `double` | [Float32](../sql-reference/data-types/float.md) | `float` | | `boolean`, `int`, `long`, `float`, `double` | [Float64](../sql-reference/data-types/float.md) | `double` | -| `bytes`, `string`, `fixed`, `enum` | [字符串](../sql-reference/data-types/string.md) | `bytes` | -| `bytes`, `string`, `fixed` | [固定字符串(N)](../sql-reference/data-types/fixedstring.md) | `fixed(N)` | -| `enum` | [枚举(8/16)](../sql-reference/data-types/enum.md) | `enum` | -| `array(T)` | [阵列(T)](../sql-reference/data-types/array.md) | `array(T)` | -| `union(null, T)`, `union(T, null)` | [可为空(T)](../sql-reference/data-types/date.md) | `union(null, T)` | -| `null` | [可为空(无)](../sql-reference/data-types/special-data-types/nothing.md) | `null` | -| `int (date)` \* | [日期](../sql-reference/data-types/date.md) | `int (date)` \* | +| `bytes`, `string`, `fixed`, `enum` | [String](../sql-reference/data-types/string.md) | `bytes` | +| `bytes`, `string`, `fixed` | [FixedString(N)](../sql-reference/data-types/fixedstring.md) | `fixed(N)` | +| `enum` | [Enum(8\|16)](../sql-reference/data-types/enum.md) | `enum` | +| `array(T)` | [Array(T)](../sql-reference/data-types/array.md) | `array(T)` | +| `union(null, T)`, `union(T, null)` | [Nullable(T)](../sql-reference/data-types/date.md) | `union(null, T)` | +| `null` | [Nullable(Nothing)](../sql-reference/data-types/special-data-types/nothing.md) | `null` | +| `int (date)` \* | [Date](../sql-reference/data-types/date.md) | `int (date)` \* | | `long (timestamp-millis)` \* | [DateTime64(3)](../sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \* | | `long (timestamp-micros)` \* | [DateTime64(6)](../sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \* | \* [Avro逻辑类型](http://avro.apache.org/docs/current/spec.html#Logical+Types) -不支持的Avro数据类型: `record` (非根), `map` +不支持的Avro数据类型: `record` (非根架构), `map` -不支持的Avro逻辑数据类型: `uuid`, `time-millis`, `time-micros`, `duration` +不支持的Avro逻辑数据类型: `time-millis`, `time-micros`, `duration` ### 插入数据 {#inserting-data} @@ -1155,12 +1175,11 @@ ClickHouse Avro格式支持读取和写入 [Avro数据文件](http://avro.apache $ cat file.avro | clickhouse-client --query="INSERT INTO {some_table} FORMAT Avro" ``` -输入Avro文件的根模式必须是 `record` 类型。 +输入Avro文件的根架构必须是 `record` 类型。 -要查找Avro schema的表列和字段之间的对应关系,ClickHouse比较它们的名称。 此比较区分大小写。 -跳过未使用的字段。 +Clickhouse通过字段名称来对应架构的列名称。字段名称区分大小写。未使用的字段会被跳过。 -ClickHouse表列的数据类型可能与插入的Avro数据的相应字段不同。 插入数据时,ClickHouse根据上表解释数据类型,然后 [投](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) 将数据转换为相应的列类型。 +ClickHouse表列的数据类型可能与插入的Avro数据的相应字段不同。 插入数据时,ClickHouse根据上表解释数据类型,然后通过 [Cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) 将数据转换为相应的列类型。 ### 选择数据 {#selecting-data} @@ -1172,14 +1191,14 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Avro" > file.avro 列名必须: -- 名,名,名,名 `[A-Za-z_]` +- 以 `[A-Za-z_]` 开始 - 随后只包含 `[A-Za-z0-9_]` -输出Avro文件压缩和同步间隔可以配置 [output_format_avro_codec](../operations/settings/settings.md#settings-output_format_avro_codec) 和 [output_format_avro_sync_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) 分别。 +输出Avro文件压缩和同步间隔可以经由 [output_format_avro_codec](../operations/settings/settings.md#settings-output_format_avro_codec) 和 
[output_format_avro_sync_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) 设置。 ## AvroConfluent {#data-format-avro-confluent} -AvroConfluent支持解码单对象Avro消息常用于 [卡夫卡](https://kafka.apache.org/) 和 [汇合的模式注册表](https://docs.confluent.io/current/schema-registry/index.html). +AvroConfluent支持解码单个对象的Avro消息,这常用于 [Kafka](https://kafka.apache.org/) 和 [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html)。 每个Avro消息都嵌入了一个架构id,该架构id可以在架构注册表的帮助下解析为实际架构。 @@ -1189,11 +1208,11 @@ AvroConfluent支持解码单对象Avro消息常用于 [卡夫卡](https://kafka. ### 数据类型匹配{#sql_reference/data_types-matching-1} {#data-types-matching-sql_referencedata_types-matching-1} -和 [Avro](#data-format-avro) +和 [Avro](#data-format-avro)相同。 ### 用途 {#usage} -要快速验证架构解析,您可以使用 [kafkacat](https://github.com/edenhill/kafkacat) 与 [ツ环板-ョツ嘉ッツ偲](../operations/utilities/clickhouse-local.md): +要快速验证架构解析,您可以使用 [kafkacat](https://github.com/edenhill/kafkacat) 与 [clickhouse-local](../operations/utilities/clickhouse-local.md): ``` bash $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-local --input-format AvroConfluent --format_avro_schema_registry_url 'http://schema-registry' -S "field1 Int64, field2 String" -q 'select * from table' @@ -1202,7 +1221,7 @@ $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse- 3 c ``` -使用 `AvroConfluent` 与 [卡夫卡](../engines/table-engines/integrations/kafka.md): +使用 `AvroConfluent` 与 [Kafka](../engines/table-engines/integrations/kafka.md): ``` sql CREATE TABLE topic1_stream @@ -1223,15 +1242,16 @@ SELECT * FROM topic1_stream; ``` !!! note "警告" - 设置 `format_avro_schema_registry_url` 需要在配置 `users.xml` restart动后保持它的价值。 + 设置 `format_avro_schema_registry_url` 需要写入配置文件`users.xml`以在Clickhouse重启后,该设置仍为您的设定值。您也可以在使用Kafka引擎的时候指定该设置。 + ## Parquet {#data-format-parquet} -[Apache Parquet](http://parquet.apache.org/) 是Hadoop生态系统中普遍存在的列式存储格式。 ClickHouse支持此格式的读写操作。 +[Apache Parquet](http://parquet.apache.org/) 是Hadoop生态系统中普遍使用的列式存储格式。 ClickHouse支持此格式的读写操作。 ### 数据类型匹配{#sql_reference/data_types-matching-2} {#data-types-matching-sql_referencedata_types-matching-2} -下表显示了支持的数据类型以及它们如何匹配ClickHouse [数据类型](../sql-reference/data-types/index.md) 在 `INSERT` 和 `SELECT` 查询。 +下表显示了Clickhouse支持的数据类型以及它们在 `INSERT` 和 `SELECT` 查询如何对应Clickhouse的 [data types](../sql-reference/data-types/index.md) 。 | Parquet数据类型 (`INSERT`) | ClickHouse数据类型 | Parquet数据类型 (`SELECT`) | |----------------------------|----------------------------------------------------------|----------------------------| @@ -1245,17 +1265,17 @@ SELECT * FROM topic1_stream; | `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | | `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | | `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | -| `DATE32` | [日期](../sql-reference/data-types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [日期时间](../sql-reference/data-types/datetime.md) | `UINT32` | -| `STRING`, `BINARY` | [字符串](../sql-reference/data-types/string.md) | `STRING` | -| — | [固定字符串](../sql-reference/data-types/fixedstring.md) | `STRING` | -| `DECIMAL` | [十进制](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `UINT16` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `STRING` | +| — | [FixedString](../sql-reference/data-types/fixedstring.md) | 
`STRING` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | -ClickHouse支持可配置的精度 `Decimal` 类型。 该 `INSERT` 查询对待 Parquet `DECIMAL` 键入为ClickHouse `Decimal128` 类型。 +ClickHouse支持对 `Decimal` 类型设置精度。 `INSERT` 查询将 Parquet `DECIMAL` 类型视为ClickHouse `Decimal128` 类型。 不支持的Parquet数据类型: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -ClickHouse表列的数据类型可能与插入的Parquet数据的相应字段不同。 插入数据时,ClickHouse根据上表解释数据类型,然后 [投](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) 为ClickHouse表列设置的数据类型的数据。 +ClickHouse表列的数据类型可能与插入的Parquet数据的相应字段不同。 插入数据时,ClickHouse根据上表解释数据类型,然后 [Cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) 为ClickHouse表列设置的数据类型的数据。 ### 插入和选择数据 {#inserting-and-selecting-data} @@ -1265,44 +1285,52 @@ ClickHouse表列的数据类型可能与插入的Parquet数据的相应字段不 $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` -您可以从ClickHouse表中选择数据,并通过以下命令将它们保存到Parquet格式的某个文件中: +您可以从ClickHouse表中选择数据,并通过以下命令将它们保存到Parquet格式的文件中: ``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` -要与Hadoop交换数据,您可以使用 [HDFS表引擎](../engines/table-engines/integrations/hdfs.md). +要与Hadoop交换数据,您可以使用 [HDFS table engine](../engines/table-engines/integrations/hdfs.md). + +## Arrow {#data-format-arrow} +[Apache Arrow](https://arrow.apache.org/)是一种用于内存数据库的格式,共有两种模式,文件与流模式。Clickhouse支持对这两种格式进行读写。 + +`Arrow`对应的是文件模式,这种格式适用于内存的随机访问。 + +## ArrowStream {#data-format-arrow} +`ArrowStream`对应的是Arrow的流模式,这种格式适用于内存的流式处理。 ## ORC {#data-format-orc} - -[阿帕奇兽人](https://orc.apache.org/) 是Hadoop生态系统中普遍存在的列式存储格式。 您只能将此格式的数据插入ClickHouse。 +[Apache ORC](https://orc.apache.org/) 是Hadoop生态系统中普遍存在的列式存储格式。 ### 数据类型匹配{#sql_reference/data_types-matching-3} {#data-types-matching-sql_referencedata_types-matching-3} -下表显示了支持的数据类型以及它们如何匹配ClickHouse [数据类型](../sql-reference/data-types/index.md) 在 `INSERT` 查询。 +下表显示了支持的数据类型以及它们如何在`SELECT`与`INSERT`查询中匹配ClickHouse的 [数据类型](../sql-reference/data-types/index.md)。 -| ORC数据类型 (`INSERT`) | ClickHouse数据类型 | -|------------------------|-----------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [日期](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [日期时间](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [字符串](../sql-reference/data-types/string.md) | -| `DECIMAL` | [十进制](../sql-reference/data-types/decimal.md) | +| ORC 数据类型 (`INSERT`) | ClickHouse 数据类型 | ORC 数据类型 (`SELECT`) | +|--------------------------|-----------------------------------------------------|--------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | 
[Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | -ClickHouse支持的可配置精度 `Decimal` 类型。 该 `INSERT` 查询对待兽人 `DECIMAL` 键入为ClickHouse `Decimal128` 类型。 +ClickHouse支持的可配置精度的 `Decimal` 类型。 `INSERT` 查询将ORC格式的 `DECIMAL` 类型视为ClickHouse的 `Decimal128` 类型。 -不支持的ORC数据类型: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. +不支持的ORC数据类型: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -ClickHouse表列的数据类型不必匹配相应的ORC数据字段。 插入数据时,ClickHouse根据上表解释数据类型,然后 [投](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) 将数据转换为ClickHouse表列的数据类型集。 +ClickHouse表列的数据类型不必匹配相应的ORC数据字段。 插入数据时,ClickHouse根据上表解释数据类型,然后 [Cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) 将数据转换为ClickHouse表列的数据类型集。 ### 插入数据 {#inserting-data-1} @@ -1312,33 +1340,143 @@ ClickHouse表列的数据类型不必匹配相应的ORC数据字段。 插入数 $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` +### 选择数据 {#selecting-data-1} + +您可以通过以下命令将ClickHouse表中某些数据导出到ORC文件: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} +``` + 要与Hadoop交换数据,您可以使用 [HDFS表引擎](../engines/table-engines/integrations/hdfs.md). 
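A small illustrative sketch (not part of this patch) of producing an ORC file for the insert command above with `pyarrow`. It assumes pyarrow >= 4.0, where `pyarrow.orc.write_table` is available, and a hypothetical target table `some_table(id UInt64, name String)`.

```python
import pyarrow as pa
import pyarrow.orc as orc

# Build a small in-memory table matching some_table(id UInt64, name String).
table = pa.table({
    "id": pa.array([1, 2, 3], type=pa.uint64()),
    "name": pa.array(["a", "b", "c"], type=pa.string()),
})

# Write it as ORC; the file can then be loaded with:
#   cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
orc.write_table(table, "filename.orc")
```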
+## LineAsString {#lineasstring}
+这种格式下,每行输入数据都会当做一个字符串。这种格式仅适用于仅有一列[String](https://clickhouse.tech/docs/en/sql-reference/data-types/string/)类型的列的表。其余列必须设置为[DEFAULT](https://clickhouse.tech/docs/en/sql-reference/statements/create/table/#default)、[MATERIALIZED](https://clickhouse.tech/docs/en/sql-reference/statements/create/table/#materialized)或者被忽略。
+
+### 示例:
+查询如下:
+``` sql
+DROP TABLE IF EXISTS line_as_string;
+CREATE TABLE line_as_string (field String) ENGINE = Memory;
+INSERT INTO line_as_string FORMAT LineAsString "I love apple", "I love banana", "I love orange";
+SELECT * FROM line_as_string;
+```
+结果如下:
+``` text
+┌─field─────────────────────────────────────────────┐
+│ "I love apple", "I love banana", "I love orange"; │
+└───────────────────────────────────────────────────┘
+```
+## Regexp {#regexp}
+每一行输入数据根据正则表达式解析。使用`Regexp`格式时,可以使用如下设置:
+
+- `format_regexp`,[String](https://clickhouse.tech/docs/en/sql-reference/data-types/string/)类型。包含[re2](https://github.com/google/re2/wiki/Syntax)格式的正则表达式。
+- `format_regexp_escaping_rule`,[String](https://clickhouse.tech/docs/en/sql-reference/data-types/string/)类型。支持如下转义规则:
+ - CSV(规则相同于[CSV](https://clickhouse.tech/docs/zh/interfaces/formats/#csv))
+ - JSON(相同于[JSONEachRow](https://clickhouse.tech/docs/zh/interfaces/formats/#jsoneachrow))
+ - Escaped(相同于[TSV](https://clickhouse.tech/docs/zh/interfaces/formats/#tabseparated))
+ - Quoted(相同于[Values](https://clickhouse.tech/docs/zh/interfaces/formats/#data-format-values))
+ - Raw(将整个子匹配项进行提取,不转义)
+- `format_regexp_skip_unmatched`,[UInt8](https://clickhouse.tech/docs/zh/sql-reference/data-types/int-uint/)类型。表示当`format_regexp`表达式没有匹配到结果时是否跳过该行。可为0或1。
+
+### 用法 {#usage-1}
+`format_regexp`设置会应用于每一行输入数据。正则表达式的子匹配项数必须等于输入数据期望得到的列数。
+每一行输入数据通过换行符`\n`或者`\r\n`分隔。
+匹配到的子匹配项会根据每一列的数据格式进行解析,转义规则根据`format_regexp_escaping_rule`进行。
+当正则表达式对某行没有匹配到结果,`format_regexp_skip_unmatched`设为1时,该行会被跳过。`format_regexp_skip_unmatched`设为0时,会抛出异常。
+
+### 示例 {#example-1}
+设有如下data.tsv:
+``` text
+id: 1 array: [1,2,3] string: str1 date: 2020-01-01
+id: 2 array: [1,2,3] string: str2 date: 2020-01-02
+id: 3 array: [1,2,3] string: str3 date: 2020-01-03
+```
+与表:
+``` sql
+CREATE TABLE imp_regex_table (id UInt32, array Array(UInt32), string String, date Date) ENGINE = Memory;
+```
+导入命令:
+``` bash
+$ cat data.tsv | clickhouse-client --query "INSERT INTO imp_regex_table FORMAT Regexp SETTINGS format_regexp='id: (.+?) array: (.+?) string: (.+?) date: (.+?)', format_regexp_escaping_rule='Escaped', format_regexp_skip_unmatched=0;"
+```
+查询:
+``` sql
+SELECT * FROM imp_regex_table;
+```
+结果:
+``` text
+┌─id─┬─array───┬─string─┬───────date─┐
+│  1 │ [1,2,3] │ str1   │ 2020-01-01 │
+│  2 │ [1,2,3] │ str2   │ 2020-01-02 │
+│  3 │ [1,2,3] │ str3   │ 2020-01-03 │
+└────┴─────────┴────────┴────────────┘
+```
+
+
+
+## RawBLOB {#rawblob}
+这种格式下,所有输入数据视为一个值。该格式仅适用于仅有一个String类型的列的表。输出时,使用二进制格式输出。当输出结果不唯一时,输出是有歧义的,并且不能通过该输出还原原数据。
+下面是`RawBLOB`与[TabSeparatedRaw](https://clickhouse.tech/docs/zh/interfaces/formats/#tabseparatedraw)的对比:
+
+`RawBLOB`:
+
+- 二进制格式输出,无转义。
+- 值之间没有分隔符。
+- 每行最后的值后面没有换行符。
+
+`TabSeparatedRaw`:
+
+- 数据无转义输出。
+- 每行的值通过制表符分隔。
+- 每行最后的值的后面有换行符。
+
+下面是`RawBLOB`与[RowBinary](https://clickhouse.tech/docs/zh/interfaces/formats/#rowbinary)的对比:
+
+`RawBLOB`:
+
+- 字符串前面没有表示长度的标志。
+
+`RowBinary`:
+
+- 字符串前面有变长标志([LEB128](https://en.wikipedia.org/wiki/LEB128)格式表示),用于表示字符串长度,后接字符串内容。
+
+当传入空数据时,ClickHouse会抛出异常:
+``` text
+Code: 108. DB::Exception: No data to insert
+```
+### 示例 {#example-4}
+``` bash
+$ clickhouse-client --query "CREATE TABLE {some_table} (a String) ENGINE = Memory;"
+$ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT RawBLOB"
+$ clickhouse-client --query "SELECT * FROM {some_table} FORMAT RawBLOB" | md5sum
+```
+结果:
+``` text
+f9725a22f9191e064120d718e26862a9 -
+```
+
 ## 格式架构 {#formatschema}
-包含格式架构的文件名由该设置设置 `format_schema`.
-当使用其中一种格式时,需要设置此设置 `Cap'n Proto` 和 `Protobuf`.
-格式架构是文件名和此文件中消息类型的名称的组合,用冒号分隔,
-e.g. `schemafile.proto:MessageType`.
-如果文件具有格式的标准扩展名(例如, `.proto` 为 `Protobuf`),
-它可以被省略,在这种情况下,格式模式如下所示 `schemafile:MessageType`.
+包含格式架构的文件名由设置 `format_schema` 指定。当使用 `CapnProto` 或 `Protobuf` 其中一种格式时,需要设置该项。
+格式架构为架构文件名和此文件中消息类型的组合,用冒号分隔,例如 `schemafile.proto:MessageType`。
+如果文件具有格式的标准扩展名(例如,`Protobuf` 格式的架构文件标准扩展名为 `.proto`),它可以被省略,在这种情况下,格式架构如下所示:`schemafile:MessageType`。
 
-如果您通过输入或输出数据 [客户](../interfaces/cli.md) 在交互模式下,格式架构中指定的文件名
-可以包含绝对路径或相对于客户端上当前目录的路径。
-如果在批处理模式下使用客户端,则由于安全原因,架构的路径必须是相对的。
+如果您通过 [Client](../interfaces/cli.md) 在 [交互模式](https://clickhouse.tech/docs/zh/interfaces/cli/#cli_usage)下输入或输出数据,格式架构中指定的文件名可以使用绝对路径或客户端当前目录的相对路径。
+如果在[批处理模式](https://clickhouse.tech/docs/zh/interfaces/cli/#cli_usage)下使用客户端,则由于安全原因,架构的路径必须使用相对路径。
 
-如果您通过输入或输出数据 [HTTP接口](../interfaces/http.md) 格式架构中指定的文件名
-应该位于指定的目录中 [format_schema_path](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-format_schema_path)
-在服务器配置中。
+如果您通过 [HTTP接口](../interfaces/http.md) 输入或输出数据,格式架构中指定的文件名应该位于服务器设置的[format_schema_path](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-format_schema_path)指定的目录中。
 
-[原始文章](https://clickhouse.tech/docs/en/interfaces/formats/)
 ## 跳过错误 {#skippingerrors}
-一些格式,如 `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` 和 `Protobuf` 如果发生解析错误,可以跳过断开的行,并从下一行开始继续解析。 看 [input_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) 和
-[input_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio) 设置。
+一些格式,如 `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` 和 `Protobuf`,如果发生解析错误,可以跳过引发错误的行,并从下一行开始继续解析。 详情请见设置[input_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) 和
+[input_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio)。
+
 限制:
--在解析错误的情况下 `JSONEachRow` 跳过所有数据,直到新行(或EOF),所以行必须由 `\n` 正确计算错误。
-- `Template` 和 `CustomSeparated` 在最后一列之后使用分隔符,并在行之间使用分隔符来查找下一行的开头,所以跳过错误只有在其中至少有一个不为空时才有效。
+- 在解析错误的情况下 `JSONEachRow` 跳过该行的所有数据,直到遇到新行(或EOF),所以行必须由换行符分隔以正确统计错误行的数量。
+- `Template` 和 `CustomSeparated` 在最后一列之后和行之间使用分隔符来查找下一行的开头,所以跳过错误只有在行分隔符和列分隔符其中至少有一个不为空时才有效。
+
+ [来源文章](https://clickhouse.tech/docs/zh/interfaces/formats/)
diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt
index 12aec76a303..7f85a3fc3d7 100644
--- a/programs/CMakeLists.txt
+++ b/programs/CMakeLists.txt
@@ -33,6 +33,7 @@ option (ENABLE_CLICKHOUSE_OBFUSCATOR "Table data obfuscator (convert real data t ${ENABLE_CLICKHOUSE_ALL})
 # https://clickhouse.tech/docs/en/operations/utilities/odbc-bridge/
+# TODO Also needs NANODBC.
if (ENABLE_ODBC) option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "HTTP-server working like a proxy to ODBC driver" ${ENABLE_CLICKHOUSE_ALL}) diff --git a/programs/bash-completion/completions/clickhouse-bootstrap b/programs/bash-completion/completions/clickhouse-bootstrap index 7109148a192..793d47501dd 100644 --- a/programs/bash-completion/completions/clickhouse-bootstrap +++ b/programs/bash-completion/completions/clickhouse-bootstrap @@ -20,6 +20,7 @@ CLICKHOUSE_QueryProcessingStage=( fetch_columns with_mergeable_state with_mergeable_state_after_aggregation + with_mergeable_state_after_aggregation_and_limit ) CLICKHOUSE_Format=( diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp index 2e48c5d20c5..c8f1a4eef47 100644 --- a/programs/benchmark/Benchmark.cpp +++ b/programs/benchmark/Benchmark.cpp @@ -580,7 +580,7 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv) ("query", value()->default_value(""), "query to execute") ("concurrency,c", value()->default_value(1), "number of parallel queries") ("delay,d", value()->default_value(1), "delay between intermediate reports in seconds (set 0 to disable reports)") - ("stage", value()->default_value("complete"), "request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation") + ("stage", value()->default_value("complete"), "request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation,with_mergeable_state_after_aggregation_and_limit") ("iterations,i", value()->default_value(0), "amount of queries to be executed") ("timelimit,t", value()->default_value(0.), "stop launch of queries after specified time limit") ("randomize,r", value()->default_value(false), "randomize order of execution") diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 2a973f9967e..c6748b16723 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -85,7 +84,7 @@ #include #include #include -#include +#include #include #include @@ -113,6 +112,7 @@ namespace ErrorCodes extern const int DEADLOCK_AVOIDED; extern const int UNRECOGNIZED_ARGUMENTS; extern const int SYNTAX_ERROR; + extern const int TOO_DEEP_RECURSION; } @@ -230,13 +230,13 @@ private: String server_version; String server_display_name; - Stopwatch watch; + /// true by default - for interactive mode, might be changed when --progress option is checked for + /// non-interactive mode. + bool need_render_progress = true; - /// The server periodically sends information about how much data was read since last time. - Progress progress; + bool written_first_block = false; - /// Progress bar - ProgressBar progress_bar; + ProgressIndication progress_indication; /// External tables info. 
std::list external_tables; @@ -536,7 +536,7 @@ private: if (!is_interactive) { - progress_bar.need_render_progress = config().getBool("progress", false); + need_render_progress = config().getBool("progress", false); echo_queries = config().getBool("echo", false); ignore_error = config().getBool("ignore-error", false); } @@ -1268,7 +1268,8 @@ private: } catch (const Exception & e) { - if (e.code() != ErrorCodes::SYNTAX_ERROR) + if (e.code() != ErrorCodes::SYNTAX_ERROR && + e.code() != ErrorCodes::TOO_DEEP_RECURSION) throw; } @@ -1450,10 +1451,9 @@ private: } catch (Exception & e) { - if (e.code() != ErrorCodes::SYNTAX_ERROR) - { + if (e.code() != ErrorCodes::SYNTAX_ERROR && + e.code() != ErrorCodes::TOO_DEEP_RECURSION) throw; - } } if (ast_2) @@ -1578,12 +1578,9 @@ private: } } - watch.restart(); processed_rows = 0; - progress.reset(); - progress_bar.show_progress_bar = false; - progress_bar.written_progress_chars = 0; - progress_bar.written_first_block = false; + written_first_block = false; + progress_indication.resetProgress(); { /// Temporarily apply query settings to context. @@ -1651,16 +1648,15 @@ private: if (is_interactive) { - std::cout << std::endl << processed_rows << " rows in set. Elapsed: " << watch.elapsedSeconds() << " sec. "; - - if (progress.read_rows >= 1000) - writeFinalProgress(); + std::cout << std::endl << processed_rows << " rows in set. Elapsed: " << progress_indication.elapsedSeconds() << " sec. "; + /// Write final progress if it makes sense to do so. + writeFinalProgress(); std::cout << std::endl << std::endl; } else if (print_time_to_stderr) { - std::cerr << watch.elapsedSeconds() << "\n"; + std::cerr << progress_indication.elapsedSeconds() << "\n"; } } @@ -1835,6 +1831,19 @@ private: /// Send data read from stdin. try { + if (need_render_progress) + { + /// Set total_bytes_to_read for current fd. + FileProgress file_progress(0, std_in.size()); + progress_indication.updateProgress(Progress(file_progress)); + + /// Set callback to be called on file progress. + progress_indication.setFileProgressCallback(context, true); + + /// Add callback to track reading from fd. + std_in.setProgressCallback(context); + } + sendDataFrom(std_in, sample, columns_description); } catch (Exception & e) @@ -1957,7 +1966,7 @@ private: cancelled = true; if (is_interactive) { - progress_bar.clearProgress(); + progress_indication.clearProgressOutput(); std::cout << "Cancelling query." << std::endl; } @@ -2184,7 +2193,7 @@ private: current_format = "Vertical"; /// It is not clear how to write progress with parallel formatting. It may increase code complexity significantly. - if (!progress_bar.need_render_progress) + if (!need_render_progress) block_out_stream = context->getOutputStreamParallelIfPossible(current_format, *out_buf, block); else block_out_stream = context->getOutputStream(current_format, *out_buf, block); @@ -2243,25 +2252,25 @@ private: if (block.rows() == 0 || (query_fuzzer_runs != 0 && processed_rows >= 100)) return; - if (progress_bar.need_render_progress) - progress_bar.clearProgress(); + if (need_render_progress) + progress_indication.clearProgressOutput(); block_out_stream->write(block); - progress_bar.written_first_block = true; + written_first_block = true; /// Received data block is immediately displayed to the user. block_out_stream->flush(); /// Restore progress bar after data block. 
- if (progress_bar.need_render_progress) - progress_bar.writeProgress(progress, watch.elapsed()); + if (need_render_progress) + progress_indication.writeProgress(); } void onLogData(Block & block) { initLogsOutputStream(); - progress_bar.clearProgress(); + progress_indication.clearProgressOutput(); logs_out_stream->write(block); logs_out_stream->flush(); } @@ -2282,28 +2291,23 @@ private: void onProgress(const Progress & value) { - if (!progress_bar.updateProgress(progress, value)) + if (!progress_indication.updateProgress(value)) { // Just a keep-alive update. return; } + if (block_out_stream) block_out_stream->onProgress(value); - progress_bar.writeProgress(progress, watch.elapsed()); + + if (need_render_progress) + progress_indication.writeProgress(); } void writeFinalProgress() { - std::cout << "Processed " << formatReadableQuantity(progress.read_rows) << " rows, " - << formatReadableSizeWithDecimalSuffix(progress.read_bytes); - - size_t elapsed_ns = watch.elapsed(); - if (elapsed_ns) - std::cout << " (" << formatReadableQuantity(progress.read_rows * 1000000000.0 / elapsed_ns) << " rows/s., " - << formatReadableSizeWithDecimalSuffix(progress.read_bytes * 1000000000.0 / elapsed_ns) << "/s.)"; - else - std::cout << ". "; + progress_indication.writeFinalProgress(); } @@ -2324,7 +2328,7 @@ private: void onEndOfStream() { - progress_bar.clearProgress(); + progress_indication.clearProgressOutput(); if (block_out_stream) block_out_stream->writeSuffix(); @@ -2334,9 +2338,9 @@ private: resetOutput(); - if (is_interactive && !progress_bar.written_first_block) + if (is_interactive && !written_first_block) { - progress_bar.clearProgress(); + progress_indication.clearProgressOutput(); std::cout << "Ok." << std::endl; } } @@ -2468,7 +2472,7 @@ public: ("password", po::value()->implicit_value("\n", ""), "password") ("ask-password", "ask-password") ("quota_key", po::value(), "A string to differentiate quotas when the user have keyed quotas configured on server") - ("stage", po::value()->default_value("complete"), "Request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation") + ("stage", po::value()->default_value("complete"), "Request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation,with_mergeable_state_after_aggregation_and_limit") ("query_id", po::value(), "query_id") ("query,q", po::value(), "query") ("database,d", po::value(), "database") diff --git a/programs/copier/TaskTableAndShard.h b/programs/copier/TaskTableAndShard.h index 0e18efd2975..35170c134ff 100644 --- a/programs/copier/TaskTableAndShard.h +++ b/programs/copier/TaskTableAndShard.h @@ -286,7 +286,7 @@ inline TaskTable::TaskTable(TaskCluster & parent, const Poco::Util::AbstractConf + "." + escapeForFileName(table_push.first) + "." 
+ escapeForFileName(table_push.second); - engine_push_str = config.getString(table_prefix + "engine"); + engine_push_str = config.getString(table_prefix + "engine", "rand()"); { ParserStorage parser_storage; diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index f48e8d4d0a0..2633f0e9426 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -389,32 +389,29 @@ void LocalServer::processQueries() CurrentThread::QueryScope query_scope_holder(context); ///Set progress show - progress_bar.need_render_progress = config().getBool("progress", false); + need_render_progress = config().getBool("progress", false); - if (progress_bar.need_render_progress) + if (need_render_progress) { context->setProgressCallback([&](const Progress & value) - { - if (!progress_bar.updateProgress(progress, value)) - { - // Just a keep-alive update. - return; - } - progress_bar.writeProgress(progress, watch.elapsed()); - }); + { + /// Write progress only if progress was updated + if (progress_indication.updateProgress(value)) + progress_indication.writeProgress(); + }); } bool echo_queries = config().hasOption("echo") || config().hasOption("verbose"); + + if (need_render_progress) + progress_indication.setFileProgressCallback(context); + std::exception_ptr exception; for (const auto & query : queries) { - watch.restart(); - progress.reset(); - progress_bar.show_progress_bar = false; - progress_bar.written_progress_chars = 0; - progress_bar.written_first_block = false; - + written_first_block = false; + progress_indication.resetProgress(); ReadBufferFromString read_buf(query); WriteBufferFromFileDescriptor write_buf(STDOUT_FILENO); diff --git a/programs/local/LocalServer.h b/programs/local/LocalServer.h index cf8886d9652..e82caad7542 100644 --- a/programs/local/LocalServer.h +++ b/programs/local/LocalServer.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include namespace DB { @@ -49,9 +49,12 @@ protected: /// Settings specified via command line args Settings cmd_settings; - ProgressBar progress_bar; - Progress progress; - Stopwatch watch; + + bool need_render_progress = false; + + bool written_first_block = false; + + ProgressIndication progress_indication; std::optional temporary_directory_to_delete; }; diff --git a/programs/odbc-bridge/ODBCBlockInputStream.cpp b/programs/odbc-bridge/ODBCBlockInputStream.cpp index b23d09e0481..25c953c0b71 100644 --- a/programs/odbc-bridge/ODBCBlockInputStream.cpp +++ b/programs/odbc-bridge/ODBCBlockInputStream.cpp @@ -132,7 +132,7 @@ void ODBCBlockInputStream::insertValue( auto value = row.get(idx); ReadBufferFromString in(value); time_t time = 0; - readDateTimeText(time, in); + readDateTimeText(time, in, assert_cast(data_type.get())->getTimeZone()); if (time < 0) time = 0; assert_cast(column).insertValue(time); diff --git a/src/Access/AccessType.h b/src/Access/AccessType.h index cef2de12b30..0e295985303 100644 --- a/src/Access/AccessType.h +++ b/src/Access/AccessType.h @@ -154,6 +154,7 @@ enum class AccessType M(SYSTEM_DROP_REPLICA, "DROP REPLICA", TABLE, SYSTEM) \ M(SYSTEM_SYNC_REPLICA, "SYNC REPLICA", TABLE, SYSTEM) \ M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \ + M(SYSTEM_RESTORE_REPLICA, "RESTORE REPLICA", TABLE, SYSTEM) \ M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \ M(SYSTEM_FLUSH_LOGS, "FLUSH LOGS", GLOBAL, SYSTEM_FLUSH) \ M(SYSTEM_FLUSH, "", GROUP, SYSTEM) \ diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index 
4af2cb2f36e..698f65b1281 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -191,6 +191,7 @@ public: void nestedRemoveNullable() { dictionary.getColumnUnique().nestedRemoveNullable(); } const IColumnUnique & getDictionary() const { return dictionary.getColumnUnique(); } + IColumnUnique & getDictionary() { return dictionary.getColumnUnique(); } const ColumnPtr & getDictionaryPtr() const { return dictionary.getColumnUniquePtr(); } /// IColumnUnique & getUnique() { return static_cast(*column_unique); } /// ColumnPtr getUniquePtr() const { return column_unique; } diff --git a/src/Common/Config/ConfigProcessor.cpp b/src/Common/Config/ConfigProcessor.cpp index 3f6dcc86ecd..81360c6794b 100644 --- a/src/Common/Config/ConfigProcessor.cpp +++ b/src/Common/Config/ConfigProcessor.cpp @@ -664,8 +664,8 @@ void ConfigProcessor::savePreprocessedConfig(const LoadedConfig & loaded_config, { fs::path preprocessed_configs_path("preprocessed_configs/"); auto new_path = loaded_config.config_path; - if (new_path.substr(0, main_config_path.size()) == main_config_path) - new_path.replace(0, main_config_path.size(), ""); + if (new_path.starts_with(main_config_path)) + new_path.erase(0, main_config_path.size()); std::replace(new_path.begin(), new_path.end(), '/', '_'); if (preprocessed_dir.empty()) @@ -708,6 +708,8 @@ void ConfigProcessor::savePreprocessedConfig(const LoadedConfig & loaded_config, void ConfigProcessor::setConfigPath(const std::string & config_path) { main_config_path = config_path; + if (!main_config_path.ends_with('/')) + main_config_path += '/'; } } diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 5afba23657d..384f930843b 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -554,7 +554,8 @@ M(584, PROJECTION_NOT_USED) \ M(585, CANNOT_PARSE_YAML) \ M(586, CANNOT_CREATE_FILE) \ - M(587, DISTINCT_ON_AND_LIMIT_BY_TOGETHER) \ + M(587, CONCURRENT_ACCESS_NOT_SUPPORTED) \ + M(588, DISTINCT_ON_AND_LIMIT_BY_TOGETHER) \ \ M(998, POSTGRESQL_CONNECTION_FAILURE) \ M(999, KEEPER_EXCEPTION) \ diff --git a/src/Common/ProgressBar.h b/src/Common/ProgressBar.h deleted file mode 100644 index 895cea51f6a..00000000000 --- a/src/Common/ProgressBar.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include -#include - -/// http://en.wikipedia.org/wiki/ANSI_escape_code -#define CLEAR_TO_END_OF_LINE "\033[K" - -namespace DB -{ - -struct ProgressBar -{ -public: - - static bool updateProgress(Progress & progress, const Progress & value); - void writeProgress(const Progress & progress, const size_t elapsed_ns); - void clearProgress(); - - /// For interactive mode always show progress bar, for non-interactive mode it is accessed from config(). - bool need_render_progress = true; - - bool show_progress_bar = false; - - size_t written_progress_chars = 0; - - bool written_first_block = false; - - bool clear_progress = false; -}; - -} diff --git a/src/Common/ProgressBar.cpp b/src/Common/ProgressIndication.cpp similarity index 55% rename from src/Common/ProgressBar.cpp rename to src/Common/ProgressIndication.cpp index 27e5eca1ea5..e1a7c420c54 100644 --- a/src/Common/ProgressBar.cpp +++ b/src/Common/ProgressIndication.cpp @@ -1,22 +1,69 @@ -#include "ProgressBar.h" +#include "ProgressIndication.h" #include #include #include #include +/// FIXME: progress bar in clickhouse-local needs to be cleared after query execution +/// - same as it is now in clickhouse-client. Also there is no writeFinalProgress call +/// in clickhouse-local. 
+ namespace DB { -bool ProgressBar::updateProgress(Progress & progress, const Progress & value) +bool ProgressIndication::updateProgress(const Progress & value) { return progress.incrementPiecewiseAtomically(value); } -void ProgressBar::writeProgress(const Progress & progress, const size_t elapsed_ns) +void ProgressIndication::clearProgressOutput() { - if (!need_render_progress) + if (written_progress_chars) + { + written_progress_chars = 0; + std::cerr << "\r" CLEAR_TO_END_OF_LINE; + } +} + +void ProgressIndication::resetProgress() +{ + watch.restart(); + progress.reset(); + show_progress_bar = false; + written_progress_chars = 0; + write_progress_on_update = false; +} + +void ProgressIndication::setFileProgressCallback(ContextMutablePtr context, bool write_progress_on_update_) +{ + write_progress_on_update = write_progress_on_update_; + context->setFileProgressCallback([&](const FileProgress & file_progress) + { + progress.incrementPiecewiseAtomically(Progress(file_progress)); + + if (write_progress_on_update) + writeProgress(); + }); +} + +void ProgressIndication::writeFinalProgress() +{ + if (progress.read_rows < 1000) return; + std::cout << "Processed " << formatReadableQuantity(progress.read_rows) << " rows, " + << formatReadableSizeWithDecimalSuffix(progress.read_bytes); + + size_t elapsed_ns = watch.elapsed(); + if (elapsed_ns) + std::cout << " (" << formatReadableQuantity(progress.read_rows * 1000000000.0 / elapsed_ns) << " rows/s., " + << formatReadableSizeWithDecimalSuffix(progress.read_bytes * 1000000000.0 / elapsed_ns) << "/s.)"; + else + std::cout << ". "; +} + +void ProgressIndication::writeProgress() +{ /// Output all progress bar commands to stderr at once to avoid flicker. WriteBufferFromFileDescriptor message(STDERR_FILENO, 1024); @@ -45,26 +92,37 @@ void ProgressBar::writeProgress(const Progress & progress, const size_t elapsed_ message << '\r'; size_t prefix_size = message.count(); + size_t read_bytes = progress.read_raw_bytes ? progress.read_raw_bytes : progress.read_bytes; message << indicator << " Progress: "; - message << formatReadableQuantity(progress.read_rows) << " rows, " - << formatReadableSizeWithDecimalSuffix(progress.read_bytes); + << formatReadableSizeWithDecimalSuffix(read_bytes); + auto elapsed_ns = watch.elapsed(); if (elapsed_ns) message << " (" << formatReadableQuantity(progress.read_rows * 1000000000.0 / elapsed_ns) << " rows/s., " - << formatReadableSizeWithDecimalSuffix(progress.read_bytes * 1000000000.0 / elapsed_ns) << "/s.) "; + << formatReadableSizeWithDecimalSuffix(read_bytes * 1000000000.0 / elapsed_ns) << "/s.) "; else message << ". "; written_progress_chars = message.count() - prefix_size - (strlen(indicator) - 2); /// Don't count invisible output (escape sequences). /// If the approximate number of rows to process is known, we can display a progress bar and percentage. - if (progress.total_rows_to_read > 0) + if (progress.total_rows_to_read || progress.total_raw_bytes_to_read) { - size_t total_rows_corrected = std::max(progress.read_rows, progress.total_rows_to_read); + size_t current_count, max_count; + if (progress.total_rows_to_read) + { + current_count = progress.read_rows; + max_count = std::max(progress.read_rows, progress.total_rows_to_read); + } + else + { + current_count = progress.read_raw_bytes; + max_count = std::max(progress.read_raw_bytes, progress.total_raw_bytes_to_read); + } /// To avoid flicker, display progress bar only if .5 seconds have passed since query execution start /// and the query is less than halfway done. 
@@ -72,7 +130,7 @@ void ProgressBar::writeProgress(const Progress & progress, const size_t elapsed_
 if (elapsed_ns > 500000000)
 {
 /// Trigger to start displaying progress bar. If query is mostly done, don't display it.
- if (progress.read_rows * 2 < total_rows_corrected)
+ if (current_count * 2 < max_count)
 show_progress_bar = true;
 if (show_progress_bar)
@@ -81,7 +139,7 @@ void ProgressBar::writeProgress(const Progress & progress, const size_t elapsed_
 if (width_of_progress_bar > 0)
 {
 std::string bar
- = UnicodeBar::render(UnicodeBar::getWidth(progress.read_rows, 0, total_rows_corrected, width_of_progress_bar));
+ = UnicodeBar::render(UnicodeBar::getWidth(current_count, 0, max_count, width_of_progress_bar));
 message << "\033[0;32m" << bar << "\033[0m";
 if (width_of_progress_bar > static_cast(bar.size() / UNICODE_BAR_CHAR_SIZE))
 message << std::string(width_of_progress_bar - bar.size() / UNICODE_BAR_CHAR_SIZE, ' ');
@@ -90,7 +148,7 @@ void ProgressBar::writeProgress(const Progress & progress, const size_t elapsed_
 }
 /// Underestimate percentage a bit to avoid displaying 100%.
- message << ' ' << (99 * progress.read_rows / total_rows_corrected) << '%';
+ message << ' ' << (99 * current_count / max_count) << '%';
 }
 message << CLEAR_TO_END_OF_LINE;
@@ -99,13 +157,4 @@ void ProgressBar::writeProgress(const Progress & progress, const size_t elapsed_
 message.next();
 }
-void ProgressBar::clearProgress()
-{
- if (written_progress_chars)
- {
- written_progress_chars = 0;
- std::cerr << "\r" CLEAR_TO_END_OF_LINE;
- }
-}
-
 }
diff --git a/src/Common/ProgressIndication.h b/src/Common/ProgressIndication.h
new file mode 100644
index 00000000000..044d8cb1a89
--- /dev/null
+++ b/src/Common/ProgressIndication.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include
+#include
+#include
+
+
+/// http://en.wikipedia.org/wiki/ANSI_escape_code
+#define CLEAR_TO_END_OF_LINE "\033[K"
+
+namespace DB
+{
+
+class ProgressIndication
+{
+public:
+ /// Write progress to stderr.
+ void writeProgress();
+
+ void writeFinalProgress();
+
+ /// Clear stderr output.
+ void clearProgressOutput();
+
+ /// Reset progress values.
+ void resetProgress();
+
+ /// Update Progress object. It can be updated from:
+ /// 1. onProgress in clickhouse-client;
+ /// 2. ProgressCallback via setProgressCallback method in:
+ /// - context (used in clickhouse-local, can also be added in arbitrary place)
+ /// - SourceWithProgress (also in streams)
+ /// - readBufferFromFileDescriptor (for file processing progress)
+ bool updateProgress(const Progress & value);
+
+ /// In some cases there is a need to update progress value, when there is no access to progress_indication object.
+ /// In this case it is added via context.
+ /// `write_progress_on_update` is needed to write progress for loading file data via pipe in non-interactive mode.
+ void setFileProgressCallback(ContextMutablePtr context, bool write_progress_on_update = false);
+
+ /// How many seconds have passed since query execution start.
+ double elapsedSeconds() const { return watch.elapsedSeconds(); }
+
+private:
+ /// This flag controls whether to show the progress bar. We start showing it after
+ /// the query has been executing for 0.5 seconds, and is still less than half complete.
+ bool show_progress_bar = false;
+
+ /// Width of how much has been printed currently into stderr. Used to define size of progress bar and
+ /// to check whether progress output needs to be cleared.
+ size_t written_progress_chars = 0; + + /// The server periodically sends information about how much data was read since last time. + /// This information is stored here. + Progress progress; + + /// Track query execution time. + Stopwatch watch; + + bool write_progress_on_update = false; +}; + +} diff --git a/src/Common/TerminalSize.cpp b/src/Common/TerminalSize.cpp index a020098aa44..c53494fe9a0 100644 --- a/src/Common/TerminalSize.cpp +++ b/src/Common/TerminalSize.cpp @@ -15,16 +15,19 @@ namespace DB::ErrorCodes uint16_t getTerminalWidth() { + struct winsize terminal_size {}; if (isatty(STDIN_FILENO)) { - struct winsize terminal_size {}; - if (ioctl(STDIN_FILENO, TIOCGWINSZ, &terminal_size)) DB::throwFromErrno("Cannot obtain terminal window size (ioctl TIOCGWINSZ)", DB::ErrorCodes::SYSTEM_ERROR); - - return terminal_size.ws_col; } - return 0; + else if (isatty(STDERR_FILENO)) + { + if (ioctl(STDERR_FILENO, TIOCGWINSZ, &terminal_size)) + DB::throwFromErrno("Cannot obtain terminal window size (ioctl TIOCGWINSZ)", DB::ErrorCodes::SYSTEM_ERROR); + } + /// Default - 0. + return terminal_size.ws_col; } po::options_description createOptionsDescription(const std::string & caption, uint16_t terminal_width) diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 1493d30ea01..1622e12712e 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -7,8 +7,9 @@ #include #include +#include -#include +#include namespace DB @@ -25,8 +26,48 @@ thread_local ThreadStatus * current_thread = nullptr; thread_local ThreadStatus * main_thread = nullptr; #if !defined(SANITIZER) && !defined(ARCADIA_BUILD) - alignas(4096) static thread_local char alt_stack[std::max(MINSIGSTKSZ, 4096)]; - static thread_local bool has_alt_stack = false; +namespace +{ + +/// Alternative stack for signal handling. +/// +/// This stack should not be located in the TLS (thread local storage), since: +/// - TLS locates data on the per-thread stack +/// - And in case of stack in the signal handler will grow too much, +/// it will start overwriting TLS storage +/// (and note, that it is not too small, due to StackTrace obtaining) +/// - Plus there is no way to determine TLS block size, yes there is +/// __pthread_get_minstack() in glibc, but it is private and hence not portable. +/// +/// Also we should not use getStackSize() (pthread_attr_getstack()) since it +/// will return 8MB, and this is too huge for signal stack. +struct ThreadStack +{ + ThreadStack() + : data(aligned_alloc(getPageSize(), size)) + { + /// Add a guard page + /// (and since the stack grows downward, we need to protect the first page). + mprotect(data, getPageSize(), PROT_NONE); + } + ~ThreadStack() + { + mprotect(data, getPageSize(), PROT_WRITE|PROT_READ); + free(data); + } + + static size_t getSize() { return size; } + void * getData() const { return data; } + +private: + static constexpr size_t size = 16 << 10; /// 16 KiB - not too big but enough to handle error. + void * data; +}; + +} + +static thread_local ThreadStack alt_stack; +static thread_local bool has_alt_stack = false; #endif @@ -54,9 +95,9 @@ ThreadStatus::ThreadStatus() /// We have to call 'sigaltstack' before first 'sigaction'. (It does not work other way, for unknown reason). 
stack_t altstack_description{}; - altstack_description.ss_sp = alt_stack; + altstack_description.ss_sp = alt_stack.getData(); altstack_description.ss_flags = 0; - altstack_description.ss_size = sizeof(alt_stack); + altstack_description.ss_size = alt_stack.getSize(); if (0 != sigaltstack(&altstack_description, nullptr)) { diff --git a/src/Common/checkStackSize.cpp b/src/Common/checkStackSize.cpp index 8278b510282..dfb1910a8eb 100644 --- a/src/Common/checkStackSize.cpp +++ b/src/Common/checkStackSize.cpp @@ -22,6 +22,52 @@ namespace DB static thread_local void * stack_address = nullptr; static thread_local size_t max_stack_size = 0; +/** + * @param out_address - if not nullptr, here the address of the stack will be written. + * @return stack size + */ +size_t getStackSize(void ** out_address) +{ + using namespace DB; + + size_t size; + void * address; + +#if defined(OS_DARWIN) + // pthread_get_stacksize_np() returns a value too low for the main thread on + // OSX 10.9, http://mail.openjdk.java.net/pipermail/hotspot-dev/2013-October/011369.html + // + // Multiple workarounds possible, adopt the one made by https://github.com/robovm/robovm/issues/274 + // https://developer.apple.com/library/mac/documentation/Cocoa/Conceptual/Multithreading/CreatingThreads/CreatingThreads.html + // Stack size for the main thread is 8MB on OSX excluding the guard page size. + pthread_t thread = pthread_self(); + size = pthread_main_np() ? (8 * 1024 * 1024) : pthread_get_stacksize_np(thread); + + // stack address points to the start of the stack, not the end how it's returned by pthread_get_stackaddr_np + address = reinterpret_cast(reinterpret_cast(pthread_get_stackaddr_np(thread)) - max_stack_size); +#else + pthread_attr_t attr; +# if defined(__FreeBSD__) || defined(OS_SUNOS) + pthread_attr_init(&attr); + if (0 != pthread_attr_get_np(pthread_self(), &attr)) + throwFromErrno("Cannot pthread_attr_get_np", ErrorCodes::CANNOT_PTHREAD_ATTR); +# else + if (0 != pthread_getattr_np(pthread_self(), &attr)) + throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); +# endif + + SCOPE_EXIT({ pthread_attr_destroy(&attr); }); + + if (0 != pthread_attr_getstack(&attr, &address, &size)) + throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); +#endif // OS_DARWIN + + if (out_address) + *out_address = address; + + return size; +} + /** It works fine when interpreters are instantiated by ClickHouse code in properly prepared threads, * but there are cases when ClickHouse runs as a library inside another application. * If application is using user-space lightweight threads with manually allocated stacks, @@ -34,36 +80,7 @@ __attribute__((__weak__)) void checkStackSize() using namespace DB; if (!stack_address) - { -#if defined(OS_DARWIN) - // pthread_get_stacksize_np() returns a value too low for the main thread on - // OSX 10.9, http://mail.openjdk.java.net/pipermail/hotspot-dev/2013-October/011369.html - // - // Multiple workarounds possible, adopt the one made by https://github.com/robovm/robovm/issues/274 - // https://developer.apple.com/library/mac/documentation/Cocoa/Conceptual/Multithreading/CreatingThreads/CreatingThreads.html - // Stack size for the main thread is 8MB on OSX excluding the guard page size. - pthread_t thread = pthread_self(); - max_stack_size = pthread_main_np() ? 
(8 * 1024 * 1024) : pthread_get_stacksize_np(thread); - - // stack_address points to the start of the stack, not the end how it's returned by pthread_get_stackaddr_np - stack_address = reinterpret_cast(reinterpret_cast(pthread_get_stackaddr_np(thread)) - max_stack_size); -#else - pthread_attr_t attr; -# if defined(__FreeBSD__) || defined(OS_SUNOS) - pthread_attr_init(&attr); - if (0 != pthread_attr_get_np(pthread_self(), &attr)) - throwFromErrno("Cannot pthread_attr_get_np", ErrorCodes::CANNOT_PTHREAD_ATTR); -# else - if (0 != pthread_getattr_np(pthread_self(), &attr)) - throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); -# endif - - SCOPE_EXIT({ pthread_attr_destroy(&attr); }); - - if (0 != pthread_attr_getstack(&attr, &stack_address, &max_stack_size)) - throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); -#endif // OS_DARWIN - } + max_stack_size = getStackSize(&stack_address); const void * frame_address = __builtin_frame_address(0); uintptr_t int_frame_address = reinterpret_cast(frame_address); diff --git a/src/Common/ya.make b/src/Common/ya.make index 4558cd432ad..60dfd5f6bee 100644 --- a/src/Common/ya.make +++ b/src/Common/ya.make @@ -63,7 +63,7 @@ SRCS( PipeFDs.cpp ProcfsMetricsProvider.cpp ProfileEvents.cpp - ProgressBar.cpp + ProgressIndication.cpp QueryProfiler.cpp RWLock.cpp RemoteHostFilter.cpp diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index a50b734c5b4..893fe16abdf 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -23,9 +23,10 @@ using IndexToLogEntry = std::unordered_map; enum class ChangelogVersion : uint8_t { V0 = 0, + V1 = 1, /// with 64 bit buffer header }; -static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; +static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V1; struct ChangelogRecordHeader { diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 7520f9b3ba2..3575966410c 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -204,7 +204,7 @@ SnapshotMetadataPtr KeeperStorageSnapshot::deserialize(KeeperStorage & storage, uint8_t version; readBinary(version, in); SnapshotVersion current_version = static_cast(version); - if (current_version > SnapshotVersion::V1) + if (current_version > CURRENT_SNAPSHOT_VERSION) throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported snapshot version {}", version); SnapshotMetadataPtr result = deserializeSnapshotMetadata(in); diff --git a/src/Coordination/KeeperSnapshotManager.h b/src/Coordination/KeeperSnapshotManager.h index 3dbd7c9328e..57347c37b9f 100644 --- a/src/Coordination/KeeperSnapshotManager.h +++ b/src/Coordination/KeeperSnapshotManager.h @@ -14,8 +14,11 @@ enum SnapshotVersion : uint8_t { V0 = 0, V1 = 1, /// with ACL map + V2 = 2, /// with 64 bit buffer header }; +static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V2; + struct KeeperStorageSnapshot { public: @@ -30,7 +33,7 @@ public: KeeperStorage * storage; - SnapshotVersion version = SnapshotVersion::V1; + SnapshotVersion version = CURRENT_SNAPSHOT_VERSION; SnapshotMetadataPtr snapshot_meta; int64_t session_id; size_t snapshot_container_size; diff --git a/src/Core/QueryProcessingStage.cpp b/src/Core/QueryProcessingStage.cpp index 14bde0e548d..b5b837e1f61 100644 --- a/src/Core/QueryProcessingStage.cpp +++ b/src/Core/QueryProcessingStage.cpp @@ -24,6 +24,8 @@ namespace QueryProcessingStage stage = 
WithMergeableState; else if (stage_string == "with_mergeable_state_after_aggregation") stage = WithMergeableStateAfterAggregation; + else if (stage_string == "with_mergeable_state_after_aggregation_and_limit") + stage = WithMergeableStateAfterAggregationAndLimit; else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown query processing stage: {}", stage_string); diff --git a/src/Core/QueryProcessingStage.h b/src/Core/QueryProcessingStage.h index 01e7e12ab1e..7ccaa17eaed 100644 --- a/src/Core/QueryProcessingStage.h +++ b/src/Core/QueryProcessingStage.h @@ -26,8 +26,15 @@ namespace QueryProcessingStage /// It is used for auto distributed_group_by_no_merge optimization for distributed engine. /// (See comments in StorageDistributed). WithMergeableStateAfterAggregation = 3, + /// Same as WithMergeableStateAfterAggregation but also will apply limit on each shard. + /// + /// This query stage will be used for auto + /// distributed_group_by_no_merge/distributed_push_down_limit + /// optimization. + /// (See comments in StorageDistributed). + WithMergeableStateAfterAggregationAndLimit = 4, - MAX = 4, + MAX = 5, }; inline const char * toString(UInt64 stage) @@ -38,6 +45,7 @@ namespace QueryProcessingStage "WithMergeableState", "Complete", "WithMergeableStateAfterAggregation", + "WithMergeableStateAfterAggregationAndLimit", }; return stage < MAX ? data[stage] diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a2e131dd0b8..84e7500b064 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -118,6 +118,7 @@ class IColumn; \ M(UInt64, parallel_distributed_insert_select, 0, "Process distributed INSERT SELECT query in the same cluster on local tables on every shard, if 1 SELECT is executed on each shard, if 2 SELECT and INSERT is executed on each shard", 0) \ M(UInt64, distributed_group_by_no_merge, 0, "If 1, Do not merge aggregation states from different servers for distributed queries (shards will process query up to the Complete stage, initiator just proxies the data from the shards). If 2 the initiator will apply ORDER BY and LIMIT stages (it is not in case when shard process query up to the Complete stage)", 0) \ + M(UInt64, distributed_push_down_limit, 0, "If 1, LIMIT will be applied on each shard separatelly. Usually you don't need to use it, since this will be done automatically if it is possible, i.e. for simple query SELECT FROM LIMIT.", 0) \ M(Bool, optimize_distributed_group_by_sharding_key, false, "Optimize GROUP BY sharding_key queries (by avoiding costly aggregation on the initiator server).", 0) \ M(UInt64, optimize_skip_unused_shards_limit, 1000, "Limit for number of sharding key values, turns off optimize_skip_unused_shards if the limit is reached", 0) \ M(Bool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. 
Optimization to skip unused shards if SELECT query filters by sharding_key.", 0) \ @@ -563,7 +564,8 @@ class IColumn; M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \ M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \ M(Bool, cross_to_inner_join_rewrite, true, "Use inner join instead of comma/cross join if possible", 0) \ - + \ + M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. diff --git a/src/DataStreams/PostgreSQLBlockInputStream.cpp b/src/DataStreams/PostgreSQLBlockInputStream.cpp index a41280847a5..a486df83025 100644 --- a/src/DataStreams/PostgreSQLBlockInputStream.cpp +++ b/src/DataStreams/PostgreSQLBlockInputStream.cpp @@ -170,7 +170,7 @@ void PostgreSQLBlockInputStream::insertValue(IColumn & column, std::string_view { ReadBufferFromString in(value); time_t time = 0; - readDateTimeText(time, in); + readDateTimeText(time, in, assert_cast(data_type.get())->getTimeZone()); if (time < 0) time = 0; assert_cast(column).insertValue(time); @@ -272,11 +272,11 @@ void PostgreSQLBlockInputStream::prepareArrayInfo(size_t column_idx, const DataT else if (which.isDate()) parser = [](std::string & field) -> Field { return UInt16{LocalDate{field}.getDayNum()}; }; else if (which.isDateTime()) - parser = [](std::string & field) -> Field + parser = [nested](std::string & field) -> Field { ReadBufferFromString in(field); time_t time = 0; - readDateTimeText(time, in); + readDateTimeText(time, in, assert_cast(nested.get())->getTimeZone()); return time; }; else if (which.isDecimal32()) diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index f1f60ae2ac4..8b7cf9635b4 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -112,6 +112,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; format_settings.with_names_use_header = settings.input_format_with_names_use_header; format_settings.write_statistics = settings.output_format_write_statistics; + format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 33d51b1797f..1773f2cc2c6 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -52,6 +52,7 @@ struct FormatSettings struct { UInt64 row_group_size = 1000000; + bool low_cardinality_as_dictionary = false; } arrow; struct diff --git a/src/Formats/MySQLBlockInputStream.cpp b/src/Formats/MySQLBlockInputStream.cpp index 3ea86c82fa3..91e51de4603 100644 --- a/src/Formats/MySQLBlockInputStream.cpp +++ b/src/Formats/MySQLBlockInputStream.cpp @@ -169,7 +169,7 @@ namespace { ReadBufferFromString in(value); time_t time = 0; - readDateTimeText(time, in); + readDateTimeText(time, in, assert_cast(data_type).getTimeZone()); if (time < 0) time = 0; assert_cast(column).insertValue(time); diff --git a/src/IO/Progress.cpp b/src/IO/Progress.cpp index f255a39955b..7cc1504a978 
100644 --- a/src/IO/Progress.cpp +++ b/src/IO/Progress.cpp @@ -63,6 +63,83 @@ void ProgressValues::writeJSON(WriteBuffer & out) const writeCString("\"}", out); } +bool Progress::incrementPiecewiseAtomically(const Progress & rhs) +{ + read_rows += rhs.read_rows; + read_bytes += rhs.read_bytes; + read_raw_bytes += rhs.read_raw_bytes; + + total_rows_to_read += rhs.total_rows_to_read; + total_raw_bytes_to_read += rhs.total_raw_bytes_to_read; + + written_rows += rhs.written_rows; + written_bytes += rhs.written_bytes; + + return rhs.read_rows || rhs.written_rows; +} + +void Progress::reset() +{ + read_rows = 0; + read_bytes = 0; + read_raw_bytes = 0; + + total_rows_to_read = 0; + total_raw_bytes_to_read = 0; + + written_rows = 0; + written_bytes = 0; +} + +ProgressValues Progress::getValues() const +{ + ProgressValues res; + + res.read_rows = read_rows.load(std::memory_order_relaxed); + res.read_bytes = read_bytes.load(std::memory_order_relaxed); + res.read_raw_bytes = read_raw_bytes.load(std::memory_order_relaxed); + + res.total_rows_to_read = total_rows_to_read.load(std::memory_order_relaxed); + res.total_raw_bytes_to_read = total_raw_bytes_to_read.load(std::memory_order_relaxed); + + res.written_rows = written_rows.load(std::memory_order_relaxed); + res.written_bytes = written_bytes.load(std::memory_order_relaxed); + + return res; +} + +ProgressValues Progress::fetchAndResetPiecewiseAtomically() +{ + ProgressValues res; + + res.read_rows = read_rows.fetch_and(0); + res.read_bytes = read_bytes.fetch_and(0); + res.read_raw_bytes = read_raw_bytes.fetch_and(0); + + res.total_rows_to_read = total_rows_to_read.fetch_and(0); + res.total_raw_bytes_to_read = total_raw_bytes_to_read.fetch_and(0); + + res.written_rows = written_rows.fetch_and(0); + res.written_bytes = written_bytes.fetch_and(0); + + return res; +} + +Progress & Progress::operator=(Progress && other) +{ + read_rows = other.read_rows.load(std::memory_order_relaxed); + read_bytes = other.read_bytes.load(std::memory_order_relaxed); + read_raw_bytes = other.read_raw_bytes.load(std::memory_order_relaxed); + + total_rows_to_read = other.total_rows_to_read.load(std::memory_order_relaxed); + total_raw_bytes_to_read = other.total_raw_bytes_to_read.load(std::memory_order_relaxed); + + written_rows = other.written_rows.load(std::memory_order_relaxed); + written_bytes = other.written_bytes.load(std::memory_order_relaxed); + + return *this; +} + void Progress::read(ReadBuffer & in, UInt64 server_revision) { ProgressValues values; diff --git a/src/IO/Progress.h b/src/IO/Progress.h index 64bf3a404af..446acef9abd 100644 --- a/src/IO/Progress.h +++ b/src/IO/Progress.h @@ -5,6 +5,8 @@ #include #include +#include + namespace DB { @@ -17,7 +19,11 @@ struct ProgressValues { size_t read_rows; size_t read_bytes; + size_t read_raw_bytes; + size_t total_rows_to_read; + size_t total_raw_bytes_to_read; + size_t written_rows; size_t written_bytes; @@ -45,6 +51,16 @@ struct WriteProgress : written_rows(written_rows_), written_bytes(written_bytes_) {} }; +struct FileProgress +{ + /// Here read_bytes (raw bytes) - do not equal ReadProgress::read_bytes, which are calculated according to column types. + size_t read_bytes; + size_t total_bytes_to_read; + + FileProgress(size_t read_bytes_, size_t total_bytes_to_read_ = 0) : read_bytes(read_bytes_), total_bytes_to_read(total_bytes_to_read_) {} +}; + + /** Progress of query execution. * Values, transferred over network are deltas - how much was done after previously sent value. 
* The same struct is also used for summarized values. @@ -53,87 +69,50 @@ struct Progress { std::atomic read_rows {0}; /// Rows (source) processed. std::atomic read_bytes {0}; /// Bytes (uncompressed, source) processed. + std::atomic read_raw_bytes {0}; /// Raw bytes processed. - /** How much rows must be processed, in total, approximately. Non-zero value is sent when there is information about some new part of job. - * Received values must be summed to get estimate of total rows to process. + /** How much rows/bytes must be processed, in total, approximately. Non-zero value is sent when there is information about + * some new part of job. Received values must be summed to get estimate of total rows to process. + * `total_raw_bytes_to_process` is used for file table engine or when reading from file descriptor. * Used for rendering progress bar on client. */ std::atomic total_rows_to_read {0}; - + std::atomic total_raw_bytes_to_read {0}; std::atomic written_rows {0}; std::atomic written_bytes {0}; Progress() = default; + Progress(size_t read_rows_, size_t read_bytes_, size_t total_rows_to_read_ = 0) : read_rows(read_rows_), read_bytes(read_bytes_), total_rows_to_read(total_rows_to_read_) {} + explicit Progress(ReadProgress read_progress) : read_rows(read_progress.read_rows), read_bytes(read_progress.read_bytes), total_rows_to_read(read_progress.total_rows_to_read) {} + explicit Progress(WriteProgress write_progress) : written_rows(write_progress.written_rows), written_bytes(write_progress.written_bytes) {} + explicit Progress(FileProgress file_progress) + : read_raw_bytes(file_progress.read_bytes), total_raw_bytes_to_read(file_progress.total_bytes_to_read) {} + void read(ReadBuffer & in, UInt64 server_revision); + void write(WriteBuffer & out, UInt64 client_revision) const; + /// Progress in JSON format (single line, without whitespaces) is used in HTTP headers. void writeJSON(WriteBuffer & out) const; /// Each value separately is changed atomically (but not whole object). 
- bool incrementPiecewiseAtomically(const Progress & rhs) - { - read_rows += rhs.read_rows; - read_bytes += rhs.read_bytes; - total_rows_to_read += rhs.total_rows_to_read; - written_rows += rhs.written_rows; - written_bytes += rhs.written_bytes; + bool incrementPiecewiseAtomically(const Progress & rhs); - return rhs.read_rows || rhs.written_rows; - } + void reset(); - void reset() - { - read_rows = 0; - read_bytes = 0; - total_rows_to_read = 0; - written_rows = 0; - written_bytes = 0; - } + ProgressValues getValues() const; - ProgressValues getValues() const - { - ProgressValues res; + ProgressValues fetchAndResetPiecewiseAtomically(); - res.read_rows = read_rows.load(std::memory_order_relaxed); - res.read_bytes = read_bytes.load(std::memory_order_relaxed); - res.total_rows_to_read = total_rows_to_read.load(std::memory_order_relaxed); - res.written_rows = written_rows.load(std::memory_order_relaxed); - res.written_bytes = written_bytes.load(std::memory_order_relaxed); - - return res; - } - - ProgressValues fetchAndResetPiecewiseAtomically() - { - ProgressValues res; - - res.read_rows = read_rows.fetch_and(0); - res.read_bytes = read_bytes.fetch_and(0); - res.total_rows_to_read = total_rows_to_read.fetch_and(0); - res.written_rows = written_rows.fetch_and(0); - res.written_bytes = written_bytes.fetch_and(0); - - return res; - } - - Progress & operator=(Progress && other) - { - read_rows = other.read_rows.load(std::memory_order_relaxed); - read_bytes = other.read_bytes.load(std::memory_order_relaxed); - total_rows_to_read = other.total_rows_to_read.load(std::memory_order_relaxed); - written_rows = other.written_rows.load(std::memory_order_relaxed); - written_bytes = other.written_bytes.load(std::memory_order_relaxed); - - return *this; - } + Progress & operator=(Progress && other); Progress(Progress && other) { diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index dd5d9e67cd7..babdc953514 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -6,7 +6,12 @@ #include #include #include +#include #include +#include +#include +#include +#include namespace ProfileEvents @@ -32,6 +37,7 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; extern const int CANNOT_SEEK_THROUGH_FILE; extern const int CANNOT_SELECT; + extern const int CANNOT_FSTAT; } @@ -170,4 +176,28 @@ bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds) return res > 0; } + +off_t ReadBufferFromFileDescriptor::size() +{ + struct stat buf; + int res = fstat(fd, &buf); + if (-1 == res) + throwFromErrnoWithPath("Cannot execute fstat " + getFileName(), getFileName(), ErrorCodes::CANNOT_FSTAT); + return buf.st_size; +} + + +void ReadBufferFromFileDescriptor::setProgressCallback(ContextPtr context) +{ + auto file_progress_callback = context->getFileProgressCallback(); + + if (!file_progress_callback) + return; + + setProfileCallback([file_progress_callback](const ProfileInfo & progress) + { + file_progress_callback(FileProgress(progress.bytes_read, 0)); + }); +} + } diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index 0779b215067..bf22bb3d4a3 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -38,6 +39,9 @@ public: /// If 'offset' is small enough to stay in buffer after seek, then true seek in file does not happen. 
off_t seek(off_t off, int whence) override; + off_t size(); + + void setProgressCallback(ContextPtr context); private: /// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout. bool poll(size_t timeout_microseconds); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index c42482a8b62..c673eb0d408 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -180,10 +180,15 @@ private: std::shared_ptr access; std::shared_ptr initial_row_policy; String current_database; - Settings settings; /// Setting for query execution. + Settings settings; /// Setting for query execution. + using ProgressCallback = std::function; - ProgressCallback progress_callback; /// Callback for tracking progress of query execution. - QueryStatus * process_list_elem = nullptr; /// For tracking total resource usage for query. + ProgressCallback progress_callback; /// Callback for tracking progress of query execution. + + using FileProgressCallback = std::function; + FileProgressCallback file_progress_callback; /// Callback for tracking progress of file loading. + + QueryStatus * process_list_elem = nullptr; /// For tracking total resource usage for query. StorageID insertion_table = StorageID::createEmpty(); /// Saved insertion table in query context String default_format; /// Format, used when server formats data by itself and if query does not have FORMAT specification. @@ -588,6 +593,9 @@ public: /// Used in InterpreterSelectQuery to pass it to the IBlockInputStream. ProgressCallback getProgressCallback() const; + void setFileProgressCallback(FileProgressCallback && callback) { file_progress_callback = callback; } + FileProgressCallback getFileProgressCallback() const { return file_progress_callback; } + /** Set in executeQuery and InterpreterSelectQuery. Then it is used in IBlockInputStream, * to update and monitor information about the total number of resources spent for the query. 
*/ diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index ee208631c9b..28d88bdd8df 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -831,14 +831,17 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (create.attach && !create.storage && !create.columns_list) { auto database = DatabaseCatalog::instance().getDatabase(database_name); + if (database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, create.table); - if (typeid_cast(database.get()) && getContext()->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) + + if (auto* ptr = typeid_cast(database.get()); + ptr && getContext()->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { create.database = database_name; guard->releaseTableLock(); - return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, getContext()); + return ptr->tryEnqueueReplicatedDDL(query_ptr, getContext()); } } @@ -926,11 +929,13 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (need_add_to_database && database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); - if (typeid_cast(database.get()) && getContext()->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) + + if (auto * ptr = typeid_cast(database.get()); + ptr && getContext()->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { assertOrSetUUID(create, database); guard->releaseTableLock(); - return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, getContext()); + return ptr->tryEnqueueReplicatedDDL(query_ptr, getContext()); } } @@ -992,8 +997,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } data_path = database->getTableDataPath(create); + if (!create.attach && !data_path.empty() && fs::exists(fs::path{getContext()->getPath()} / data_path)) - throw Exception(storage_already_exists_error_code, "Directory for {} data {} already exists", Poco::toLower(storage_name), String(data_path)); + throw Exception(storage_already_exists_error_code, + "Directory for {} data {} already exists", Poco::toLower(storage_name), String(data_path)); } else { diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 225bf9ec651..4d9e293d762 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -189,12 +189,11 @@ BlockIO InterpreterInsertQuery::execute() const auto & union_modes = select_query.list_of_modes; /// ASTSelectWithUnionQuery is not normalized now, so it may pass some queries which can be Trivial select queries - is_trivial_insert_select - = std::all_of( - union_modes.begin(), - union_modes.end(), - [](const ASTSelectWithUnionQuery::Mode & mode) { return mode == ASTSelectWithUnionQuery::Mode::ALL; }) - && std::all_of(selects.begin(), selects.end(), [](const ASTPtr & select) { return isTrivialSelect(select); }); + const auto mode_is_all = [](const auto & mode) { return mode == ASTSelectWithUnionQuery::Mode::ALL; }; + + is_trivial_insert_select = + std::all_of(union_modes.begin(), union_modes.end(), std::move(mode_is_all)) + && std::all_of(selects.begin(), selects.end(), isTrivialSelect); } if (is_trivial_insert_select) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp 
b/src/Interpreters/InterpreterSelectQuery.cpp index 7c1ad143594..7cca527cbc1 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -633,7 +633,7 @@ Block InterpreterSelectQuery::getSampleBlockImpl() /// Running on the initiating server during distributed processing or if query is not distributed. /// /// Also note that with distributed_group_by_no_merge=2 (i.e. when optimize_distributed_group_by_sharding_key takes place) - /// the query on the remote server will be processed up to WithMergeableStateAfterAggregation, + /// the query on the remote server will be processed up to WithMergeableStateAfterAggregationAndLimit, /// So it will do partial second stage (second_stage=true), and initiator will do the final part. bool second_stage = from_stage <= QueryProcessingStage::WithMergeableState && options.to_stage > QueryProcessingStage::WithMergeableState; @@ -705,7 +705,7 @@ Block InterpreterSelectQuery::getSampleBlockImpl() return res; } - if (options.to_stage == QueryProcessingStage::Enum::WithMergeableStateAfterAggregation) + if (options.to_stage >= QueryProcessingStage::Enum::WithMergeableStateAfterAggregation) { // It's different from selected_columns, see the comment above for // WithMergeableState stage. @@ -1012,10 +1012,10 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu /// Support optimize_distributed_group_by_sharding_key /// Is running on the initiating server during distributed processing? - if (from_stage == QueryProcessingStage::WithMergeableStateAfterAggregation) + if (from_stage >= QueryProcessingStage::WithMergeableStateAfterAggregation) from_aggregation_stage = true; /// Is running on remote servers during distributed processing? - if (options.to_stage == QueryProcessingStage::WithMergeableStateAfterAggregation) + if (options.to_stage >= QueryProcessingStage::WithMergeableStateAfterAggregation) to_aggregation_stage = true; /// Read the data from Storage. from_stage - to what stage the request was completed in Storage. @@ -1301,7 +1301,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu * but there is no aggregation, then on the remote servers ORDER BY was made * - therefore, we merge the sorted streams from remote servers. * - * Also in case of remote servers was process the query up to WithMergeableStateAfterAggregation + * Also in case of remote servers was process the query up to WithMergeableStateAfterAggregationAndLimit * (distributed_group_by_no_merge=2 or optimize_distributed_group_by_sharding_key=1 takes place), * then merge the sorted streams is enough, since remote servers already did full ORDER BY. 
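The comparisons in this file move from `==` to `>=` because the new stage WithMergeableStateAfterAggregationAndLimit is ordered after WithMergeableStateAfterAggregation, and anything at or beyond the after-aggregation point should be treated the same way. A reduced sketch of the idea; the ordering follows the names used in the hunks above, while the numeric values are purely illustrative.

```cpp
#include <iostream>

/// Reduced model of QueryProcessingStage, only for illustration.
enum Stage
{
    FetchColumns,
    WithMergeableState,
    Complete,
    WithMergeableStateAfterAggregation,
    WithMergeableStateAfterAggregationAndLimit,
};

/// '>= WithMergeableStateAfterAggregation' covers both after-aggregation stages,
/// which is why the equality checks above were relaxed to ordered comparisons.
static bool isAfterAggregation(Stage stage) { return stage >= WithMergeableStateAfterAggregation; }

int main()
{
    std::cout << isAfterAggregation(WithMergeableStateAfterAggregation) << '\n';          // 1
    std::cout << isAfterAggregation(WithMergeableStateAfterAggregationAndLimit) << '\n';  // 1
    std::cout << isAfterAggregation(WithMergeableState) << '\n';                          // 0
}
```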
*/ @@ -1335,13 +1335,15 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu } } + bool apply_limit = options.to_stage != QueryProcessingStage::WithMergeableStateAfterAggregation; + bool apply_offset = options.to_stage != QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit; bool has_prelimit = false; - if (!to_aggregation_stage && + if (apply_limit && query.limitLength() && !query.limit_with_ties && !hasWithTotalsInAnySubqueryInFromClause(query) && !query.arrayJoinExpressionList() && !query.distinct && !expressions.hasLimitBy() && !settings.extremes && !has_withfill) { - executePreLimit(query_plan, false); + executePreLimit(query_plan, /* do_not_skip_offset= */!apply_offset); has_prelimit = true; } @@ -1368,7 +1370,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu } /// Projection not be done on the shards, since then initiator will not find column in blocks. - /// (significant only for WithMergeableStateAfterAggregation). + /// (significant only for WithMergeableStateAfterAggregation/WithMergeableStateAfterAggregationAndLimit). if (!to_aggregation_stage) { /// We must do projection after DISTINCT because projection may remove some columns. @@ -1379,10 +1381,10 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu executeExtremes(query_plan); /// Limit is no longer needed if there is prelimit. - if (!to_aggregation_stage && !has_prelimit) + if (apply_limit && !has_prelimit) executeLimit(query_plan); - if (!to_aggregation_stage) + if (apply_offset) executeOffset(query_plan); } } diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index f97001883bd..f76d51e765b 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -43,11 +43,8 @@ # include "config_core.h" #endif - namespace DB { - - namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -56,6 +53,7 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int TIMEOUT_EXCEEDED; extern const int TABLE_WAS_NOT_DROPPED; + extern const int NO_ZOOKEEPER; } @@ -131,6 +129,8 @@ AccessType getRequiredAccessType(StorageActionBlockType action_type) throw Exception("Unknown action type: " + std::to_string(action_type), ErrorCodes::LOGICAL_ERROR); } +constexpr std::string_view table_is_not_replicated = "Table {} is not replicated"; + } /// Implements SYSTEM [START|STOP] @@ -212,11 +212,16 @@ BlockIO InterpreterSystemQuery::execute() system_context->setSetting("profile", getContext()->getSystemProfileName()); /// Make canonical query for simpler processing - if (!query.table.empty()) + if (query.type == Type::RELOAD_DICTIONARY) + { + if (!query.database.empty()) + query.table = query.database + "." + query.table; + } + else if (!query.table.empty()) + { table_id = getContext()->resolveStorageID(StorageID(query.database, query.table), Context::ResolveOrdinary); + } - if (!query.target_dictionary.empty() && !query.database.empty()) - query.target_dictionary = query.database + "." 
+ query.target_dictionary; volume_ptr = {}; if (!query.storage_policy.empty() && !query.volume.empty()) @@ -286,7 +291,7 @@ BlockIO InterpreterSystemQuery::execute() getContext()->checkAccess(AccessType::SYSTEM_RELOAD_DICTIONARY); auto & external_dictionaries_loader = system_context->getExternalDictionariesLoader(); - external_dictionaries_loader.reloadDictionary(query.target_dictionary, getContext()); + external_dictionaries_loader.reloadDictionary(query.table, getContext()); ExternalDictionariesLoader::resetAll(); @@ -296,8 +301,8 @@ BlockIO InterpreterSystemQuery::execute() { getContext()->checkAccess(AccessType::SYSTEM_RELOAD_DICTIONARY); executeCommandsAndThrowIfError( - [&] () { system_context->getExternalDictionariesLoader().reloadAllTriedToLoad(); }, - [&] () { system_context->getEmbeddedDictionaries().reload(); } + [&] { system_context->getExternalDictionariesLoader().reloadAllTriedToLoad(); }, + [&] { system_context->getEmbeddedDictionaries().reload(); } ); ExternalDictionariesLoader::resetAll(); break; @@ -392,8 +397,10 @@ BlockIO InterpreterSystemQuery::execute() break; case Type::RESTART_REPLICA: if (!tryRestartReplica(table_id, system_context)) - throw Exception("There is no " + query.database + "." + query.table + " replicated table", - ErrorCodes::BAD_ARGUMENTS); + throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs()); + break; + case Type::RESTORE_REPLICA: + restoreReplica(); break; case Type::RESTART_DISK: restartDisk(query.disk); @@ -402,14 +409,14 @@ BlockIO InterpreterSystemQuery::execute() { getContext()->checkAccess(AccessType::SYSTEM_FLUSH_LOGS); executeCommandsAndThrowIfError( - [&] () { if (auto query_log = getContext()->getQueryLog()) query_log->flush(true); }, - [&] () { if (auto part_log = getContext()->getPartLog("")) part_log->flush(true); }, - [&] () { if (auto query_thread_log = getContext()->getQueryThreadLog()) query_thread_log->flush(true); }, - [&] () { if (auto trace_log = getContext()->getTraceLog()) trace_log->flush(true); }, - [&] () { if (auto text_log = getContext()->getTextLog()) text_log->flush(true); }, - [&] () { if (auto metric_log = getContext()->getMetricLog()) metric_log->flush(true); }, - [&] () { if (auto asynchronous_metric_log = getContext()->getAsynchronousMetricLog()) asynchronous_metric_log->flush(true); }, - [&] () { if (auto opentelemetry_span_log = getContext()->getOpenTelemetrySpanLog()) opentelemetry_span_log->flush(true); } + [&] { if (auto query_log = getContext()->getQueryLog()) query_log->flush(true); }, + [&] { if (auto part_log = getContext()->getPartLog("")) part_log->flush(true); }, + [&] { if (auto query_thread_log = getContext()->getQueryThreadLog()) query_thread_log->flush(true); }, + [&] { if (auto trace_log = getContext()->getTraceLog()) trace_log->flush(true); }, + [&] { if (auto text_log = getContext()->getTextLog()) text_log->flush(true); }, + [&] { if (auto metric_log = getContext()->getMetricLog()) metric_log->flush(true); }, + [&] { if (auto asynchronous_metric_log = getContext()->getAsynchronousMetricLog()) asynchronous_metric_log->flush(true); }, + [&] { if (auto opentelemetry_span_log = getContext()->getOpenTelemetrySpanLog()) opentelemetry_span_log->flush(true); } ); break; } @@ -423,12 +430,51 @@ BlockIO InterpreterSystemQuery::execute() return BlockIO(); } +void InterpreterSystemQuery::restoreReplica() +{ + getContext()->checkAccess(AccessType::SYSTEM_RESTORE_REPLICA, table_id); + + const zkutil::ZooKeeperPtr& zookeeper = getContext()->getZooKeeper(); + + 
if (zookeeper->expired()) + throw Exception(ErrorCodes::NO_ZOOKEEPER, + "Cannot restore table metadata because ZooKeeper session has expired"); + + const StoragePtr table_ptr = DatabaseCatalog::instance().getTable(table_id, getContext()); + + auto * const table_replicated_ptr = dynamic_cast(table_ptr.get()); + + if (table_replicated_ptr == nullptr) + throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs()); + + auto & table_replicated = *table_replicated_ptr; + + StorageReplicatedMergeTree::Status status; + table_replicated.getStatus(status); + + if (!status.is_readonly) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Replica must be readonly"); + + const String replica_name = table_replicated.getReplicaName(); + const String& zk_root_path = status.zookeeper_path; + + if (String replica_path = zk_root_path + "replicas/" + replica_name; zookeeper->exists(replica_path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Replica path is present at {} -- nothing to restore. " + "If you are sure that metadata it lost and replica path contain some garbage, " + "then use SYSTEM DROP REPLICA query first.", replica_path); + + table_replicated.restoreMetadataInZooKeeper(); +} StoragePtr InterpreterSystemQuery::tryRestartReplica(const StorageID & replica, ContextMutablePtr system_context, bool need_ddl_guard) { getContext()->checkAccess(AccessType::SYSTEM_RESTART_REPLICA, replica); - auto table_ddl_guard = need_ddl_guard ? DatabaseCatalog::instance().getDDLGuard(replica.getDatabaseName(), replica.getTableName()) : nullptr; + auto table_ddl_guard = need_ddl_guard + ? DatabaseCatalog::instance().getDDLGuard(replica.getDatabaseName(), replica.getTableName()) + : nullptr; + auto [database, table] = DatabaseCatalog::instance().tryGetDatabaseAndTable(replica, getContext()); ASTPtr create_ast; @@ -475,28 +521,23 @@ void InterpreterSystemQuery::restartReplicas(ContextMutablePtr system_context) auto & catalog = DatabaseCatalog::instance(); for (auto & elem : catalog.getDatabases()) - { - DatabasePtr & database = elem.second; - for (auto iterator = database->getTablesIterator(getContext()); iterator->isValid(); iterator->next()) - { - if (auto table = iterator->table()) - { - if (dynamic_cast(table.get())) - replica_names.emplace_back(StorageID{iterator->databaseName(), iterator->name()}); - } - } - } + for (auto it = elem.second->getTablesIterator(getContext()); it->isValid(); it->next()) + if (dynamic_cast(it->table().get())) + replica_names.emplace_back(it->databaseName(), it->name()); if (replica_names.empty()) return; TableGuards guards; + for (const auto & name : replica_names) guards.emplace(UniqueTableName{name.database_name, name.table_name}, nullptr); + for (auto & guard : guards) guard.second = catalog.getDDLGuard(guard.first.database_name, guard.first.table_name); ThreadPool pool(std::min(size_t(getNumberOfPhysicalCPUCores()), replica_names.size())); + for (auto & replica : replica_names) { LOG_TRACE(log, "Restarting replica on {}", replica.getNameForLogs()); @@ -516,7 +557,7 @@ void InterpreterSystemQuery::dropReplica(ASTSystemQuery & query) StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); if (!dropReplicaImpl(query, table)) - throw Exception("Table " + table_id.getNameForLogs() + " is not replicated", ErrorCodes::BAD_ARGUMENTS); + throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs()); } else if (!query.database.empty()) { @@ -628,7 +669,7 @@ void 
InterpreterSystemQuery::syncReplica(ASTSystemQuery &) LOG_TRACE(log, "SYNC REPLICA {}: OK", table_id.getNameForLogs()); } else - throw Exception("Table " + table_id.getNameForLogs() + " is not replicated", ErrorCodes::BAD_ARGUMENTS); + throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs()); } void InterpreterSystemQuery::flushDistributed(ASTSystemQuery &) @@ -659,6 +700,7 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() const auto & query = query_ptr->as(); using Type = ASTSystemQuery::Type; AccessRightsElements required_access; + switch (query.type) { case Type::SHUTDOWN: [[fallthrough]]; @@ -770,6 +812,11 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() required_access.emplace_back(AccessType::SYSTEM_DROP_REPLICA, query.database, query.table); break; } + case Type::RESTORE_REPLICA: + { + required_access.emplace_back(AccessType::SYSTEM_RESTORE_REPLICA, query.database, query.table); + break; + } case Type::SYNC_REPLICA: { required_access.emplace_back(AccessType::SYSTEM_SYNC_REPLICA, query.database, query.table); diff --git a/src/Interpreters/InterpreterSystemQuery.h b/src/Interpreters/InterpreterSystemQuery.h index 297f7225a92..6d1ad78a991 100644 --- a/src/Interpreters/InterpreterSystemQuery.h +++ b/src/Interpreters/InterpreterSystemQuery.h @@ -49,6 +49,9 @@ private: void restartReplicas(ContextMutablePtr system_context); void syncReplica(ASTSystemQuery & query); + + void restoreReplica(); + void dropReplica(ASTSystemQuery & query); bool dropReplicaImpl(ASTSystemQuery & query, const StoragePtr & table); void flushDistributed(ASTSystemQuery & query); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 8a6abf7714f..c5dec2cf214 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -102,12 +102,10 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, /// The current database in a distributed query need to be replaced with either /// the local current database or a shard's default database. 
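The next hunk in executeDDLQueryOnCluster.cpp replaces a `find_if(...) != end()` comparison with `std::any_of`, which states the same predicate directly. A small standalone illustration of the equivalence, using a reduced stand-in for AccessRightsElement:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

/// Reduced stand-in for AccessRightsElement, just enough to show the idiom.
struct Element
{
    std::string database;
    bool isEmptyDatabase() const { return database.empty(); }
};

int main()
{
    const std::vector<Element> query_requires_access{{"db1"}, {""}, {"db2"}};
    const auto is_empty_database = [](const Element & elem) { return elem.isEmptyDatabase(); };

    /// Old shape: find_if against end().
    const bool old_style = std::find_if(query_requires_access.begin(), query_requires_access.end(), is_empty_database)
        != query_requires_access.end();

    /// New shape: any_of expresses the intent directly and is equivalent.
    const bool new_style = std::any_of(query_requires_access.begin(), query_requires_access.end(), is_empty_database);

    std::cout << old_style << ' ' << new_style << '\n'; /// 1 1
}
```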
- bool need_replace_current_database - = (std::find_if( - query_requires_access.begin(), - query_requires_access.end(), - [](const AccessRightsElement & elem) { return elem.isEmptyDatabase(); }) - != query_requires_access.end()); + bool need_replace_current_database = std::any_of( + query_requires_access.begin(), + query_requires_access.end(), + [](const AccessRightsElement & elem) { return elem.isEmptyDatabase(); }); bool use_local_default_database = false; const String & current_database = context->getCurrentDatabase(); diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp index 821f41c086c..335575a6362 100644 --- a/src/Interpreters/getHeaderForProcessingStage.cpp +++ b/src/Interpreters/getHeaderForProcessingStage.cpp @@ -113,6 +113,7 @@ Block getHeaderForProcessingStage( case QueryProcessingStage::WithMergeableState: case QueryProcessingStage::Complete: case QueryProcessingStage::WithMergeableStateAfterAggregation: + case QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit: case QueryProcessingStage::MAX: { auto query = query_info.query->clone(); diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index 9f725c3cd59..5548667e1a7 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -130,9 +130,14 @@ void removeColumnNullability(ColumnWithTypeAndName & column) const auto & dict_type = typeid_cast(column.type.get())->getDictionaryType(); column.type = std::make_shared(removeNullable(dict_type)); - ColumnLowCardinality * col_as_lc = typeid_cast(column.column->assumeMutable().get()); - if (col_as_lc && col_as_lc->nestedIsNullable()) - col_as_lc->nestedRemoveNullable(); + if (column.column && column.column->lowCardinality()) + { + auto mut_col = IColumn::mutate(std::move(column.column)); + ColumnLowCardinality * col_as_lc = typeid_cast(mut_col.get()); + if (col_as_lc && col_as_lc->nestedIsNullable()) + col_as_lc->nestedRemoveNullable(); + column.column = std::move(mut_col); + } return; } diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index bf94df0bf50..5d01e124b0e 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -44,6 +44,8 @@ const char * ASTSystemQuery::typeToString(Type type) return "RESTART REPLICAS"; case Type::RESTART_REPLICA: return "RESTART REPLICA"; + case Type::RESTORE_REPLICA: + return "RESTORE REPLICA"; case Type::DROP_REPLICA: return "DROP REPLICA"; case Type::SYNC_REPLICA: @@ -119,18 +121,6 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &, << (settings.hilite ? hilite_none : ""); }; - auto print_database_dictionary = [&] - { - settings.ostr << " "; - if (!database.empty()) - { - settings.ostr << (settings.hilite ? hilite_identifier : "") << backQuoteIfNeed(database) - << (settings.hilite ? hilite_none : "") << "."; - } - settings.ostr << (settings.hilite ? hilite_identifier : "") << backQuoteIfNeed(target_dictionary) - << (settings.hilite ? 
hilite_none : ""); - }; - auto print_drop_replica = [&] { settings.ostr << " " << quoteString(replica); @@ -187,14 +177,14 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &, else if (!volume.empty()) print_on_volume(); } - else if (type == Type::RESTART_REPLICA || type == Type::SYNC_REPLICA || type == Type::FLUSH_DISTRIBUTED) + else if ( type == Type::RESTART_REPLICA + || type == Type::RESTORE_REPLICA + || type == Type::SYNC_REPLICA + || type == Type::FLUSH_DISTRIBUTED + || type == Type::RELOAD_DICTIONARY) { print_database_table(); } - else if (type == Type::RELOAD_DICTIONARY) - { - print_database_dictionary(); - } else if (type == Type::DROP_REPLICA) { print_drop_replica(); diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 6cd1443155f..cbe82cd936f 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -32,6 +32,7 @@ public: START_LISTEN_QUERIES, RESTART_REPLICAS, RESTART_REPLICA, + RESTORE_REPLICA, DROP_REPLICA, SYNC_REPLICA, RELOAD_DICTIONARY, @@ -65,7 +66,6 @@ public: Type type = Type::UNKNOWN; - String target_dictionary; String target_model; String database; String table; diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index eec79edc05e..e6af11399de 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -489,14 +489,12 @@ bool ParserPrefixUnaryOperatorExpression::parseImpl(Pos & pos, ASTPtr & node, Ex /** This is done, because among the unary operators there is only a minus and NOT. * But for a minus the chain of unary operators does not need to be supported. */ + size_t count = 1; if (it[0] && 0 == strncmp(it[0], "NOT", 3)) { - /// Was there an even number of NOTs. - bool even = false; - - const char ** jt; while (true) { + const char ** jt; for (jt = operators; *jt; jt += 2) if (parseOperator(pos, *jt, expected)) break; @@ -504,11 +502,8 @@ bool ParserPrefixUnaryOperatorExpression::parseImpl(Pos & pos, ASTPtr & node, Ex if (!*jt) break; - even = !even; + ++count; } - - if (even) - it = jt; /// Zero the result of parsing the first NOT. It turns out, as if there is no `NOT` chain at all. 
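Instead of folding pairs of NOT away during parsing, the rewritten ParserPrefixUnaryOperatorExpression counts the consecutive NOT tokens and wraps the operand in that many `not` function nodes. A standalone sketch of the wrapping step, with a toy AST node type standing in for ASTFunction/ASTExpressionList:

```cpp
#include <cstddef>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

/// Toy AST node, only for illustration.
struct Node
{
    std::string name;
    std::vector<std::shared_ptr<Node>> children;
};

/// Wrap the parsed operand in `count` unary "not" nodes, mirroring the new
/// parser behaviour: NOT NOT x becomes not(not(x)) instead of being folded away.
std::shared_ptr<Node> wrapInNot(std::shared_ptr<Node> operand, size_t count)
{
    std::shared_ptr<Node> node = std::move(operand);
    for (size_t i = 0; i < count; ++i)
    {
        auto function = std::make_shared<Node>();
        function->name = "not";
        function->children.push_back(node);
        node = function;
    }
    return node;
}

int main()
{
    auto operand = std::make_shared<Node>();
    operand->name = "x";

    const auto expr = wrapInNot(operand, 2); /// two consecutive NOT tokens
    std::cout << expr->name << '(' << expr->children[0]->name << "(x))\n"; /// not(not(x))
}
```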
} ASTPtr elem; @@ -519,19 +514,25 @@ bool ParserPrefixUnaryOperatorExpression::parseImpl(Pos & pos, ASTPtr & node, Ex node = elem; else { - /// the function corresponding to the operator - auto function = std::make_shared(); + for (size_t i = 0; i < count; ++i) + { + /// the function corresponding to the operator + auto function = std::make_shared(); - /// function arguments - auto exp_list = std::make_shared(); + /// function arguments + auto exp_list = std::make_shared(); - function->name = it[1]; - function->arguments = exp_list; - function->children.push_back(exp_list); + function->name = it[1]; + function->arguments = exp_list; + function->children.push_back(exp_list); - exp_list->children.push_back(elem); + if (node) + exp_list->children.push_back(node); + else + exp_list->children.push_back(elem); - node = function; + node = function; + } } return true; diff --git a/src/Parsers/New/AST/SystemQuery.cpp b/src/Parsers/New/AST/SystemQuery.cpp index 2be9ff951e0..d2fda6a3fbc 100644 --- a/src/Parsers/New/AST/SystemQuery.cpp +++ b/src/Parsers/New/AST/SystemQuery.cpp @@ -133,7 +133,7 @@ ASTPtr SystemQuery::convertToOld() const { auto table = std::static_pointer_cast(get(TABLE)->convertToOld()); query->database = table->getDatabaseName(); - query->target_dictionary = table->shortName(); + query->table = table->getTableId().table_name; } break; case QueryType::REPLICATED_SENDS: diff --git a/src/Parsers/ParserRenameQuery.cpp b/src/Parsers/ParserRenameQuery.cpp index e3b35249cd6..c42a0af88b2 100644 --- a/src/Parsers/ParserRenameQuery.cpp +++ b/src/Parsers/ParserRenameQuery.cpp @@ -95,21 +95,18 @@ bool ParserRenameQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ASTRenameQuery::Elements elements; - auto ignore_delim = [&]() - { - return exchange ? s_and.ignore(pos) : s_to.ignore(pos); - }; + const auto ignore_delim = [&] { return exchange ? 
s_and.ignore(pos) : s_to.ignore(pos); }; while (true) { if (!elements.empty() && !s_comma.ignore(pos)) break; - elements.push_back(ASTRenameQuery::Element()); + ASTRenameQuery::Element& ref = elements.emplace_back(); - if (!parseDatabaseAndTable(elements.back().from, pos, expected) + if (!parseDatabaseAndTable(ref.from, pos, expected) || !ignore_delim() - || !parseDatabaseAndTable(elements.back().to, pos, expected)) + || !parseDatabaseAndTable(ref.to, pos, expected)) return false; } diff --git a/src/Parsers/ParserSelectQuery.cpp b/src/Parsers/ParserSelectQuery.cpp index 12e83486af8..3f6607be0bc 100644 --- a/src/Parsers/ParserSelectQuery.cpp +++ b/src/Parsers/ParserSelectQuery.cpp @@ -97,9 +97,7 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) } } - bool has_distinct_on = false; - - /// SELECT [ALL/DISTINCT] [TOP N [WITH TIES]] expr list + /// SELECT [DISTINCT ON expr] [ALL/DISTINCT] [TOP N [WITH TIES]] expr list { bool has_all = false; if (!s_select.ignore(pos, expected)) @@ -108,8 +106,8 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (s_all.ignore(pos, expected)) has_all = true; - if (s_distinct_on.ignore(pos, expected)) { - has_distinct_on = true; + if (s_distinct_on.ignore(pos, expected)) + { if (!exp_list.parse(pos, limit_by_expression_list, expected)) return false; limit_by_length = std::make_shared(Field{UInt8(1)}); @@ -276,8 +274,8 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (limit_with_ties_occured) throw Exception("Can not use WITH TIES alongside LIMIT BY", ErrorCodes::LIMIT_BY_WITH_TIES_IS_NOT_SUPPORTED); - if (has_distinct_on) - throw Exception("Can not use distinct on alongside LIMIT BY", ErrorCodes::DISTINCT_ON_AND_LIMIT_BY_TOGETHER); + if (limit_by_length) + throw Exception("Can not use DISTINCT ON alongside LIMIT BY", ErrorCodes::DISTINCT_ON_AND_LIMIT_BY_TOGETHER); limit_by_length = limit_length; limit_by_offset = limit_offset; diff --git a/src/Parsers/ParserSystemQuery.cpp b/src/Parsers/ParserSystemQuery.cpp index a1487468ab3..66bd39e0202 100644 --- a/src/Parsers/ParserSystemQuery.cpp +++ b/src/Parsers/ParserSystemQuery.cpp @@ -15,6 +15,47 @@ namespace ErrorCodes namespace DB { +static bool parseQueryWithOnClusterAndMaybeTable(std::shared_ptr & res, IParser::Pos & pos, + Expected & expected, bool require_table, bool allow_string_literal) +{ + /// Better form for user: SYSTEM table ON CLUSTER cluster + /// Query rewritten form + form while executing on cluster: SYSTEM ON CLUSTER cluster table + /// Need to support both + String cluster; + bool parsed_on_cluster = false; + + if (ParserKeyword{"ON"}.ignore(pos, expected)) + { + if (!ASTQueryWithOnCluster::parse(pos, cluster, expected)) + return false; + parsed_on_cluster = true; + } + + bool parsed_table = false; + if (allow_string_literal) + { + ASTPtr ast; + if (ParserStringLiteral{}.parse(pos, ast, expected)) + { + res->database = {}; + res->table = ast->as().value.safeGet(); + parsed_table = true; + } + } + + if (!parsed_table) + parsed_table = parseDatabaseAndTableName(pos, expected, res->database, res->table); + + if (!parsed_table && require_table) + return false; + + if (!parsed_on_cluster && ParserKeyword{"ON"}.ignore(pos, expected)) + if (!ASTQueryWithOnCluster::parse(pos, cluster, expected)) + return false; + + res->cluster = cluster; + return true; +} bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & expected) { @@ -43,17 +84,7 @@ bool 
ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & { case Type::RELOAD_DICTIONARY: { - String cluster_str; - if (ParserKeyword{"ON"}.ignore(pos, expected)) - { - if (!ASTQueryWithOnCluster::parse(pos, cluster_str, expected)) - return false; - } - res->cluster = cluster_str; - ASTPtr ast; - if (ParserStringLiteral{}.parse(pos, ast, expected)) - res->target_dictionary = ast->as().value.safeGet(); - else if (!parseDatabaseAndTableName(pos, expected, res->database, res->target_dictionary)) + if (!parseQueryWithOnClusterAndMaybeTable(res, pos, expected, /* require table = */ true, /* allow_string_literal = */ true)) return false; break; } @@ -145,24 +176,21 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & break; } + /// FLUSH DISTRIBUTED requires table + /// START/STOP DISTRIBUTED SENDS does not require table case Type::STOP_DISTRIBUTED_SENDS: case Type::START_DISTRIBUTED_SENDS: - case Type::FLUSH_DISTRIBUTED: { - String cluster_str; - if (ParserKeyword{"ON"}.ignore(pos, expected)) - { - if (!ASTQueryWithOnCluster::parse(pos, cluster_str, expected)) - return false; - } - res->cluster = cluster_str; - if (!parseDatabaseAndTableName(pos, expected, res->database, res->table)) - { - /// FLUSH DISTRIBUTED requires table - /// START/STOP DISTRIBUTED SENDS does not require table - if (res->type == Type::FLUSH_DISTRIBUTED) - return false; - } + if (!parseQueryWithOnClusterAndMaybeTable(res, pos, expected, /* require table = */ false, /* allow_string_literal = */ false)) + return false; + break; + } + + case Type::FLUSH_DISTRIBUTED: + case Type::RESTORE_REPLICA: + { + if (!parseQueryWithOnClusterAndMaybeTable(res, pos, expected, /* require table = */ true, /* allow_string_literal = */ false)) + return false; break; } diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index 52d2cf98c25..269faac5258 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -1,4 +1,5 @@ #include "ArrowBlockInputFormat.h" + #if USE_ARROW #include @@ -29,7 +30,6 @@ ArrowBlockInputFormat::ArrowBlockInputFormat(ReadBuffer & in_, const Block & hea Chunk ArrowBlockInputFormat::generate() { Chunk res; - const Block & header = getPort().getHeader(); arrow::Result> batch_result; if (stream) @@ -63,7 +63,7 @@ Chunk ArrowBlockInputFormat::generate() ++record_batch_current; - ArrowColumnToCHColumn::arrowTableToCHChunk(res, *table_result, header, "Arrow"); + arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result); return res; } @@ -81,6 +81,8 @@ void ArrowBlockInputFormat::resetParser() void ArrowBlockInputFormat::prepareReader() { + std::shared_ptr schema; + if (stream) { auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique(in)); @@ -88,6 +90,7 @@ void ArrowBlockInputFormat::prepareReader() throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", stream_reader_status.status().ToString()); stream_reader = *stream_reader_status; + schema = stream_reader->schema(); } else { @@ -96,8 +99,11 @@ void ArrowBlockInputFormat::prepareReader() throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", file_reader_status.status().ToString()); file_reader = *file_reader_status; + schema = file_reader->schema(); } + arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), std::move(schema), "Arrow"); + if (stream) record_batch_total = -1; else diff 
--git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h index 5ad112efde9..3bfead93bf1 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h @@ -11,6 +11,7 @@ namespace DB { class ReadBuffer; +class ArrowColumnToCHColumn; class ArrowBlockInputFormat : public IInputFormat { @@ -32,6 +33,8 @@ private: // The following fields are used only for Arrow format std::shared_ptr file_reader; + std::unique_ptr arrow_column_to_ch_column; + int record_batch_total = 0; int record_batch_current = 0; diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index c1abdd1a759..8f43d03de38 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -18,17 +18,26 @@ namespace ErrorCodes } ArrowBlockOutputFormat::ArrowBlockOutputFormat(WriteBuffer & out_, const Block & header_, bool stream_, const FormatSettings & format_settings_) - : IOutputFormat(header_, out_), stream{stream_}, format_settings{format_settings_}, arrow_ostream{std::make_shared(out_)} + : IOutputFormat(header_, out_) + , stream{stream_} + , format_settings{format_settings_} + , arrow_ostream{std::make_shared(out_)} { } void ArrowBlockOutputFormat::consume(Chunk chunk) { - const Block & header = getPort(PortKind::Main).getHeader(); const size_t columns_num = chunk.getNumColumns(); std::shared_ptr arrow_table; - CHColumnToArrowColumn::chChunkToArrowTable(arrow_table, header, chunk, columns_num, "Arrow"); + if (!ch_column_to_arrow_column) + { + const Block & header = getPort(PortKind::Main).getHeader(); + ch_column_to_arrow_column + = std::make_unique(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary); + } + + ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); if (!writer) prepareWriter(arrow_table->schema()); diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.h b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.h index 0cc6804705b..40d81f8b919 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.h @@ -12,6 +12,8 @@ namespace arrow::ipc { class RecordBatchWriter; } namespace DB { +class CHColumnToArrowColumn; + class ArrowBlockOutputFormat : public IOutputFormat { public: @@ -28,6 +30,7 @@ private: const FormatSettings format_settings; std::shared_ptr arrow_ostream; std::shared_ptr writer; + std::unique_ptr ch_column_to_arrow_column; void prepareWriter(const std::shared_ptr & schema); }; diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 29fff1fd4e0..edf131cd49e 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -7,15 +7,22 @@ #include #include #include +#include +#include +#include #include #include #include #include #include #include +#include +#include +#include +#include #include #include -#include +#include namespace DB @@ -27,6 +34,7 @@ namespace DB extern const int CANNOT_CONVERT_TYPE; extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN; extern const int THERE_IS_NO_COLUMN; + extern const int BAD_ARGUMENTS; } static const std::initializer_list> arrow_type_to_internal_type = @@ -152,11 +160,11 @@ namespace DB if (days_num > DATE_LUT_MAX_DAY_NUM) { // TODO: will it rollback correctly? 
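In these hunks ArrowBlockInputFormat builds its ArrowColumnToCHColumn once from the schema in prepareReader, and ArrowBlockOutputFormat creates its CHColumnToArrowColumn lazily on the first consume() and reuses it for later chunks. A reduced sketch of that build-once, reuse-per-chunk pattern; `Converter` and `OutputFormat` here are illustrative stand-ins:

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>

/// Illustrative stand-in for the schema converter.
struct Converter
{
    explicit Converter(std::string header_) : header(std::move(header_)) { std::cout << "converter built once\n"; }
    void convert(int chunk) const { std::cout << "convert chunk " << chunk << " (" << header << ")\n"; }
    std::string header;
};

class OutputFormat
{
public:
    explicit OutputFormat(std::string header_) : header(std::move(header_)) {}

    void consume(int chunk)
    {
        /// Build the converter on the first chunk and reuse it afterwards,
        /// instead of rederiving the schema mapping for every chunk.
        if (!converter)
            converter = std::make_unique<Converter>(header);
        converter->convert(chunk);
    }

private:
    std::string header;
    std::unique_ptr<Converter> converter;
};

int main()
{
    OutputFormat format("id UInt64, name String");
    format.consume(1);
    format.consume(2);
}
```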
- throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + internal_column.getName() - + "\" is greater than " - "max allowed Date value, which is " - + std::to_string(DATE_LUT_MAX_DAY_NUM), - ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE}; + throw Exception + { + fmt::format("Input value {} of a column \"{}\" is greater than max allowed Date value, which is {}", days_num, internal_column.getName(), DATE_LUT_MAX_DAY_NUM), + ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE + }; } column_data.emplace_back(days_num); @@ -263,23 +271,47 @@ namespace DB offsets_data.emplace_back(start + arrow_offsets.Value(i)); } } + static ColumnPtr createAndFillColumnWithIndexesData(std::shared_ptr & arrow_column) + { + switch (arrow_column->type()->id()) + { +# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ + case ARROW_NUMERIC_TYPE: \ + { \ + auto column = DataTypeNumber().createColumn(); \ + fillColumnWithNumericData(arrow_column, *column); \ + return column; \ + } + FOR_ARROW_INDEXES_TYPES(DISPATCH) +# undef DISPATCH + default: + throw Exception(fmt::format("Unsupported type for indexes in LowCardinality: {}.", arrow_column->type()->name()), ErrorCodes::BAD_ARGUMENTS); + } + } - static void readColumnFromArrowColumn(std::shared_ptr & arrow_column, IColumn & internal_column, const std::string & column_name, const std::string format_name, bool is_nullable) + static void readColumnFromArrowColumn( + std::shared_ptr & arrow_column, + IColumn & internal_column, + const std::string & column_name, + const std::string & format_name, + bool is_nullable, + std::unordered_map dictionary_values) { if (internal_column.isNullable()) { - ColumnNullable & column_nullable = typeid_cast(internal_column); - readColumnFromArrowColumn(arrow_column, column_nullable.getNestedColumn(), column_name, format_name, true); + ColumnNullable & column_nullable = assert_cast(internal_column); + readColumnFromArrowColumn(arrow_column, column_nullable.getNestedColumn(), column_name, format_name, true, dictionary_values); fillByteMapFromArrowColumn(arrow_column, column_nullable.getNullMapColumn()); return; } - // TODO: check if a column is const? - if (!is_nullable && !checkColumn(internal_column) && arrow_column->null_count()) + /// TODO: check if a column is const? 
+ if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST + && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) { throw Exception { - "Can not insert NULL data into non-nullable column \"" + column_name + "\"", + fmt::format("Can not insert NULL data into non-nullable column \"{}\".", column_name), ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN }; } @@ -304,13 +336,11 @@ namespace DB fillColumnWithTimestampData(arrow_column, internal_column); break; case arrow::Type::DECIMAL: - //fillColumnWithNumericData>(arrow_column, read_column); // Have problems with trash values under NULL, but faster fillColumnWithDecimalData(arrow_column, internal_column /*, internal_nested_type*/); break; + case arrow::Type::MAP: [[fallthrough]]; case arrow::Type::LIST: { - const auto * list_type = static_cast(arrow_column->type().get()); - auto list_nested_type = list_type->value_type(); arrow::ArrayVector array_vector; array_vector.reserve(arrow_column->num_chunks()); for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -321,11 +351,70 @@ namespace DB } auto arrow_nested_column = std::make_shared(array_vector); - ColumnArray & column_array = typeid_cast(internal_column); - readColumnFromArrowColumn(arrow_nested_column, column_array.getData(), column_name, format_name, false); + ColumnArray & column_array = arrow_column->type()->id() == arrow::Type::MAP + ? assert_cast(internal_column).getNestedColumn() + : assert_cast(internal_column); + + readColumnFromArrowColumn(arrow_nested_column, column_array.getData(), column_name, format_name, false, dictionary_values); fillOffsetsFromArrowListColumn(arrow_column, column_array.getOffsetsColumn()); break; } + case arrow::Type::STRUCT: + { + ColumnTuple & column_tuple = assert_cast(internal_column); + int fields_count = column_tuple.tupleSize(); + std::vector nested_arrow_columns(fields_count); + for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) + { + arrow::StructArray & struct_chunk = static_cast(*(arrow_column->chunk(chunk_i))); + for (int i = 0; i < fields_count; ++i) + nested_arrow_columns[i].emplace_back(struct_chunk.field(i)); + } + + for (int i = 0; i != fields_count; ++i) + { + auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); + readColumnFromArrowColumn(nested_arrow_column, column_tuple.getColumn(i), column_name, format_name, false, dictionary_values); + } + break; + } + case arrow::Type::DICTIONARY: + { + ColumnLowCardinality & column_lc = assert_cast(internal_column); + auto & dict_values = dictionary_values[column_name]; + /// Load dictionary values only once and reuse it. 
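For Arrow DICTIONARY columns the converter keeps a per-column cache (`dictionary_values`) so the dictionary itself is converted only once and later chunks reuse it, which is what the code right below this note does. A standalone sketch of the caching idea using plain standard-library types in place of ColumnPtr:

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

/// Illustrative: the real cache maps column name -> converted dictionary column;
/// here the "dictionary" is just a vector of strings.
using Dictionary = std::vector<std::string>;
using DictionaryPtr = std::shared_ptr<Dictionary>;

class DictionaryCache
{
public:
    /// Convert the dictionary for `column_name` only once; later chunks of the
    /// same column reuse the cached result.
    DictionaryPtr getOrConvert(const std::string & column_name, const Dictionary & source)
    {
        auto & cached = dictionary_values[column_name];
        if (!cached)
        {
            std::cout << "converting dictionary for " << column_name << '\n';
            cached = std::make_shared<Dictionary>(source);
        }
        return cached;
    }

private:
    std::unordered_map<std::string, DictionaryPtr> dictionary_values;
};

int main()
{
    DictionaryCache cache;
    const Dictionary dict{"red", "green", "blue"};
    cache.getOrConvert("color", dict); /// converts
    cache.getOrConvert("color", dict); /// reuses the cached value
}
```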
+ if (!dict_values) + { + arrow::ArrayVector dict_array; + for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) + { + arrow::DictionaryArray & dict_chunk = static_cast(*(arrow_column->chunk(chunk_i))); + dict_array.emplace_back(dict_chunk.dictionary()); + } + auto arrow_dict_column = std::make_shared(dict_array); + + auto dict_column = IColumn::mutate(column_lc.getDictionaryPtr()); + auto * uniq_column = static_cast(dict_column.get()); + auto values_column = uniq_column->getNestedColumn()->cloneEmpty(); + readColumnFromArrowColumn(arrow_dict_column, *values_column, column_name, format_name, false, dictionary_values); + uniq_column->uniqueInsertRangeFrom(*values_column, 0, values_column->size()); + dict_values = std::move(dict_column); + } + + arrow::ArrayVector indexes_array; + for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) + { + arrow::DictionaryArray & dict_chunk = static_cast(*(arrow_column->chunk(chunk_i))); + indexes_array.emplace_back(dict_chunk.indices()); + } + + auto arrow_indexes_column = std::make_shared(indexes_array); + auto indexes_column = createAndFillColumnWithIndexesData(arrow_indexes_column); + + auto new_column_lc = ColumnLowCardinality::create(dict_values, std::move(indexes_column)); + column_lc = std::move(*new_column_lc); + break; + } # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ case ARROW_NUMERIC_TYPE: \ fillColumnWithNumericData(arrow_column, internal_column); \ @@ -339,8 +428,7 @@ namespace DB default: throw Exception { - "Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \"" - + column_name + "\"", + fmt::format(R"(Unsupported {} type "{}" of an input column "{}".)", format_name, arrow_column->type()->name(), column_name), ErrorCodes::UNKNOWN_TYPE }; } @@ -350,7 +438,7 @@ namespace DB { if (column_type->isNullable()) { - DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); + DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); return makeNullable(getInternalType(arrow_type, nested_type, column_name, format_name)); } @@ -367,11 +455,61 @@ namespace DB const DataTypeArray * array_type = typeid_cast(column_type.get()); if (!array_type) - throw Exception{"Cannot convert arrow LIST type to a not Array ClickHouse type " + column_type->getName(), ErrorCodes::CANNOT_CONVERT_TYPE}; + throw Exception{fmt::format("Cannot convert arrow LIST type to a not Array ClickHouse type {}.", column_type->getName()), ErrorCodes::CANNOT_CONVERT_TYPE}; return std::make_shared(getInternalType(list_nested_type, array_type->getNestedType(), column_name, format_name)); } + if (arrow_type->id() == arrow::Type::STRUCT) + { + const auto * struct_type = static_cast(arrow_type.get()); + const DataTypeTuple * tuple_type = typeid_cast(column_type.get()); + if (!tuple_type) + throw Exception{fmt::format("Cannot convert arrow STRUCT type to a not Tuple ClickHouse type {}.", column_type->getName()), ErrorCodes::CANNOT_CONVERT_TYPE}; + + const DataTypes & tuple_nested_types = tuple_type->getElements(); + int internal_fields_num = tuple_nested_types.size(); + /// If internal column has less elements then arrow struct, we will select only first internal_fields_num columns. 
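When an Arrow STRUCT is read into a Tuple, the code below keeps only the first `internal_fields_num` struct fields and rejects a Tuple that expects more elements than the struct provides. A reduced sketch of that selection rule, using field names in place of real types:

```cpp
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

/// Keep the first tuple_size struct fields, or reject a tuple that asks for
/// more fields than the struct has.
std::vector<std::string> selectStructFields(const std::vector<std::string> & struct_fields, size_t tuple_size)
{
    if (tuple_size > struct_fields.size())
        throw std::runtime_error("Tuple expects " + std::to_string(tuple_size)
            + " elements but the struct has only " + std::to_string(struct_fields.size()));

    return {struct_fields.begin(), struct_fields.begin() + tuple_size};
}

int main()
{
    const std::vector<std::string> fields{"a", "b", "c"};
    for (const auto & name : selectStructFields(fields, 2))
        std::cout << name << '\n'; /// a, b
}
```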
+ if (internal_fields_num > struct_type->num_fields()) + throw Exception + { + fmt::format( + "Cannot convert arrow STRUCT with {} fields to a ClickHouse Tuple with {} elements: {}.", + struct_type->num_fields(), + internal_fields_num, + column_type->getName()), + ErrorCodes::CANNOT_CONVERT_TYPE + }; + + DataTypes nested_types; + for (int i = 0; i < internal_fields_num; ++i) + nested_types.push_back(getInternalType(struct_type->field(i)->type(), tuple_nested_types[i], column_name, format_name)); + + return std::make_shared(std::move(nested_types)); + } + + if (arrow_type->id() == arrow::Type::DICTIONARY) + { + const auto * arrow_dict_type = static_cast(arrow_type.get()); + const auto * lc_type = typeid_cast(column_type.get()); + /// We allow to insert arrow dictionary into a non-LowCardinality column. + const auto & dict_type = lc_type ? lc_type->getDictionaryType() : column_type; + return std::make_shared(getInternalType(arrow_dict_type->value_type(), dict_type, column_name, format_name)); + } + + if (arrow_type->id() == arrow::Type::MAP) + { + const auto * arrow_map_type = typeid_cast(arrow_type.get()); + const auto * map_type = typeid_cast(column_type.get()); + if (!map_type) + throw Exception{fmt::format("Cannot convert arrow MAP type to a not Map ClickHouse type {}.", column_type->getName()), ErrorCodes::CANNOT_CONVERT_TYPE}; + + return std::make_shared( + getInternalType(arrow_map_type->key_type(), map_type->getKeyType(), column_name, format_name), + getInternalType(arrow_map_type->item_type(), map_type->getValueType(), column_name, format_name) + ); + } + if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(), [=](auto && elem) { return elem.first == arrow_type->id(); }); internal_type_it != arrow_type_to_internal_type.end()) @@ -380,13 +518,24 @@ namespace DB } throw Exception { - "The type \"" + arrow_type->name() + "\" of an input column \"" + column_name + "\" is not supported for conversion from a " + format_name + " data format", + fmt::format(R"(The type "{}" of an input column "{}" is not supported for conversion from a {} data format.)", arrow_type->name(), column_name, format_name), ErrorCodes::CANNOT_CONVERT_TYPE }; } - void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr & table, - const Block & header, std::string format_name) + ArrowColumnToCHColumn::ArrowColumnToCHColumn(const Block & header_, std::shared_ptr schema_, const std::string & format_name_) : header(header_), format_name(format_name_) + { + for (const auto & field : schema_->fields()) + { + if (header.has(field->name())) + { + const auto column_type = recursiveRemoveLowCardinality(header.getByName(field->name()).type); + name_to_internal_type[field->name()] = getInternalType(field->type(), column_type, field->name(), format_name); + } + } + } + + void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr & table) { Columns columns_list; UInt64 num_rows = 0; @@ -404,20 +553,18 @@ namespace DB for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i) { - ColumnWithTypeAndName header_column = header.getByPosition(column_i); - const auto column_type = recursiveRemoveLowCardinality(header_column.type); + const ColumnWithTypeAndName & header_column = header.getByPosition(column_i); if (name_to_column_ptr.find(header_column.name) == name_to_column_ptr.end()) // TODO: What if some columns were not presented? Insert NULLs? What if a column is not nullable? 
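The converter now resolves the target ClickHouse type for every header column once, in its constructor, and arrowTableToCHChunk only looks the result up per chunk. A minimal sketch of that precomputed mapping; `Field` and `SchemaMapper` are illustrative stand-ins, and the lookup failure mirrors the THERE_IS_NO_COLUMN case:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

/// Illustrative stand-in for a schema/header field.
struct Field
{
    std::string name;
    std::string type;
};

class SchemaMapper
{
public:
    SchemaMapper(const std::vector<Field> & header, const std::vector<Field> & schema)
    {
        /// Resolve the target type once, up front, for every schema field the
        /// header knows about (placeholder for getInternalType(...)).
        for (const auto & field : schema)
            for (const auto & column : header)
                if (column.name == field.name)
                    name_to_internal_type[field.name] = column.type;
    }

    /// Per-chunk code only looks the precomputed type up.
    const std::string & typeFor(const std::string & name) const
    {
        const auto it = name_to_internal_type.find(name);
        if (it == name_to_internal_type.end())
            throw std::runtime_error("Column \"" + name + "\" is not present in input data");
        return it->second;
    }

private:
    std::unordered_map<std::string, std::string> name_to_internal_type;
};

int main()
{
    const SchemaMapper mapper({{"id", "UInt64"}, {"name", "String"}}, {{"id", "int64"}, {"name", "binary"}});
    std::cout << mapper.typeFor("name") << '\n'; /// String
}
```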
- throw Exception{"Column \"" + header_column.name + "\" is not presented in input data", + throw Exception{fmt::format("Column \"{}\" is not presented in input data.", header_column.name), ErrorCodes::THERE_IS_NO_COLUMN}; std::shared_ptr arrow_column = name_to_column_ptr[header_column.name]; - DataTypePtr internal_type = getInternalType(arrow_column->type(), column_type, header_column.name, format_name); - + DataTypePtr & internal_type = name_to_internal_type[header_column.name]; MutableColumnPtr read_column = internal_type->createColumn(); - readColumnFromArrowColumn(arrow_column, *read_column, header_column.name, format_name, false); + readColumnFromArrowColumn(arrow_column, *read_column, header_column.name, format_name, false, dictionary_values); ColumnWithTypeAndName column; column.name = header_column.name; diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index abac501c4c5..7da54a8a02d 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -19,11 +19,15 @@ namespace DB { - class ArrowColumnToCHColumn - { - private: +class ArrowColumnToCHColumn +{ +public: + ArrowColumnToCHColumn(const Block & header_, std::shared_ptr schema_, const std::string & format_name_); -# define FOR_ARROW_NUMERIC_TYPES(M) \ + void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table); + +private: +#define FOR_ARROW_NUMERIC_TYPES(M) \ M(arrow::Type::UINT8, DB::UInt8) \ M(arrow::Type::INT8, DB::Int8) \ M(arrow::Type::UINT16, DB::UInt16) \ @@ -36,11 +40,24 @@ namespace DB M(arrow::Type::FLOAT, DB::Float32) \ M(arrow::Type::DOUBLE, DB::Float64) +#define FOR_ARROW_INDEXES_TYPES(M) \ + M(arrow::Type::UINT8, DB::UInt8) \ + M(arrow::Type::INT8, DB::UInt8) \ + M(arrow::Type::UINT16, DB::UInt16) \ + M(arrow::Type::INT16, DB::UInt16) \ + M(arrow::Type::UINT32, DB::UInt32) \ + M(arrow::Type::INT32, DB::UInt32) \ + M(arrow::Type::UINT64, DB::UInt64) \ + M(arrow::Type::INT64, DB::UInt64) - public: - static void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table, - const Block & header, std::string format_name); - }; + const Block & header; + std::unordered_map name_to_internal_type; + const std::string format_name; + /// Map {column name : dictionary column}. + /// To avoid converting dictionary from Arrow Dictionary + /// to LowCardinality every chunk we save it and reuse. + std::unordered_map dictionary_values; +}; } #endif diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index 0e9968bec17..cc487535e37 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -6,17 +6,22 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +#include +#include +#include #include #include #include #include #include -#include namespace DB @@ -25,6 +30,7 @@ namespace DB { extern const int UNKNOWN_EXCEPTION; extern const int UNKNOWN_TYPE; + extern const int LOGICAL_ERROR; } static const std::initializer_list>> internal_type_to_arrow_type = @@ -46,16 +52,15 @@ namespace DB //{"DateTime", arrow::date64()}, // BUG! saves as date32 {"DateTime", arrow::uint32()}, - // TODO: ClickHouse can actually store non-utf8 strings! 
- {"String", arrow::utf8()}, - {"FixedString", arrow::utf8()}, + {"String", arrow::binary()}, + {"FixedString", arrow::binary()}, }; static void checkStatus(const arrow::Status & status, const String & column_name, const String & format_name) { if (!status.ok()) - throw Exception{"Error with a " + format_name + " column \"" + column_name + "\": " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION}; + throw Exception{fmt::format("Error with a {} column \"{}\": {}.", format_name, column_name, status.ToString()), ErrorCodes::UNKNOWN_EXCEPTION}; } template @@ -101,8 +106,10 @@ namespace DB arrow::ArrayBuilder * array_builder, String format_name, size_t start, - size_t end); + size_t end, + std::unordered_map> & dictionary_values); + template static void fillArrowArrayWithArrayColumnData( const String & column_name, ColumnPtr & column, @@ -111,26 +118,164 @@ namespace DB arrow::ArrayBuilder * array_builder, String format_name, size_t start, - size_t end) + size_t end, + std::unordered_map> & dictionary_values) { - const auto * column_array = static_cast(column.get()); + const auto * column_array = assert_cast(column.get()); ColumnPtr nested_column = column_array->getDataPtr(); - DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); + DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); const auto & offsets = column_array->getOffsets(); - arrow::ListBuilder & builder = assert_cast(*array_builder); + Builder & builder = assert_cast(*array_builder); arrow::ArrayBuilder * value_builder = builder.value_builder(); arrow::Status components_status; for (size_t array_idx = start; array_idx < end; ++array_idx) { - /// Start new array + /// Start new array. components_status = builder.Append(); checkStatus(components_status, nested_column->getName(), format_name); - fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx]); + fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], dictionary_values); } } + static void fillArrowArrayWithTupleColumnData( + const String & column_name, + ColumnPtr & column, + const std::shared_ptr & column_type, + const PaddedPODArray * null_bytemap, + arrow::ArrayBuilder * array_builder, + String format_name, + size_t start, + size_t end, + std::unordered_map> & dictionary_values) + { + const auto * column_tuple = assert_cast(column.get()); + const auto & nested_types = assert_cast(column_type.get())->getElements(); + + arrow::StructBuilder & builder = assert_cast(*array_builder); + + for (size_t i = 0; i != column_tuple->tupleSize(); ++i) + { + ColumnPtr nested_column = column_tuple->getColumnPtr(i); + fillArrowArray(column_name + "." 
+ std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, dictionary_values); + } + + for (size_t i = start; i != end; ++i) + { + auto status = builder.Append(); + checkStatus(status, column->getName(), format_name); + } + } + + template + static PaddedPODArray extractIndexesImpl(ColumnPtr column, size_t start, size_t end) + { + const PaddedPODArray & data = assert_cast *>(column.get())->getData(); + PaddedPODArray result; + result.reserve(end - start); + std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value); }); + return result; + } + + static PaddedPODArray extractIndexesImpl(ColumnPtr column, size_t start, size_t end) + { + switch (column->getDataType()) + { + case TypeIndex::UInt8: + return extractIndexesImpl(column, start, end); + case TypeIndex::UInt16: + return extractIndexesImpl(column, start, end); + case TypeIndex::UInt32: + return extractIndexesImpl(column, start, end); + case TypeIndex::UInt64: + return extractIndexesImpl(column, start, end); + default: + throw Exception(fmt::format("Indexes column must be ColumnUInt, got {}.", column->getName()), + ErrorCodes::LOGICAL_ERROR); + } + } + + template + static void fillArrowArrayWithLowCardinalityColumnDataImpl( + const String & column_name, + ColumnPtr & column, + const std::shared_ptr & column_type, + const PaddedPODArray * null_bytemap, + arrow::ArrayBuilder * array_builder, + String format_name, + size_t start, + size_t end, + std::unordered_map> & dictionary_values) + { + const auto * column_lc = assert_cast(column.get()); + arrow::DictionaryBuilder * builder = assert_cast *>(array_builder); + auto & dict_values = dictionary_values[column_name]; + + /// Convert dictionary from LowCardinality to Arrow dictionary only once and then reuse it. + if (!dict_values) + { + auto value_type = assert_cast(builder->type().get())->value_type(); + std::unique_ptr values_builder; + arrow::MemoryPool* pool = arrow::default_memory_pool(); + arrow::Status status = MakeBuilder(pool, value_type, &values_builder); + checkStatus(status, column->getName(), format_name); + + auto dict_column = column_lc->getDictionary().getNestedColumn(); + const auto & dict_type = assert_cast(column_type.get())->getDictionaryType(); + fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), dictionary_values); + status = values_builder->Finish(&dict_values); + checkStatus(status, column->getName(), format_name); + } + + arrow::Status status = builder->InsertMemoValues(*dict_values); + checkStatus(status, column->getName(), format_name); + + /// AppendIndices in DictionaryBuilder works only with int64_t data, so we cannot use + /// fillArrowArray here and should copy all indexes to int64_t container. 
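As the comment above notes, the dictionary builder appends indices as 64-bit integers, so the LowCardinality index column (UInt8/16/32/64) is first copied into an Int64 buffer. A standalone sketch of that widening step with standard containers:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

/// Copy a [start, end) slice of narrow unsigned indexes into an int64_t buffer.
template <typename T>
std::vector<int64_t> extractIndexes(const std::vector<T> & data, size_t start, size_t end)
{
    std::vector<int64_t> result;
    result.reserve(end - start);
    std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result),
                   [](T value) { return static_cast<int64_t>(value); });
    return result;
}

int main()
{
    const std::vector<uint8_t> narrow{0, 2, 1, 2};
    for (int64_t index : extractIndexes(narrow, 1, 4))
        std::cout << index << ' '; /// 2 1 2
    std::cout << '\n';
}
```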
+ auto indexes = extractIndexesImpl(column_lc->getIndexesPtr(), start, end); + const uint8_t * arrow_null_bytemap_raw_ptr = nullptr; + PaddedPODArray arrow_null_bytemap; + if (null_bytemap) + { + /// Invert values since Arrow interprets 1 as a non-null value, while CH as a null + arrow_null_bytemap.reserve(end - start); + for (size_t i = start; i < end; ++i) + arrow_null_bytemap.emplace_back(!(*null_bytemap)[i]); + + arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data(); + } + + status = builder->AppendIndices(indexes.data(), indexes.size(), arrow_null_bytemap_raw_ptr); + checkStatus(status, column->getName(), format_name); + } + + + static void fillArrowArrayWithLowCardinalityColumnData( + const String & column_name, + ColumnPtr & column, + const std::shared_ptr & column_type, + const PaddedPODArray * null_bytemap, + arrow::ArrayBuilder * array_builder, + String format_name, + size_t start, + size_t end, + std::unordered_map> & dictionary_values) + { + auto value_type = assert_cast(array_builder->type().get())->value_type(); + +#define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \ + if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \ + { \ + fillArrowArrayWithLowCardinalityColumnDataImpl(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); \ + return; \ + } + + FOR_ARROW_TYPES(DISPATCH) +#undef DISPATCH + + } + template static void fillArrowArrayWithStringColumnData( ColumnPtr write_column, @@ -141,7 +286,7 @@ namespace DB size_t end) { const auto & internal_column = assert_cast(*write_column); - arrow::StringBuilder & builder = assert_cast(*array_builder); + arrow::BinaryBuilder & builder = assert_cast(*array_builder); arrow::Status status; for (size_t string_i = start; string_i < end; ++string_i) @@ -155,7 +300,6 @@ namespace DB StringRef string_ref = internal_column.getDataAt(string_i); status = builder.Append(string_ref.data, string_ref.size); } - checkStatus(status, write_column->getName(), format_name); } } @@ -218,18 +362,19 @@ namespace DB arrow::ArrayBuilder * array_builder, String format_name, size_t start, - size_t end) + size_t end, + std::unordered_map> & dictionary_values) { const String column_type_name = column_type->getFamilyName(); if ("Nullable" == column_type_name) { - const ColumnNullable * column_nullable = checkAndGetColumn(column.get()); + const ColumnNullable * column_nullable = assert_cast(column.get()); ColumnPtr nested_column = column_nullable->getNestedColumnPtr(); - DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); + DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr(); const PaddedPODArray & bytemap = assert_cast &>(*null_column).getData(); - fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end); + fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, dictionary_values); } else if ("String" == column_type_name) { @@ -249,7 +394,21 @@ namespace DB } else if ("Array" == column_type_name) { - fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end); + fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + } + else if ("Tuple" == column_type_name) + { + fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, 
array_builder, format_name, start, end, dictionary_values); + } + else if ("LowCardinality" == column_type_name) + { + fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + } + else if ("Map" == column_type_name) + { + ColumnPtr column_array = assert_cast(column.get())->getNestedColumnPtr(); + DataTypePtr array_type = assert_cast(column_type.get())->getNestedType(); + fillArrowArrayWithArrayColumnData(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); } else if (isDecimal(column_type)) { @@ -280,7 +439,7 @@ namespace DB { throw Exception { - "Internal type \"" + column_type_name + "\" of a column \"" + column_name + "\" is not supported for conversion into a " + format_name + " data format", + fmt::format(R"(Internal type "{}" of a column "{}" is not supported for conversion into a {} data format.)", column_type_name, column_name, format_name), ErrorCodes::UNKNOWN_TYPE }; } @@ -295,7 +454,7 @@ namespace DB size_t start, size_t end) { - const auto & column = static_cast(*write_column); + const auto & column = assert_cast(*write_column); arrow::DecimalBuilder & builder = assert_cast(*array_builder); arrow::Status status; @@ -312,12 +471,33 @@ namespace DB checkStatus(status, write_column->getName(), format_name); } - static std::shared_ptr getArrowType(DataTypePtr column_type, const std::string & column_name, const std::string & format_name, bool * is_column_nullable) + static std::shared_ptr getArrowTypeForLowCardinalityIndexes(ColumnPtr indexes_column) + { + /// Arrow docs recommend preferring signed integers over unsigned integers for representing dictionary indices. + /// https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout + switch (indexes_column->getDataType()) + { + case TypeIndex::UInt8: + return arrow::int8(); + case TypeIndex::UInt16: + return arrow::int16(); + case TypeIndex::UInt32: + return arrow::int32(); + case TypeIndex::UInt64: + return arrow::int64(); + default: + throw Exception(fmt::format("Indexes column for getUniqueIndex must be ColumnUInt, got {}.", indexes_column->getName()), + ErrorCodes::LOGICAL_ERROR); + } + } + + static std::shared_ptr getArrowType(DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool * is_column_nullable) { if (column_type->isNullable()) { - DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); - auto arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable); + DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); + ColumnPtr nested_column = assert_cast(column.get())->getNestedColumnPtr(); + auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, is_column_nullable); *is_column_nullable = true; return arrow_type; } @@ -334,7 +514,7 @@ namespace DB || std::is_same_v> || std::is_same_v>) { - const auto & decimal_type = static_cast(column_type.get()); + const auto & decimal_type = assert_cast(column_type.get()); arrow_type = arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()); } @@ -346,11 +526,50 @@ namespace DB if (isArray(column_type)) { - auto nested_type = typeid_cast(column_type.get())->getNestedType(); - auto nested_arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable); + auto nested_type = assert_cast(column_type.get())->getNestedType(); + auto 
nested_column = assert_cast(column.get())->getDataPtr(); + auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, is_column_nullable); return arrow::list(nested_arrow_type); } + if (isTuple(column_type)) + { + const auto & nested_types = assert_cast(column_type.get())->getElements(); + const auto * tuple_column = assert_cast(column.get()); + std::vector> nested_fields; + for (size_t i = 0; i != nested_types.size(); ++i) + { + String name = column_name + "." + std::to_string(i); + auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, is_column_nullable); + nested_fields.push_back(std::make_shared(name, nested_arrow_type, *is_column_nullable)); + } + return arrow::struct_(std::move(nested_fields)); + } + + if (column_type->lowCardinality()) + { + auto nested_type = assert_cast(column_type.get())->getDictionaryType(); + const auto * lc_column = assert_cast(column.get()); + const auto & nested_column = lc_column->getDictionaryPtr(); + const auto & indexes_column = lc_column->getIndexesPtr(); + return arrow::dictionary( + getArrowTypeForLowCardinalityIndexes(indexes_column), + getArrowType(nested_type, nested_column, column_name, format_name, is_column_nullable)); + } + + if (isMap(column_type)) + { + const auto * map_type = assert_cast(column_type.get()); + const auto & key_type = map_type->getKeyType(); + const auto & val_type = map_type->getValueType(); + + const auto & columns = assert_cast(column.get())->getNestedData().getColumns(); + return arrow::map( + getArrowType(key_type, columns[0], column_name, format_name, is_column_nullable), + getArrowType(val_type, columns[1], column_name, format_name, is_column_nullable) + ); + } + const std::string type_name = column_type->getFamilyName(); if (const auto * arrow_type_it = std::find_if( internal_type_to_arrow_type.begin(), @@ -361,49 +580,59 @@ namespace DB return arrow_type_it->second; } - throw Exception{"The type \"" + column_name + "\" of a column \"" + column_name + "\"" - " is not supported for conversion into a " + format_name + " data format", + throw Exception{fmt::format(R"(The type "{}" of a column "{}" is not supported for conversion into a {} data format.)", column_type->getName(), column_name, format_name), ErrorCodes::UNKNOWN_TYPE}; } + CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_) + : format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_) + { + arrow_fields.reserve(header.columns()); + header_columns.reserve(header.columns()); + for (auto column : header.getColumnsWithTypeAndName()) + { + if (!low_cardinality_as_dictionary) + { + column.type = recursiveRemoveLowCardinality(column.type); + column.column = recursiveRemoveLowCardinality(column.column); + } + bool is_column_nullable = false; + auto arrow_type = getArrowType(column.type, column.column, column.name, format_name, &is_column_nullable); + arrow_fields.emplace_back(std::make_shared(column.name, arrow_type, is_column_nullable)); + header_columns.emplace_back(std::move(column)); + } + } + void CHColumnToArrowColumn::chChunkToArrowTable( std::shared_ptr & res, - const Block & header, const Chunk & chunk, - size_t columns_num, - String format_name) + size_t columns_num) { /// For arrow::Schema and arrow::Table creation - std::vector> arrow_fields; std::vector> arrow_arrays; - arrow_fields.reserve(columns_num); arrow_arrays.reserve(columns_num); - for (size_t 
column_i = 0; column_i < columns_num; ++column_i) { - // TODO: constructed every iteration - ColumnWithTypeAndName column = header.safeGetByPosition(column_i); - column.column = recursiveRemoveLowCardinality(chunk.getColumns()[column_i]); - column.type = recursiveRemoveLowCardinality(column.type); + const ColumnWithTypeAndName & header_column = header_columns[column_i]; + auto column = chunk.getColumns()[column_i]; - bool is_column_nullable = false; - auto arrow_type = getArrowType(column.type, column.name, format_name, &is_column_nullable); - arrow_fields.emplace_back(std::make_shared(column.name, arrow_type, is_column_nullable)); + if (!low_cardinality_as_dictionary) + column = recursiveRemoveLowCardinality(column); arrow::MemoryPool* pool = arrow::default_memory_pool(); std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, arrow_fields[column_i]->type(), &array_builder); - checkStatus(status, column.column->getName(), format_name); + checkStatus(status, column->getName(), format_name); - fillArrowArray(column.name, column.column, column.type, nullptr, array_builder.get(), format_name, 0, column.column->size()); + fillArrowArray(header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), dictionary_values); std::shared_ptr arrow_array; status = array_builder->Finish(&arrow_array); - checkStatus(status, column.column->getName(), format_name); + checkStatus(status, column->getName(), format_name); arrow_arrays.emplace_back(std::move(arrow_array)); } - std::shared_ptr arrow_schema = std::make_shared(std::move(arrow_fields)); + std::shared_ptr arrow_schema = std::make_shared(arrow_fields); res = arrow::Table::Make(arrow_schema, arrow_arrays); } diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h index de594389c25..efe02a0d7d9 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h @@ -12,6 +12,10 @@ namespace DB class CHColumnToArrowColumn { +public: + CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_ = false); + + void chChunkToArrowTable(std::shared_ptr & res, const Chunk & chunk, size_t columns_num); private: #define FOR_INTERNAL_NUMERIC_TYPES(M) \ @@ -26,10 +30,27 @@ private: M(Float32, arrow::FloatBuilder) \ M(Float64, arrow::DoubleBuilder) +#define FOR_ARROW_TYPES(M) \ + M(UINT8, arrow::UInt8Type) \ + M(INT8, arrow::Int8Type) \ + M(UINT16, arrow::UInt16Type) \ + M(INT16, arrow::Int16Type) \ + M(UINT32, arrow::UInt32Type) \ + M(INT32, arrow::Int32Type) \ + M(UINT64, arrow::UInt64Type) \ + M(INT64, arrow::Int64Type) \ + M(FLOAT, arrow::FloatType) \ + M(DOUBLE, arrow::DoubleType) \ + M(STRING, arrow::StringType) -public: - static void chChunkToArrowTable(std::shared_ptr & res, const Block & header, const Chunk & chunk, - size_t columns_num, String format_name); + ColumnsWithTypeAndName header_columns; + std::vector> arrow_fields; + const std::string format_name; + bool low_cardinality_as_dictionary; + /// Map {column name : arrow dictionary}. + /// To avoid converting dictionary from LowCardinality to Arrow + /// Dictionary every chunk we save it and reuse. 
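+ /// A minimal sketch of the convert-once-and-reuse pattern this map enables
+ /// (the helper names below are illustrative, not the real API):
+ ///
+ ///     auto & cached_dictionary = dictionary_values[column_name];
+ ///     if (!cached_dictionary)                                   /// first chunk only: expensive conversion
+ ///         cached_dictionary = convertLowCardinalityDictionary(column);
+ ///     appendIndexesReferencing(cached_dictionary, chunk);       /// every chunk: cheap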
+ std::unordered_map> dictionary_values; }; } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 24b524faeaf..6ee247413e9 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -33,7 +33,6 @@ ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_) : IInp Chunk ORCBlockInputFormat::generate() { Chunk res; - const Block & header = getPort().getHeader(); if (!file_reader) prepareReader(); @@ -54,7 +53,7 @@ Chunk ORCBlockInputFormat::generate() ++stripe_current; - ArrowColumnToCHColumn::arrowTableToCHChunk(res, *table_result, header, "ORC"); + arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result); return res; } @@ -67,11 +66,26 @@ void ORCBlockInputFormat::resetParser() stripe_current = 0; } -size_t countIndicesForType(std::shared_ptr type) +static size_t countIndicesForType(std::shared_ptr type) { if (type->id() == arrow::Type::LIST) return countIndicesForType(static_cast(type.get())->value_type()) + 1; + if (type->id() == arrow::Type::STRUCT) + { + int indices = 1; + auto * struct_type = static_cast(type.get()); + for (int i = 0; i != struct_type->num_fields(); ++i) + indices += countIndicesForType(struct_type->field(i)->type()); + return indices; + } + + if (type->id() == arrow::Type::MAP) + { + auto * map_type = static_cast(type.get()); + return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()); + } + return 1; } @@ -84,17 +98,22 @@ void ORCBlockInputFormat::prepareReader() std::shared_ptr schema; THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema)); - int index = 0; + arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), schema, "ORC"); + + /// In ReadStripe column indices should be started from 1, + /// because 0 indicates to select all columns. + int index = 1; for (int i = 0; i < schema->num_fields(); ++i) { + /// LIST type require 2 indices, STRUCT - the number of elements + 1, + /// so we should recursively count the number of indices we need for this type. + int indexes_count = countIndicesForType(schema->field(i)->type()); if (getPort().getHeader().has(schema->field(i)->name())) { - /// LIST type require 2 indices, so we should recursively - /// count the number of indices we need for this type. 
- int indexes_count = countIndicesForType(schema->field(i)->type()); for (int j = 0; j != indexes_count; ++j) - include_indices.push_back(index++); + include_indices.push_back(index + j); } + index += indexes_count; } } diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index 0c78290f3cc..f27685a9884 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -8,6 +8,9 @@ namespace arrow::adapters::orc { class ORCFileReader; } namespace DB { + +class ArrowColumnToCHColumn; + class ORCBlockInputFormat : public IInputFormat { public: @@ -26,6 +29,8 @@ private: std::unique_ptr file_reader; + std::unique_ptr arrow_column_to_ch_column; + int stripe_total = 0; int stripe_current = 0; diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index ec6a7a65573..a5143792e7d 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -10,12 +10,16 @@ #include #include #include +#include +#include #include #include #include #include #include +#include +#include namespace DB { @@ -46,15 +50,9 @@ void ORCOutputStream::write(const void* buf, size_t length) ORCBlockOutputFormat::ORCBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) : IOutputFormat(header_, out_), format_settings{format_settings_}, output_stream(out_), data_types(header_.getDataTypes()) { - schema = orc::createStructType(); - options.setCompression(orc::CompressionKind::CompressionKind_NONE); - size_t columns_count = header_.columns(); - for (size_t i = 0; i != columns_count; ++i) - schema->addStructField(header_.safeGetByPosition(i).name, getORCType(data_types[i])); - writer = orc::createWriter(*schema, &output_stream, options); } -ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & type) +ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & type, const std::string & column_name) { switch (type->getTypeId()) { @@ -102,28 +100,48 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t } case TypeIndex::Nullable: { - return getORCType(removeNullable(type)); + return getORCType(removeNullable(type), column_name); } case TypeIndex::Array: { - const auto * array_type = typeid_cast(type.get()); - return orc::createListType(getORCType(array_type->getNestedType())); + const auto * array_type = assert_cast(type.get()); + return orc::createListType(getORCType(array_type->getNestedType(), column_name)); } case TypeIndex::Decimal32: { - const auto * decimal_type = typeid_cast *>(type.get()); + const auto * decimal_type = assert_cast *>(type.get()); return orc::createDecimalType(decimal_type->getPrecision(), decimal_type->getScale()); } case TypeIndex::Decimal64: { - const auto * decimal_type = typeid_cast *>(type.get()); + const auto * decimal_type = assert_cast *>(type.get()); return orc::createDecimalType(decimal_type->getPrecision(), decimal_type->getScale()); } case TypeIndex::Decimal128: { - const auto * decimal_type = typeid_cast *>(type.get()); + const auto * decimal_type = assert_cast *>(type.get()); return orc::createDecimalType(decimal_type->getPrecision(), decimal_type->getScale()); } + case TypeIndex::Tuple: + { + const auto * tuple_type = assert_cast(type.get()); + const auto & nested_types = tuple_type->getElements(); + auto struct_type = orc::createStructType(); + for (size_t i = 0; i 
< nested_types.size(); ++i) + { + String name = column_name + "." + std::to_string(i); + struct_type->addStructField(name, getORCType(nested_types[i], name)); + } + return struct_type; + } + case TypeIndex::Map: + { + const auto * map_type = assert_cast(type.get()); + return orc::createMapType( + getORCType(map_type->getKeyType(), column_name), + getORCType(map_type->getValueType(), column_name) + ); + } default: { throw Exception("Type " + type->getName() + " is not supported for ORC output format", ErrorCodes::ILLEGAL_COLUMN); @@ -149,6 +167,8 @@ void ORCBlockOutputFormat::writeNumbers( number_orc_column.notNull[i] = 0; continue; } + + number_orc_column.notNull[i] = 1; number_orc_column.data[i] = convert(number_column.getElement(i)); } number_orc_column.numElements = number_column.size(); @@ -164,7 +184,7 @@ void ORCBlockOutputFormat::writeDecimals( { DecimalVectorBatch & decimal_orc_column = dynamic_cast(orc_column); const auto & decimal_column = assert_cast &>(column); - const auto * decimal_type = typeid_cast *>(type.get()); + const auto * decimal_type = assert_cast *>(type.get()); decimal_orc_column.precision = decimal_type->getPrecision(); decimal_orc_column.scale = decimal_type->getScale(); decimal_orc_column.resize(decimal_column.size()); @@ -175,6 +195,8 @@ void ORCBlockOutputFormat::writeDecimals( decimal_orc_column.notNull[i] = 0; continue; } + + decimal_orc_column.notNull[i] = 1; decimal_orc_column.values[i] = convert(decimal_column.getElement(i).value); } decimal_orc_column.numElements = decimal_column.size(); @@ -197,6 +219,8 @@ void ORCBlockOutputFormat::writeStrings( string_orc_column.notNull[i] = 0; continue; } + + string_orc_column.notNull[i] = 1; const StringRef & string = string_column.getDataAt(i); string_orc_column.data[i] = const_cast(string.data); string_orc_column.length[i] = string.size; @@ -223,6 +247,8 @@ void ORCBlockOutputFormat::writeDateTimes( timestamp_orc_column.notNull[i] = 0; continue; } + + timestamp_orc_column.notNull[i] = 1; timestamp_orc_column.data[i] = get_seconds(timestamp_column.getElement(i)); timestamp_orc_column.nanoseconds[i] = get_nanoseconds(timestamp_column.getElement(i)); } @@ -235,11 +261,10 @@ void ORCBlockOutputFormat::writeColumn( DataTypePtr & type, const PaddedPODArray * null_bytemap) { + orc_column.notNull.resize(column.size()); if (null_bytemap) - { orc_column.hasNulls = true; - orc_column.notNull.resize(column.size()); - } + switch (type->getTypeId()) { case TypeIndex::Int8: @@ -374,12 +399,52 @@ void ORCBlockOutputFormat::writeColumn( for (size_t i = 0; i != list_column.size(); ++i) { list_orc_column.offsets[i + 1] = offsets[i]; + list_orc_column.notNull[i] = 1; } orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements; writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap); list_orc_column.numElements = list_column.size(); break; } + case TypeIndex::Tuple: + { + orc::StructVectorBatch & struct_orc_column = dynamic_cast(orc_column); + const auto & tuple_column = assert_cast(column); + auto nested_types = assert_cast(type.get())->getElements(); + for (size_t i = 0; i != tuple_column.size(); ++i) + struct_orc_column.notNull[i] = 1; + for (size_t i = 0; i != tuple_column.tupleSize(); ++i) + writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], null_bytemap); + break; + } + case TypeIndex::Map: + { + orc::MapVectorBatch & map_orc_column = dynamic_cast(orc_column); + const auto & list_column = assert_cast(column).getNestedColumn(); + const auto & map_type = 
assert_cast(*type); + const ColumnArray::Offsets & offsets = list_column.getOffsets(); + + map_orc_column.resize(list_column.size()); + /// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i]. + map_orc_column.offsets[0] = 0; + for (size_t i = 0; i != list_column.size(); ++i) + { + map_orc_column.offsets[i + 1] = offsets[i]; + map_orc_column.notNull[i] = 1; + } + const auto nested_columns = assert_cast(list_column.getDataPtr().get())->getColumns(); + + orc::ColumnVectorBatch & keys_orc_column = *map_orc_column.keys; + auto key_type = map_type.getKeyType(); + writeColumn(keys_orc_column, *nested_columns[0], key_type, null_bytemap); + + orc::ColumnVectorBatch & values_orc_column = *map_orc_column.elements; + auto value_type = map_type.getValueType(); + writeColumn(values_orc_column, *nested_columns[1], value_type, null_bytemap); + + map_orc_column.numElements = list_column.size(); + break; + } default: throw Exception("Type " + type->getName() + " is not supported for ORC output format", ErrorCodes::ILLEGAL_COLUMN); } @@ -409,6 +474,8 @@ size_t ORCBlockOutputFormat::getMaxColumnSize(Chunk & chunk) void ORCBlockOutputFormat::consume(Chunk chunk) { + if (!writer) + prepareWriter(); size_t columns_num = chunk.getNumColumns(); size_t rows_num = chunk.getNumRows(); /// getMaxColumnSize is needed to write arrays. @@ -425,9 +492,23 @@ void ORCBlockOutputFormat::consume(Chunk chunk) void ORCBlockOutputFormat::finalize() { + if (!writer) + prepareWriter(); + writer->close(); } +void ORCBlockOutputFormat::prepareWriter() +{ + const Block & header = getPort(PortKind::Main).getHeader(); + schema = orc::createStructType(); + options.setCompression(orc::CompressionKind::CompressionKind_NONE); + size_t columns_count = header.columns(); + for (size_t i = 0; i != columns_count; ++i) + schema->addStructField(header.safeGetByPosition(i).name, getORCType(data_types[i], header.safeGetByPosition(i).name)); + writer = orc::createWriter(*schema, &output_stream, options); +} + void registerOutputFormatProcessorORC(FormatFactory & factory) { factory.registerOutputFormatProcessor("ORC", []( diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h index 05053317533..557bf6cc07a 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h @@ -43,7 +43,7 @@ public: void finalize() override; private: - ORC_UNIQUE_PTR getORCType(const DataTypePtr & type); + ORC_UNIQUE_PTR getORCType(const DataTypePtr & type, const std::string & column_name); /// ConvertFunc is needed for type UInt8, because firstly UInt8 (char8_t) must be /// converted to unsigned char (bugprone-signed-char-misuse in clang). 
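    /// A minimal sketch of such a conversion, assuming UInt8 is the char8_t alias mentioned above:
    ///
    ///     auto convert_uint8 = [](UInt8 value) { return static_cast<unsigned char>(value); };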
@@ -71,6 +71,8 @@ private: size_t getColumnSize(const IColumn & column, DataTypePtr & type); size_t getMaxColumnSize(Chunk & chunk); + void prepareWriter(); + const FormatSettings format_settings; ORCOutputStream output_stream; DataTypes data_types; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index df264406cfe..07a0e15cb6b 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -38,7 +38,6 @@ ParquetBlockInputFormat::ParquetBlockInputFormat(ReadBuffer & in_, Block header_ Chunk ParquetBlockInputFormat::generate() { Chunk res; - const Block & header = getPort().getHeader(); if (!file_reader) prepareReader(); @@ -54,7 +53,7 @@ Chunk ParquetBlockInputFormat::generate() ++row_group_current; - ArrowColumnToCHColumn::arrowTableToCHChunk(res, table, header, "Parquet"); + arrow_column_to_ch_column->arrowTableToCHChunk(res, table); return res; } @@ -67,6 +66,29 @@ void ParquetBlockInputFormat::resetParser() row_group_current = 0; } +static size_t countIndicesForType(std::shared_ptr type) +{ + if (type->id() == arrow::Type::LIST) + return countIndicesForType(static_cast(type.get())->value_type()); + + if (type->id() == arrow::Type::STRUCT) + { + int indices = 0; + auto * struct_type = static_cast(type.get()); + for (int i = 0; i != struct_type->num_fields(); ++i) + indices += countIndicesForType(struct_type->field(i)->type()); + return indices; + } + + if (type->id() == arrow::Type::MAP) + { + auto * map_type = static_cast(type.get()); + return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()); + } + + return 1; +} + void ParquetBlockInputFormat::prepareReader() { THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(in), arrow::default_memory_pool(), &file_reader)); @@ -76,12 +98,21 @@ void ParquetBlockInputFormat::prepareReader() std::shared_ptr schema; THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); + arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), schema, "Parquet"); + + int index = 0; for (int i = 0; i < schema->num_fields(); ++i) { + /// STRUCT type require the number of indexes equal to the number of + /// nested elements, so we should recursively + /// count the number of indices we need for this type. 
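+ /// A standalone sketch of the same recursion on a toy type tree, assuming one index per leaf
+ /// column (lists add nothing extra here, unlike the ORC reader above, which needs one more per LIST):
+ ///
+ ///     struct ToyType
+ ///     {
+ ///         enum Kind { Primitive, List, Struct, Map } kind = Primitive;
+ ///         std::vector<ToyType> children;  /// element for List, fields for Struct, {key, value} for Map
+ ///     };
+ ///
+ ///     size_t countLeafIndices(const ToyType & type)
+ ///     {
+ ///         if (type.kind == ToyType::Primitive)
+ ///             return 1;
+ ///         size_t indices = 0;
+ ///         for (const auto & child : type.children)
+ ///             indices += countLeafIndices(child);
+ ///         return indices;
+ ///     }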
+ int indexes_count = countIndicesForType(schema->field(i)->type()); if (getPort().getHeader().has(schema->field(i)->name())) { - column_indices.push_back(i); + for (int j = 0; j != indexes_count; ++j) + column_indices.push_back(index + j); } + index += indexes_count; } } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 0841e82d4d0..b68f97c005a 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -12,6 +12,8 @@ namespace arrow { class Buffer; } namespace DB { +class ArrowColumnToCHColumn; + class ParquetBlockInputFormat : public IInputFormat { public: @@ -32,6 +34,7 @@ private: int row_group_total = 0; // indices of columns to read from Parquet file std::vector column_indices; + std::unique_ptr arrow_column_to_ch_column; int row_group_current = 0; }; diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 78b6a9c53a4..800fd0ff0e8 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "ArrowBufferedStreams.h" #include "CHColumnToArrowColumn.h" @@ -32,11 +31,16 @@ ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Blo void ParquetBlockOutputFormat::consume(Chunk chunk) { - const Block & header = getPort(PortKind::Main).getHeader(); const size_t columns_num = chunk.getNumColumns(); std::shared_ptr arrow_table; - CHColumnToArrowColumn::chChunkToArrowTable(arrow_table, header, chunk, columns_num, "Parquet"); + if (!ch_column_to_arrow_column) + { + const Block & header = getPort(PortKind::Main).getHeader(); + ch_column_to_arrow_column = std::make_unique(header, "Parquet"); + } + + ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); if (!file_writer) { diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h index 11d746a0a6d..8114d1ab494 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h @@ -21,6 +21,9 @@ namespace arrow namespace DB { + +class CHColumnToArrowColumn; + class ParquetBlockOutputFormat : public IOutputFormat { public: @@ -36,6 +39,7 @@ private: const FormatSettings format_settings; std::unique_ptr file_writer; + std::unique_ptr ch_column_to_arrow_column; }; } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index a96748ceb63..0e8c7e0a263 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -190,9 +190,10 @@ protected: /// Initially reserved virtual column name may be shadowed by real column. bool isVirtualColumn(const String & column_name, const StorageMetadataPtr & metadata_snapshot) const; - private: + StorageID storage_id; + mutable std::mutex id_mutex; /// Multiversion storage metadata. 
Allows to read/write storage metadata diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index f8ff7fe697a..54fcfc1adc9 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -223,6 +223,12 @@ public: DeleteOnDestroy, /// part was moved to another disk and should be deleted in own destructor }; + static constexpr auto all_part_states = + { + State::Temporary, State::PreCommitted, State::Committed, State::Outdated, State::Deleting, + State::DeleteOnDestroy + }; + using TTLInfo = MergeTreeDataPartTTLInfo; using TTLInfos = MergeTreeDataPartTTLInfos; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b0bf0c8e672..abc37f52ff9 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2156,8 +2156,7 @@ bool MergeTreeData::renameTempPartAndReplace( LOG_TRACE(log, "Renaming temporary part {} to {}.", part->relative_path, part_name); - auto it_duplicate = data_parts_by_info.find(part_info); - if (it_duplicate != data_parts_by_info.end()) + if (auto it_duplicate = data_parts_by_info.find(part_info); it_duplicate != data_parts_by_info.end()) { String message = "Part " + (*it_duplicate)->getNameWithState() + " already exists"; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 4f33aa30bdc..65d875aa9cf 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -402,6 +402,7 @@ public: /// Returns a copy of the list so that the caller shouldn't worry about locks. DataParts getDataParts(const DataPartStates & affordable_states) const; + /// Returns sorted list of the parts with specified states /// out_states will contain snapshot of each part state DataPartsVector getDataPartsVector( diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp index 7f167f929e5..4a73658e8a4 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp @@ -262,8 +262,8 @@ void ReplicatedMergeTreeBlockOutputStream::commitPart( { log_entry.type = StorageReplicatedMergeTree::LogEntry::ATTACH_PART; - /// We don't need to involve ZooKeeper to obtain the checksums as by the time we get - /// the MutableDataPartPtr here, we already have the data thus being able to + /// We don't need to involve ZooKeeper to obtain checksums as by the time we get + /// MutableDataPartPtr here, we already have the data thus being able to /// calculate the checksums. log_entry.part_checksum = part->checksums.getTotalChecksumHex(); } @@ -384,6 +384,7 @@ void ReplicatedMergeTreeBlockOutputStream::commitPart( MergeTreeData::Transaction transaction(storage); /// If you can not add a part to ZK, we'll remove it back from the working set. 
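    /// The MergeTreeData::Transaction above is what implements that rollback: renameTempPartAndAdd()
    /// registers the part in the transaction, and if the ZooKeeper request later fails the transaction
    /// is never committed, so the part is removed from the working set again.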
bool renamed = false; + try { renamed = storage.renameTempPartAndAdd(part, nullptr, &transaction); @@ -394,6 +395,7 @@ void ReplicatedMergeTreeBlockOutputStream::commitPart( && e.code() != ErrorCodes::PART_IS_TEMPORARILY_LOCKED) throw; } + if (!renamed) { if (is_already_existing_part) diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index ce7fc38b904..b3b9ce31ff5 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -257,7 +257,7 @@ If you use the Replicated version of engines, see https://clickhouse.tech/docs/e static StoragePtr create(const StorageFactory::Arguments & args) { - /** [Replicated][|Summing|Collapsing|Aggregating|Replacing|Graphite]MergeTree (2 * 7 combinations) engines + /** [Replicated][|Summing|VersionedCollapsing|Collapsing|Aggregating|Replacing|Graphite]MergeTree (2 * 7 combinations) engines * The argument for the engine should be: * - (for Replicated) The path to the table in ZooKeeper * - (for Replicated) Replica name in ZooKeeper diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index c7703c481c3..8507198a7f6 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -288,6 +288,7 @@ void replaceConstantExpressions( /// is one of the following: /// - QueryProcessingStage::Complete /// - QueryProcessingStage::WithMergeableStateAfterAggregation +/// - QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit /// - none (in this case regular WithMergeableState should be used) std::optional getOptimizedQueryProcessingStage(const SelectQueryInfo & query_info, bool extremes, const Block & sharding_key_block) { @@ -349,13 +350,13 @@ std::optional getOptimizedQueryProcessingStage(const // ORDER BY const ASTPtr order_by = select.orderBy(); if (order_by) - return QueryProcessingStage::WithMergeableStateAfterAggregation; + return QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit; // LIMIT BY // LIMIT // OFFSET if (select.limitBy() || select.limitLength() || select.limitOffset()) - return QueryProcessingStage::WithMergeableStateAfterAggregation; + return QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit; // Only simple SELECT FROM GROUP BY sharding_key can use Complete state. return QueryProcessingStage::Complete; @@ -514,11 +515,23 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( if (settings.distributed_group_by_no_merge) { if (settings.distributed_group_by_no_merge == DISTRIBUTED_GROUP_BY_NO_MERGE_AFTER_AGGREGATION) - return QueryProcessingStage::WithMergeableStateAfterAggregation; + { + if (settings.distributed_push_down_limit) + return QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit; + else + return QueryProcessingStage::WithMergeableStateAfterAggregation; + } else + { + /// NOTE: distributed_group_by_no_merge=1 does not respect distributed_push_down_limit + /// (since in this case queries processed separatelly and the initiator is just a proxy in this case). return QueryProcessingStage::Complete; + } } + if (settings.distributed_push_down_limit) + return QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit; + /// Nested distributed query cannot return Complete stage, /// since the parent query need to aggregate the results after. 
if (to_stage == QueryProcessingStage::WithMergeableState) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index aac1b708567..b67cd0a0af7 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -63,7 +63,7 @@ namespace /* Recursive directory listing with matched paths as a result. * Have the same method in StorageHDFS. */ -std::vector listFilesWithRegexpMatching(const std::string & path_for_ls, const std::string & for_match) +std::vector listFilesWithRegexpMatching(const std::string & path_for_ls, const std::string & for_match, size_t & total_bytes_to_read) { const size_t first_glob = for_match.find_first_of("*?{"); @@ -91,6 +91,7 @@ std::vector listFilesWithRegexpMatching(const std::string & path_fo { if (re2::RE2::FullMatch(file_name, matcher)) { + total_bytes_to_read += fs::file_size(it->path()); result.push_back(it->path().string()); } } @@ -99,7 +100,7 @@ std::vector listFilesWithRegexpMatching(const std::string & path_fo if (re2::RE2::FullMatch(file_name, matcher)) { /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. - Strings result_part = listFilesWithRegexpMatching(fs::path(full_path) / "", suffix_with_globs.substr(next_slash)); + Strings result_part = listFilesWithRegexpMatching(fs::path(full_path) / "", suffix_with_globs.substr(next_slash), total_bytes_to_read); std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); } } @@ -127,7 +128,7 @@ void checkCreationIsAllowed(ContextPtr context_global, const std::string & db_di } } -Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context) +Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) { fs::path user_files_absolute_path = fs::weakly_canonical(user_files_path); fs::path fs_table_path(table_path); @@ -137,9 +138,14 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user Strings paths; const String path = fs::weakly_canonical(fs_table_path); if (path.find_first_of("*?{") == std::string::npos) + { + std::error_code error; + if (fs::exists(path)) + total_bytes_to_read += fs::file_size(path, error); paths.push_back(path); + } else - paths = listFilesWithRegexpMatching("/", path); + paths = listFilesWithRegexpMatching("/", path, total_bytes_to_read); for (const auto & cur_path : paths) checkCreationIsAllowed(context, user_files_absolute_path, cur_path); @@ -173,7 +179,7 @@ StorageFile::StorageFile(const std::string & table_path_, const std::string & us : StorageFile(args) { is_db_table = false; - paths = getPathsList(table_path_, user_files_path, args.getContext()); + paths = getPathsList(table_path_, user_files_path, args.getContext(), total_bytes_to_read); if (args.format_name == "Distributed") { @@ -361,6 +367,13 @@ public: method = chooseCompressionMethod(current_path, storage->compression_method); } + /// For clickhouse-local add progress callback to display progress bar. 
+ if (context->getApplicationType() == Context::ApplicationType::LOCAL) + { + auto & in = static_cast(*nested_buffer); + in.setProgressCallback(context); + } + read_buf = wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); auto get_block_for_format = [&]() -> Block @@ -418,6 +431,7 @@ public: return {}; } + private: std::shared_ptr storage; StorageMetadataPtr metadata_snapshot; @@ -480,6 +494,11 @@ Pipe StorageFile::read( Pipes pipes; pipes.reserve(num_streams); + /// Set total number of bytes to process. For progress bar. + auto progress_callback = context->getFileProgressCallback(); + if (context->getApplicationType() == Context::ApplicationType::LOCAL && progress_callback) + progress_callback(FileProgress(0, total_bytes_to_read)); + for (size_t i = 0; i < num_streams; ++i) { const auto get_columns_for_format = [&]() -> ColumnsDescription diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 1626ce15b3a..843cd405828 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -58,7 +58,7 @@ public: NamesAndTypesList getVirtuals() const override; - static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context); + static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read); /// Check if the format is column-oriented. /// Is is useful because column oriented formats could effectively skip unknown columns @@ -103,6 +103,9 @@ private: mutable std::shared_timed_mutex rwlock; Poco::Logger * log = &Poco::Logger::get("StorageFile"); + + /// Total number of bytes to read (sums for multiple files in case of globs). Needed for progress bar. + size_t total_bytes_to_read = 0; }; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index c11f1580a2e..47f6bbd0ccc 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1,5 +1,6 @@ #include +#include "Common/hex.h" #include #include #include @@ -63,10 +64,13 @@ #include #include +#include #include +#include +#include +#include #include #include -#include #include @@ -135,6 +139,7 @@ namespace ErrorCodes extern const int INTERSERVER_SCHEME_DOESNT_MATCH; extern const int DUPLICATE_DATA_PART; extern const int BAD_ARGUMENTS; + extern const int CONCURRENT_ACCESS_NOT_SUPPORTED; } namespace ActionLocks @@ -153,10 +158,6 @@ static const auto MERGE_SELECTING_SLEEP_MS = 5 * 1000; static const auto MUTATIONS_FINALIZING_SLEEP_MS = 1 * 1000; static const auto MUTATIONS_FINALIZING_IDLE_SLEEP_MS = 5 * 1000; - -std::atomic_uint StorageReplicatedMergeTree::total_fetches {0}; - - void StorageReplicatedMergeTree::setZooKeeper() { /// Every ReplicatedMergeTree table is using only one ZooKeeper session. @@ -376,7 +377,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( if (attach && !current_zookeeper->exists(zookeeper_path + "/metadata")) { - LOG_WARNING(log, "No metadata in ZooKeeper: table will be in readonly mode."); + LOG_WARNING(log, "No metadata in ZooKeeper for {}: table will be in readonly mode.", zookeeper_path); is_readonly = true; has_metadata_in_zookeeper = false; return; @@ -384,10 +385,20 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( auto metadata_snapshot = getInMemoryMetadataPtr(); + /// May it be ZK lost not the whole root, so the upper check passed, but only the /replicas/replica + /// folder. 
+ if (attach && !current_zookeeper->exists(replica_path)) + { + LOG_WARNING(log, "No metadata in ZooKeeper for {}: table will be in readonly mode", replica_path); + is_readonly = true; + has_metadata_in_zookeeper = false; + return; + } + if (!attach) { if (!getDataParts().empty()) - throw Exception("Data directory for table already containing data parts" + throw Exception("Data directory for table already contains data parts" " - probably it was unclean DROP table or manual intervention." " You must either clear directory by hand or use ATTACH TABLE" " instead of CREATE TABLE if you need to use that parts.", ErrorCodes::INCORRECT_DATA); @@ -433,13 +444,17 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( { /// In old tables this node may missing or be empty String replica_metadata; - bool replica_metadata_exists = current_zookeeper->tryGet(replica_path + "/metadata", replica_metadata); + const bool replica_metadata_exists = current_zookeeper->tryGet(replica_path + "/metadata", replica_metadata); + if (!replica_metadata_exists || replica_metadata.empty()) { /// We have to check shared node granularity before we create ours. other_replicas_fixed_granularity = checkFixedGranualrityInZookeeper(); + ReplicatedMergeTreeTableMetadata current_metadata(*this, metadata_snapshot); - current_zookeeper->createOrUpdate(replica_path + "/metadata", current_metadata.toString(), zkutil::CreateMode::Persistent); + + current_zookeeper->createOrUpdate(replica_path + "/metadata", current_metadata.toString(), + zkutil::CreateMode::Persistent); } checkTableStructure(replica_path, metadata_snapshot); @@ -460,8 +475,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( current_zookeeper->get(zookeeper_path + "/metadata", &metadata_stat); metadata_version = metadata_stat.version; } - /// Temporary directories contain untinalized results of Merges or Fetches (after forced restart) - /// and don't allow to reinitialize them, so delete each of them immediately + /// Temporary directories contain uninitialized results of Merges or Fetches (after forced restart), + /// don't allow to reinitialize them, delete each of them immediately. clearOldTemporaryDirectories(0); clearOldWriteAheadLogs(); } @@ -727,12 +742,13 @@ void StorageReplicatedMergeTree::createReplica(const StorageMetadataPtr & metada String replicas_value; if (!zookeeper->tryGet(zookeeper_path + "/replicas", replicas_value, &replicas_stat)) - throw Exception(fmt::format("Cannot create a replica of the table {}, because the last replica of the table was dropped right now", - zookeeper_path), ErrorCodes::ALL_REPLICAS_LOST); + throw Exception(ErrorCodes::ALL_REPLICAS_LOST, + "Cannot create a replica of the table {}, because the last replica of the table was dropped right now", + zookeeper_path); /// It is not the first replica, we will mark it as "lost", to immediately repair (clone) from existing replica. /// By the way, it's possible that the replica will be first, if all previous replicas were removed concurrently. - String is_lost_value = replicas_stat.numChildren ? "1" : "0"; + const String is_lost_value = replicas_stat.numChildren ? 
"1" : "0"; Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, "", @@ -761,21 +777,18 @@ void StorageReplicatedMergeTree::createReplica(const StorageMetadataPtr & metada Coordination::Responses responses; code = zookeeper->tryMulti(ops, responses); - if (code == Coordination::Error::ZNODEEXISTS) + + switch (code) { - throw Exception("Replica " + replica_path + " already exists.", ErrorCodes::REPLICA_IS_ALREADY_EXIST); - } - else if (code == Coordination::Error::ZBADVERSION) - { - LOG_ERROR(log, "Retrying createReplica(), because some other replicas were created at the same time"); - } - else if (code == Coordination::Error::ZNONODE) - { - throw Exception("Table " + zookeeper_path + " was suddenly removed.", ErrorCodes::ALL_REPLICAS_LOST); - } - else - { - zkutil::KeeperMultiException::check(code, ops, responses); + case Coordination::Error::ZNODEEXISTS: + throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} already exists", replica_path); + case Coordination::Error::ZBADVERSION: + LOG_ERROR(log, "Retrying createReplica(), because some other replicas were created at the same time"); + break; + case Coordination::Error::ZNONODE: + throw Exception(ErrorCodes::ALL_REPLICAS_LOST, "Table {} was suddenly removed", zookeeper_path); + default: + zkutil::KeeperMultiException::check(code, ops, responses); } } while (code == Coordination::Error::ZBADVERSION); } @@ -1123,6 +1136,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) size_t unexpected_parts_nonnew = 0; UInt64 unexpected_parts_nonnew_rows = 0; UInt64 unexpected_parts_rows = 0; + for (const auto & part : unexpected_parts) { if (part->info.level > 0) @@ -1134,20 +1148,17 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) unexpected_parts_rows += part->rows_count; } - /// Additional helpful statistics - auto get_blocks_count_in_data_part = [&] (const String & part_name) -> UInt64 - { - MergeTreePartInfo part_info; - if (MergeTreePartInfo::tryParsePartName(part_name, &part_info, format_version)) - return part_info.getBlocksCount(); + const UInt64 parts_to_fetch_blocks = std::accumulate(parts_to_fetch.cbegin(), parts_to_fetch.cend(), 0, + [&](UInt64 acc, const String& part_name) + { + MergeTreePartInfo part_info; - LOG_ERROR(log, "Unexpected part name: {}", part_name); - return 0; - }; + if (MergeTreePartInfo::tryParsePartName(part_name, &part_info, format_version)) + return acc + part_info.getBlocksCount(); - UInt64 parts_to_fetch_blocks = 0; - for (const String & name : parts_to_fetch) - parts_to_fetch_blocks += get_blocks_count_in_data_part(name); + LOG_ERROR(log, "Unexpected part name: {}", part_name); + return acc; + }); /** We can automatically synchronize data, * if the ratio of the total number of errors to the total number of parts (minimum - on the local filesystem or in ZK) @@ -1499,7 +1510,7 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) { if (MutableDataPartPtr part = attachPartHelperFoundValidPart(entry); part) { - LOG_TRACE(log, "Found valid part to attach from local data, preparing the transaction"); + LOG_TRACE(log, "Found valid local part for {}, preparing the transaction", part->name); Transaction transaction(*this); @@ -1512,7 +1523,9 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) return true; } - LOG_TRACE(log, "Didn't find part with the correct checksums, will fetch it from other replica"); + LOG_TRACE(log, "Didn't find valid local part for {} ({}), will fetch it from other replica", + 
entry.new_part_name, + entry.actual_new_part_name); } if (is_get_or_attach && entry.source_replica == replica_name) @@ -2732,6 +2745,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo /// Remove local parts if source replica does not have them, because such parts will never be fetched by other replicas. Strings local_parts_in_zk = zookeeper->getChildren(fs::path(replica_path) / "parts"); Strings parts_to_remove_from_zk; + for (const auto & part : local_parts_in_zk) { if (active_parts_set.getContainingPart(part).empty()) @@ -2740,10 +2754,13 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo LOG_WARNING(log, "Source replica does not have part {}. Removing it from ZooKeeper.", part); } } + tryRemovePartsFromZooKeeperWithRetries(parts_to_remove_from_zk); auto local_active_parts = getDataParts(); + DataPartsVector parts_to_remove_from_working_set; + for (const auto & part : local_active_parts) { if (active_parts_set.getContainingPart(part->name).empty()) @@ -2756,6 +2773,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo if (getSettings()->detach_old_local_parts_when_cloning_replica) { auto metadata_snapshot = getInMemoryMetadataPtr(); + for (const auto & part : parts_to_remove_from_working_set) { LOG_INFO(log, "Detaching {}", part->relative_path); @@ -2768,7 +2786,35 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo for (const String & name : active_parts) { LogEntry log_entry; - log_entry.type = LogEntry::GET_PART; + + if (!are_restoring_replica) + log_entry.type = LogEntry::GET_PART; + else + { + LOG_DEBUG(log, "Obtaining checksum for path {}", name); + + // The part we want to fetch is probably present in detached/ folder. + // However, we need to get part's checksum to check if it's not corrupt. 
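+ // The checksum is taken from the source replica's ZooKeeper nodes: either parsed from the
+ // part znode itself (newer part header format) or, if that znode is empty, read from the
+ // separate <part>/checksums child, which is the same two-step lookup fetchPart() performs.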
+ log_entry.type = LogEntry::ATTACH_PART; + + MinimalisticDataPartChecksums desired_checksums; + + const fs::path part_path = fs::path(source_path) / "parts" / name; + + const String part_znode = zookeeper->get(part_path); + + if (!part_znode.empty()) + desired_checksums = ReplicatedMergeTreePartHeader::fromString(part_znode).getChecksums(); + else + { + String desired_checksums_str = zookeeper->get(part_path / "checksums"); + desired_checksums = MinimalisticDataPartChecksums::deserializeFrom(desired_checksums_str); + } + + const auto [lo, hi] = desired_checksums.hash_of_all_files; + log_entry.part_checksum = getHexUIntUppercase(hi) + getHexUIntUppercase(lo); + } + log_entry.source_replica = ""; log_entry.new_part_name = name; log_entry.create_time = tryGetPartCreateTime(zookeeper, source_path, name); @@ -2868,6 +2914,7 @@ void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zooke Coordination::Stat is_lost_stat; bool is_new_replica = true; String res; + if (zookeeper->tryGet(fs::path(replica_path) / "is_lost", res, &is_lost_stat)) { if (res == "0") @@ -3968,6 +4015,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora MinimalisticDataPartChecksums desired_checksums; String part_path = fs::path(source_replica_path) / "parts" / part_name; String part_znode = zookeeper->get(part_path); + if (!part_znode.empty()) desired_checksums = ReplicatedMergeTreePartHeader::fromString(part_znode).getChecksums(); else @@ -5030,6 +5078,59 @@ bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const St return true; } +void StorageReplicatedMergeTree::restoreMetadataInZooKeeper() +{ + LOG_INFO(log, "Restoring replica metadata"); + + if (!is_readonly || has_metadata_in_zookeeper) + throw Exception(ErrorCodes::LOGICAL_ERROR, "It's a bug: replica is not readonly"); + + if (are_restoring_replica.exchange(true)) + throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Replica restoration in progress"); + + auto metadata_snapshot = getInMemoryMetadataPtr(); + + const DataPartsVector all_parts = getDataPartsVector(IMergeTreeDataPart::all_part_states); + Strings active_parts_names; + + /// Why all parts (not only Committed) are moved to detached/: + /// After ZK metadata restoration ZK resets sequential counters (including block number counters), so one may + /// potentially encounter a situation that a part we want to attach already exists. 
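+ /// The overall flow of the restoration below, in short:
+ ///   1. move every local part to detached/, remembering which ones were Committed;
+ ///   2. recreate the table and replica nodes in ZooKeeper (createTableIfNotExists / createReplica);
+ ///   3. clear the readonly flag;
+ ///   4. on the first replica, attach the previously Committed parts back;
+ ///   5. start the table up again.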
+ for (const auto & part : all_parts) + { + if (part->getState() == DataPartState::Committed) + active_parts_names.push_back(part->name); + + forgetPartAndMoveToDetached(part); + } + + LOG_INFO(log, "Moved all parts to detached/"); + + const bool is_first_replica = createTableIfNotExists(metadata_snapshot); + + LOG_INFO(log, "Created initial ZK nodes, replica is first: {}", is_first_replica); + + if (!is_first_replica) + createReplica(metadata_snapshot); + + createNewZooKeeperNodes(); + + LOG_INFO(log, "Created ZK nodes for table"); + + is_readonly = false; + has_metadata_in_zookeeper = true; + + if (is_first_replica) + for (const String& part_name : active_parts_names) + attachPartition(std::make_shared(part_name), metadata_snapshot, true, getContext()); + + LOG_INFO(log, "Attached all partitions, starting table"); + + startup(); + + are_restoring_replica.store(false); +} + void StorageReplicatedMergeTree::dropPartNoWaitNoThrow(const String & part_name) { assertNotReadonly(); @@ -6938,8 +7039,10 @@ bool StorageReplicatedMergeTree::dropAllPartsInPartition( zookeeper.get(alter_partition_version_path, &alter_partition_version_stat); MergeTreePartInfo drop_range_info; - /// It prevent other replicas from assigning merges which intersect locked block number. + + /// It would prevent other replicas from assigning merges which intersect locked block number. std::optional delimiting_block_lock; + if (!getFakePartCoveringAllPartsInPartition(partition_id, drop_range_info, delimiting_block_lock)) { LOG_INFO(log, "Will not drop partition {}, it is empty.", partition_id); @@ -6960,23 +7063,31 @@ bool StorageReplicatedMergeTree::dropAllPartsInPartition( entry.create_time = time(nullptr); Coordination::Requests ops; - ops.emplace_back(zkutil::makeCreateRequest(fs::path(zookeeper_path) / "log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); + + ops.emplace_back(zkutil::makeCreateRequest(fs::path(zookeeper_path) / "log/log-", entry.toString(), + zkutil::CreateMode::PersistentSequential)); + /// Check and update version to avoid race with REPLACE_RANGE. /// Otherwise new parts covered by drop_range_info may appear after execution of current DROP_RANGE entry /// as a result of execution of concurrently created REPLACE_RANGE entry. 
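    /// This is an optimistic check-and-set: if another ALTER PARTITION bumps the version between
    /// the read above and this multi-request, the request fails with ZBADVERSION and the query
    /// reports CANNOT_ASSIGN_ALTER (see the error handling below).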
ops.emplace_back(zkutil::makeCheckRequest(alter_partition_version_path, alter_partition_version_stat.version)); ops.emplace_back(zkutil::makeSetRequest(alter_partition_version_path, "", -1)); + /// Just update version, because merges assignment relies on it ops.emplace_back(zkutil::makeSetRequest(fs::path(zookeeper_path) / "log", "", -1)); delimiting_block_lock->getUnlockOps(ops); + if (auto txn = query_context->getZooKeeperMetadataTransaction()) txn->moveOpsTo(ops); + Coordination::Responses responses; Coordination::Error code = zookeeper.tryMulti(ops, responses); + if (code == Coordination::Error::ZOK) delimiting_block_lock->assumeUnlocked(); else if (code == Coordination::Error::ZBADVERSION) - throw Exception(ErrorCodes::CANNOT_ASSIGN_ALTER, "Cannot assign ALTER PARTITION, because another ALTER PARTITION query was concurrently executed"); + throw Exception(ErrorCodes::CANNOT_ASSIGN_ALTER, + "Cannot assign ALTER PARTITION because another ALTER PARTITION query was concurrently executed"); else zkutil::KeeperMultiException::check(code, ops, responses); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 505eb4e87c4..396ec7a1741 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -35,7 +35,7 @@ namespace DB { -/** The engine that uses the merge tree (see MergeTreeData) and replicated through ZooKeeper. +/** The engine that uses the merge tree (see MergeTreeData) and is replicated through ZooKeeper. * * ZooKeeper is used for the following things: * - the structure of the table (/metadata, /columns) @@ -57,6 +57,7 @@ namespace DB * Log - a sequence of entries (LogEntry) about what to do. * Each entry is one of: * - normal data insertion (GET), + * - data insertion with a possible attach from local data (ATTACH), * - merge (MERGE), * - delete the partition (DROP). * @@ -65,10 +66,8 @@ namespace DB * Despite the name of the "queue", execution can be reordered, if necessary (shouldExecuteLogEntry, executeLogEntry). * In addition, the records in the queue can be generated independently (not from the log), in the following cases: * - when creating a new replica, actions are put on GET from other replicas (createReplica); - * - if the part is corrupt (removePartAndEnqueueFetch) or absent during the check (at start - checkParts, while running - searchForMissingPart), - * actions are put on GET from other replicas; - * - * TODO Update the GET part after rewriting the code (search locally). + * - if the part is corrupt (removePartAndEnqueueFetch) or absent during the check + * (at start - checkParts, while running - searchForMissingPart), actions are put on GET from other replicas; * * The replica to which INSERT was made in the queue will also have an entry of the GET of this data. * Such an entry is considered to be executed as soon as the queue handler sees it. @@ -240,6 +239,13 @@ public: /// Get best replica having this partition on S3 String getSharedDataReplica(const IMergeTreeDataPart & part) const; + inline String getReplicaName() const { return replica_name; } + + /// Restores table metadata if ZooKeeper lost it. + /// Used only on restarted readonly replicas (not checked). All active (Committed) parts are moved to detached/ + /// folder and attached. Parts in all other states are just moved to detached/ folder. 
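+ /// A second concurrent call is rejected with CONCURRENT_ACCESS_NOT_SUPPORTED via the
+ /// are_restoring_replica flag declared below, so only one restoration can run at a time.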
+ void restoreMetadataInZooKeeper(); + /// Get throttler for replicated fetches ThrottlerPtr getFetchesThrottler() const { @@ -253,6 +259,8 @@ public: } private: + std::atomic_bool are_restoring_replica {false}; + /// Get a sequential consistent view of current parts. ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock getMaxAddedBlocks() const; @@ -332,7 +340,7 @@ private: Poco::Event partial_shutdown_event {false}; /// Poco::Event::EVENT_MANUALRESET /// Limiting parallel fetches per node - static std::atomic_uint total_fetches; + static inline std::atomic_uint total_fetches {0}; /// Limiting parallel fetches per one table std::atomic_uint current_table_fetches {0}; @@ -389,8 +397,9 @@ private: */ bool createTableIfNotExists(const StorageMetadataPtr & metadata_snapshot); - /** Creates a replica in ZooKeeper and adds to the queue all that it takes to catch up with the rest of the replicas. - */ + /** + * Creates a replica in ZooKeeper and adds to the queue all that it takes to catch up with the rest of the replicas. + */ void createReplica(const StorageMetadataPtr & metadata_snapshot); /** Create nodes in the ZK, which must always be, but which might not exist when older versions of the server are running. diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index 65a536d5d30..3c4ab0edbab 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -79,7 +79,8 @@ ColumnsDescription ITableFunctionFileLike::getActualTableStructure(ContextPtr co if (structure.empty()) { assert(getName() == "file" && format == "Distributed"); - Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context); + size_t total_bytes_to_read = 0; + Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); if (paths.empty()) throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); auto read_stream = StorageDistributedDirectoryMonitor::createStreamFromFile(paths[0]); diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 8b8462a7125..0af76fe2648 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -377,8 +377,8 @@ class ClickhouseIntegrationTestsRunner: test_cmd = ' '.join([test for test in sorted(test_names)]) parallel_cmd = " --parallel {} ".format(num_workers) if num_workers > 0 else "" - cmd = "cd {}/tests/integration && ./runner --tmpfs {} -t {} {} '-ss -rfEp --color=no --durations=0 {}' | tee {}".format( - repo_path, image_cmd, test_cmd, parallel_cmd, _get_deselect_option(self.should_skip_tests()), output_path) + cmd = "cd {}/tests/integration && ./runner --tmpfs {} -t {} {} '-ss -rfEp --run-id={} --color=no --durations=0 {}' | tee {}".format( + repo_path, image_cmd, test_cmd, parallel_cmd, i, _get_deselect_option(self.should_skip_tests()), output_path) with open(log_path, 'w') as log: logging.info("Executing cmd: %s", cmd) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index fa14e2b06d6..993e7a6e973 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -28,4 +28,10 @@ def cleanup_environment(): logging.exception(f"cleanup_environment:{str(e)}") pass - yield \ No newline at end of file + yield + +def pytest_addoption(parser): + parser.addoption("--run-id", default="", help="run-id is used as postfix in _instances_{} directory") + +def 
pytest_configure(config): + os.environ['INTEGRATION_TESTS_RUN_ID'] = config.option.run_id diff --git a/tests/integration/helpers/client.py b/tests/integration/helpers/client.py index 95f8a58dbf1..ceebf3c23bf 100644 --- a/tests/integration/helpers/client.py +++ b/tests/integration/helpers/client.py @@ -1,6 +1,7 @@ import os import subprocess as sp import tempfile +import logging from threading import Timer @@ -105,6 +106,7 @@ class CommandRequest: stderr = self.stderr_file.read().decode('utf-8', errors='replace') if self.timer is not None and not self.process_finished_before_timeout and not self.ignore_error: + logging.debug(f"Timed out. Last stdout:{stdout}, stderr:{stderr}") raise QueryTimeoutExceedException('Client timed out!') if (self.process.returncode != 0 or stderr) and not self.ignore_error: diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 70a52e42048..bd2f7d2bd8a 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -29,7 +29,6 @@ from dict2xml import dict2xml from kazoo.client import KazooClient from kazoo.exceptions import KazooException from minio import Minio -from minio.deleteobjects import DeleteObject from helpers.test_tools import assert_eq_with_retry import docker @@ -172,6 +171,13 @@ def enable_consistent_hash_plugin(rabbitmq_id): p.communicate() return p.returncode == 0 +def get_instances_dir(): + if 'INTEGRATION_TESTS_RUN_ID' in os.environ and os.environ['INTEGRATION_TESTS_RUN_ID']: + return '_instances_' + shlex.quote(os.environ['INTEGRATION_TESTS_RUN_ID']) + else: + return '_instances' + + class ClickHouseCluster: """ClickHouse cluster with several instances and (possibly) ZooKeeper. @@ -203,7 +209,14 @@ class ClickHouseCluster: project_name = pwd.getpwuid(os.getuid()).pw_name + p.basename(self.base_dir) + self.name # docker-compose removes everything non-alphanumeric from project names so we do it too. 
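Taken together, the pieces above thread a run id from ci-runner.py's --run-id argument through the new pytest option in conftest.py into the INTEGRATION_TESTS_RUN_ID environment variable, which get_instances_dir() and ClickHouseCluster then fold into the instances directory name. A rough, self-contained sketch of that naming logic (not the exact helper; the cluster_name parameter is only illustrative):

import os
import os.path as p
import shlex

def instances_dir(base_dir, cluster_name=''):
    # '_instances', optionally suffixed with the cluster name and the run id.
    name = '_instances'
    if cluster_name:
        name += '_' + cluster_name
    run_id = os.environ.get('INTEGRATION_TESTS_RUN_ID', '')
    if run_id:
        name += '_' + shlex.quote(run_id)
    return p.join(base_dir, name)

With a non-empty run id the state of a test lands in _instances_<id> instead of _instances, which is why pytest.ini switches to ignoring _instances* and tests now build config paths through get_instances_dir().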
self.project_name = re.sub(r'[^a-z0-9]', '', project_name.lower()) - self.instances_dir = p.join(self.base_dir, '_instances' + ('' if not self.name else '_' + self.name)) + instances_dir_name = '_instances' + if self.name: + instances_dir_name += '_' + self.name + + if 'INTEGRATION_TESTS_RUN_ID' in os.environ and os.environ['INTEGRATION_TESTS_RUN_ID']: + instances_dir_name += '_' + shlex.quote(os.environ['INTEGRATION_TESTS_RUN_ID']) + + self.instances_dir = p.join(self.base_dir, instances_dir_name) self.docker_logs_path = p.join(self.instances_dir, 'docker.log') self.env_file = p.join(self.instances_dir, DEFAULT_ENV_NAME) self.env_variables = {} @@ -421,7 +434,15 @@ class ClickHouseCluster: pass def get_docker_handle(self, docker_id): - return self.docker_client.containers.get(docker_id) + exception = None + for i in range(5): + try: + return self.docker_client.containers.get(docker_id) + except Exception as ex: + print("Got exception getting docker handle", str(ex)) + time.sleep(i * 2) + exception = ex + raise exception def get_client_cmd(self): cmd = self.client_bin_path @@ -577,7 +598,7 @@ class ClickHouseCluster: self.base_cmd.extend(['--file', p.join(docker_compose_yml_dir, 'docker_compose_hdfs.yml')]) self.base_hdfs_cmd = ['docker-compose', '--env-file', instance.env_file, '--project-name', self.project_name, '--file', p.join(docker_compose_yml_dir, 'docker_compose_hdfs.yml')] - print("HDFS BASE CMD:{}".format(self.base_hdfs_cmd)) + logging.debug(f"HDFS BASE CMD:{self.base_hdfs_cmd}") return self.base_hdfs_cmd def setup_kerberized_hdfs_cmd(self, instance, env_variables, docker_compose_yml_dir): @@ -1217,8 +1238,8 @@ class ClickHouseCluster: for bucket in buckets: if minio_client.bucket_exists(bucket): delete_object_list = map( - lambda x: DeleteObject(x.object_name), - minio_client.list_objects(bucket, recursive=True), + lambda x: x.object_name, + minio_client.list_objects_v2(bucket, recursive=True), ) errors = minio_client.remove_objects(bucket, delete_object_list) for error in errors: @@ -1468,9 +1489,9 @@ class ClickHouseCluster: instance.docker_client = self.docker_client instance.ip_address = self.get_instance_ip(instance.name) - logging.debug("Waiting for ClickHouse start...") + logging.debug(f"Waiting for ClickHouse start in {instance}, ip: {instance.ip_address}...") instance.wait_for_start(start_timeout) - logging.debug("ClickHouse started") + logging.debug(f"ClickHouse {instance} started") instance.client = Client(instance.ip_address, command=self.client_bin_path) @@ -1864,8 +1885,7 @@ class ClickHouseInstance: self.start_clickhouse(stop_start_wait_sec) def exec_in_container(self, cmd, detach=False, nothrow=False, **kwargs): - container_id = self.get_docker_handle().id - return self.cluster.exec_in_container(container_id, cmd, detach, nothrow, **kwargs) + return self.cluster.exec_in_container(self.docker_id, cmd, detach, nothrow, **kwargs) def contains_in_log(self, substring): result = self.exec_in_container( @@ -1905,8 +1925,7 @@ class ClickHouseInstance: ["bash", "-c", "echo $(if [ -e '{}' ]; then echo 'yes'; else echo 'no'; fi)".format(path)]) == 'yes\n' def copy_file_to_container(self, local_path, dest_path): - container_id = self.get_docker_handle().id - return self.cluster.copy_file_to_container(container_id, local_path, dest_path) + return self.cluster.copy_file_to_container(self.docker_id, local_path, dest_path) def get_process_pid(self, process_name): output = self.exec_in_container(["bash", "-c", @@ -1961,6 +1980,7 @@ class ClickHouseInstance:
self.get_docker_handle().start() def wait_for_start(self, start_timeout=None, connection_timeout=None): + handle = self.get_docker_handle() if start_timeout is None or start_timeout <= 0: raise Exception("Invalid timeout: {}".format(start_timeout)) @@ -1983,11 +2003,10 @@ class ClickHouseInstance: return False while True: - handle = self.get_docker_handle() + handle.reload() status = handle.status if status == 'exited': - raise Exception("Instance `{}' failed to start. Container status: {}, logs: {}" - .format(self.name, status, handle.logs().decode('utf-8'))) + raise Exception(f"Instance `{self.name}' failed to start. Container status: {status}, logs: {handle.logs().decode('utf-8')}") deadline = start_time + start_timeout # It is possible that server starts slowly. @@ -1997,9 +2016,8 @@ class ClickHouseInstance: current_time = time.time() if current_time >= deadline: - raise Exception("Timed out while waiting for instance `{}' with ip address {} to start. " - "Container status: {}, logs: {}".format(self.name, self.ip_address, status, - handle.logs().decode('utf-8'))) + raise Exception(f"Timed out while waiting for instance `{self.name}' with ip address {self.ip_address} to start. " \ + f"Container status: {status}, logs: {handle.logs().decode('utf-8')}") socket_timeout = min(start_timeout, deadline - current_time) diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index 93478c4dd49..ef530c4836b 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -1,5 +1,6 @@ import difflib import time +import logging from io import IOBase @@ -56,7 +57,7 @@ def assert_eq_with_retry(instance, query, expectation, retry_count=20, sleep_tim break time.sleep(sleep_time) except Exception as ex: - print(("assert_eq_with_retry retry {} exception {}".format(i + 1, ex))) + logging.exception(f"assert_eq_with_retry retry {i+1} exception {ex}") time.sleep(sleep_time) else: val = TSV(get_result(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, @@ -76,7 +77,7 @@ def assert_logs_contain_with_retry(instance, substring, retry_count=20, sleep_ti break time.sleep(sleep_time) except Exception as ex: - print("contains_in_log_with_retry retry {} exception {}".format(i + 1, ex)) + logging.exception(f"contains_in_log_with_retry retry {i+1} exception {ex}") time.sleep(sleep_time) else: raise AssertionError("'{}' not found in logs".format(substring)) @@ -89,7 +90,7 @@ def exec_query_with_retry(instance, query, retry_count=40, sleep_time=0.5, setti break except Exception as ex: exception = ex - print("Failed to execute query '", query, "' on instance", instance.name, "will retry") + logging.exception(f"Failed to execute query '{query}' on instance '{instance.name}' will retry") time.sleep(sleep_time) else: raise exception diff --git a/tests/integration/pytest.ini b/tests/integration/pytest.ini index 737a37ee9d0..6d451adf7eb 100644 --- a/tests/integration/pytest.ini +++ b/tests/integration/pytest.ini @@ -1,6 +1,6 @@ [pytest] python_files = test*.py -norecursedirs = _instances +norecursedirs = _instances* timeout = 1800 junit_duration_report = call junit_suite_name = integration diff --git a/tests/integration/test_attach_without_fetching/test.py b/tests/integration/test_attach_without_fetching/test.py index 605ca6a4f51..874f5b36ddc 100644 --- a/tests/integration/test_attach_without_fetching/test.py +++ b/tests/integration/test_attach_without_fetching/test.py @@ -16,11 +16,10 @@ def fill_node(node): 
'''.format(replica=node.name)) cluster = ClickHouseCluster(__file__) -configs =["configs/remote_servers.xml"] -node_1 = cluster.add_instance('replica1', with_zookeeper=True, main_configs=configs) -node_2 = cluster.add_instance('replica2', with_zookeeper=True, main_configs=configs) -node_3 = cluster.add_instance('replica3', with_zookeeper=True, main_configs=configs) +node_1 = cluster.add_instance('replica1', with_zookeeper=True) +node_2 = cluster.add_instance('replica2', with_zookeeper=True) +node_3 = cluster.add_instance('replica3', with_zookeeper=True) @pytest.fixture(scope="module") def start_cluster(): diff --git a/tests/integration/test_backup_with_other_granularity/test.py b/tests/integration/test_backup_with_other_granularity/test.py index 5ed1cb06787..832c1cf35ce 100644 --- a/tests/integration/test_backup_with_other_granularity/test.py +++ b/tests/integration/test_backup_with_other_granularity/test.py @@ -43,8 +43,8 @@ def test_backup_from_old_version(started_cluster): assert node1.query("SELECT COUNT() FROM dest_table") == "1\n" - node1.exec_in_container(['bash', '-c', - 'cp -r /var/lib/clickhouse/shadow/1/data/default/source_table/all_1_1_0/ /var/lib/clickhouse/data/default/dest_table/detached']) + node1.exec_in_container(['find', '/var/lib/clickhouse/shadow/1/data/default/source_table']) + node1.exec_in_container(['cp', '-r', '/var/lib/clickhouse/shadow/1/data/default/source_table/all_1_1_0/', '/var/lib/clickhouse/data/default/dest_table/detached']) assert node1.query("SELECT COUNT() FROM dest_table") == "1\n" @@ -81,8 +81,7 @@ def test_backup_from_old_version_setting(started_cluster): assert node2.query("SELECT COUNT() FROM dest_table") == "1\n" - node2.exec_in_container(['bash', '-c', - 'cp -r /var/lib/clickhouse/shadow/1/data/default/source_table/all_1_1_0/ /var/lib/clickhouse/data/default/dest_table/detached']) + node2.exec_in_container(['cp', '-r', '/var/lib/clickhouse/shadow/1/data/default/source_table/all_1_1_0/', '/var/lib/clickhouse/data/default/dest_table/detached']) assert node2.query("SELECT COUNT() FROM dest_table") == "1\n" @@ -123,8 +122,7 @@ def test_backup_from_old_version_config(started_cluster): assert node3.query("SELECT COUNT() FROM dest_table") == "1\n" - node3.exec_in_container(['bash', '-c', - 'cp -r /var/lib/clickhouse/shadow/1/data/default/source_table/all_1_1_0/ /var/lib/clickhouse/data/default/dest_table/detached']) + node3.exec_in_container(['cp', '-r', '/var/lib/clickhouse/shadow/1/data/default/source_table/all_1_1_0/', '/var/lib/clickhouse/data/default/dest_table/detached']) assert node3.query("SELECT COUNT() FROM dest_table") == "1\n" @@ -156,8 +154,7 @@ def test_backup_and_alter(started_cluster): node4.query("ALTER TABLE test.backup_table DROP PARTITION tuple()") - node4.exec_in_container(['bash', '-c', - 'cp -r /var/lib/clickhouse/shadow/1/data/test/backup_table/all_1_1_0/ /var/lib/clickhouse/data/test/backup_table/detached']) + node4.exec_in_container(['cp', '-r', '/var/lib/clickhouse/shadow/1/data/test/backup_table/all_1_1_0/', '/var/lib/clickhouse/data/test/backup_table/detached']) node4.query("ALTER TABLE test.backup_table ATTACH PARTITION tuple()") diff --git a/tests/integration/test_cluster_copier/test_three_nodes.py b/tests/integration/test_cluster_copier/test_three_nodes.py index acdc191154c..63b0bcc6679 100644 --- a/tests/integration/test_cluster_copier/test_three_nodes.py +++ b/tests/integration/test_cluster_copier/test_three_nodes.py @@ -39,7 +39,7 @@ class Task: for instance_name, _ in cluster.instances.items(): instance = 
cluster.instances[instance_name] instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_taxi_data.xml'), self.container_task_file) - print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file)) + logging.debug(f"Copied task file to container of '{instance_name}' instance. Path {self.container_task_file}") def start(self): @@ -48,11 +48,11 @@ class Task: node.query("DROP DATABASE IF EXISTS dailyhistory SYNC;") node.query("DROP DATABASE IF EXISTS monthlyhistory SYNC;") - instance = cluster.instances['first'] + first = cluster.instances['first'] # daily partition database - instance.query("CREATE DATABASE IF NOT EXISTS dailyhistory on cluster events;") - instance.query("""CREATE TABLE dailyhistory.yellow_tripdata_staging ON CLUSTER events + first.query("CREATE DATABASE IF NOT EXISTS dailyhistory on cluster events;") + first.query("""CREATE TABLE dailyhistory.yellow_tripdata_staging ON CLUSTER events ( id UUID DEFAULT generateUUIDv4(), vendor_id String, @@ -84,12 +84,12 @@ class Task: ORDER BY (tpep_pickup_datetime, id) PARTITION BY (toYYYYMMDD(tpep_pickup_datetime))""") - instance.query("""CREATE TABLE dailyhistory.yellow_tripdata + first.query("""CREATE TABLE dailyhistory.yellow_tripdata ON CLUSTER events AS dailyhistory.yellow_tripdata_staging ENGINE = Distributed('events', 'dailyhistory', yellow_tripdata_staging, sipHash64(id) % 3);""") - instance.query("""INSERT INTO dailyhistory.yellow_tripdata + first.query("""INSERT INTO dailyhistory.yellow_tripdata SELECT * FROM generateRandom( 'id UUID DEFAULT generateUUIDv4(), vendor_id String, @@ -119,8 +119,8 @@ class Task: 1, 10, 2) LIMIT 50;""") # monthly partition database - instance.query("create database IF NOT EXISTS monthlyhistory on cluster events;") - instance.query("""CREATE TABLE monthlyhistory.yellow_tripdata_staging ON CLUSTER events + first.query("create database IF NOT EXISTS monthlyhistory on cluster events;") + first.query("""CREATE TABLE monthlyhistory.yellow_tripdata_staging ON CLUSTER events ( id UUID DEFAULT generateUUIDv4(), vendor_id String, @@ -153,16 +153,16 @@ class Task: ORDER BY (tpep_pickup_datetime, id) PARTITION BY (pickup_location_id, toYYYYMM(tpep_pickup_datetime))""") - instance.query("""CREATE TABLE monthlyhistory.yellow_tripdata + first.query("""CREATE TABLE monthlyhistory.yellow_tripdata ON CLUSTER events AS monthlyhistory.yellow_tripdata_staging ENGINE = Distributed('events', 'monthlyhistory', yellow_tripdata_staging, sipHash64(id) % 3);""") def check(self): - instance = cluster.instances["first"] - a = TSV(instance.query("SELECT count() from dailyhistory.yellow_tripdata")) - b = TSV(instance.query("SELECT count() from monthlyhistory.yellow_tripdata")) + first = cluster.instances["first"] + a = TSV(first.query("SELECT count() from dailyhistory.yellow_tripdata")) + b = TSV(first.query("SELECT count() from monthlyhistory.yellow_tripdata")) assert a == b, "Distributed tables" for instance_name, instance in cluster.instances.items(): @@ -187,10 +187,10 @@ def execute_task(started_cluster, task, cmd_options): task.start() zk = started_cluster.get_kazoo_client('zoo1') - print("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1])) + logging.debug("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1])) # Run cluster-copier processes on each node - docker_api = docker.from_env().api + docker_api = started_cluster.docker_client.api copiers_exec_ids = [] cmd = ['/usr/bin/clickhouse', 'copier', @@ -201,9 +201,9 @@ def 
execute_task(started_cluster, task, cmd_options): '--base-dir', '/var/log/clickhouse-server/copier'] cmd += cmd_options - print(cmd) + logging.debug(f"execute_task cmd: {cmd}") - for instance_name, instance in started_cluster.instances.items(): + for instance_name in started_cluster.instances.keys(): instance = started_cluster.instances[instance_name] container = instance.get_docker_handle() instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, "configs_three_nodes/config-copier.xml"), "/etc/clickhouse-server/config-copier.xml") diff --git a/tests/integration/test_cluster_copier/test_two_nodes.py b/tests/integration/test_cluster_copier/test_two_nodes.py index a6b2c82e00f..817c3571833 100644 --- a/tests/integration/test_cluster_copier/test_two_nodes.py +++ b/tests/integration/test_cluster_copier/test_two_nodes.py @@ -430,7 +430,7 @@ def execute_task(started_cluster, task, cmd_options): print("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1])) # Run cluster-copier processes on each node - docker_api = docker.from_env().api + docker_api = started_cluster.docker_client.api copiers_exec_ids = [] cmd = ['/usr/bin/clickhouse', 'copier', @@ -443,7 +443,7 @@ def execute_task(started_cluster, task, cmd_options): print(cmd) - for instance_name, instance in started_cluster.instances.items(): + for instance_name in started_cluster.instances.keys(): instance = started_cluster.instances[instance_name] container = instance.get_docker_handle() instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, "configs_two_nodes/config-copier.xml"), "/etc/clickhouse-server/config-copier.xml") diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index f1e6e9bb5a9..8e375b7b327 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -150,7 +150,7 @@ def test_reload_after_loading(started_cluster): time.sleep(1) # see the comment above replace_in_file_in_container('/etc/clickhouse-server/dictionaries/executable.xml', '82', '83') replace_in_file_in_container('/etc/clickhouse-server/dictionaries/file.txt', '102', '103') - time.sleep(7) + time.sleep(10) assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "103\n" assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "83\n" diff --git a/tests/integration/test_distributed_ddl/configs/config.d/zookeeper_session_timeout.xml b/tests/integration/test_distributed_ddl/configs/config.d/zookeeper_session_timeout.xml index 3b878bbf4de..caa0ff11137 100644 --- a/tests/integration/test_distributed_ddl/configs/config.d/zookeeper_session_timeout.xml +++ b/tests/integration/test_distributed_ddl/configs/config.d/zookeeper_session_timeout.xml @@ -1,6 +1,6 @@ - 10000 + 15000 diff --git a/tests/integration/test_distributed_ddl/configs_secure/config.d/zookeeper_session_timeout.xml b/tests/integration/test_distributed_ddl/configs_secure/config.d/zookeeper_session_timeout.xml index 3b878bbf4de..caa0ff11137 100644 --- a/tests/integration/test_distributed_ddl/configs_secure/config.d/zookeeper_session_timeout.xml +++ b/tests/integration/test_distributed_ddl/configs_secure/config.d/zookeeper_session_timeout.xml @@ -1,6 +1,6 @@ - 10000 + 15000 diff --git a/tests/integration/test_distributed_ddl/test.py b/tests/integration/test_distributed_ddl/test.py index 87e793a7acb..18e091de1ec 100755 --- a/tests/integration/test_distributed_ddl/test.py +++ 
b/tests/integration/test_distributed_ddl/test.py @@ -53,6 +53,7 @@ def test_default_database(test_cluster): def test_create_view(test_cluster): instance = test_cluster.instances['ch3'] + test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS test.super_simple_view ON CLUSTER 'cluster'") test_cluster.ddl_check_query(instance, "CREATE VIEW test.super_simple_view ON CLUSTER 'cluster' AS SELECT * FROM system.numbers FORMAT TSV") test_cluster.ddl_check_query(instance, @@ -76,7 +77,7 @@ def test_on_server_fail(test_cluster): kill_instance.get_docker_handle().stop() request = instance.get_query_request("CREATE TABLE test.test_server_fail ON CLUSTER 'cluster' (i Int8) ENGINE=Null", - timeout=30) + timeout=180) kill_instance.get_docker_handle().start() test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS test.__nope__ ON CLUSTER 'cluster'") @@ -92,27 +93,6 @@ def test_on_server_fail(test_cluster): test_cluster.ddl_check_query(instance, "DROP TABLE test.test_server_fail ON CLUSTER 'cluster'") -def _test_on_connection_losses(test_cluster, zk_timeout): - instance = test_cluster.instances['ch1'] - kill_instance = test_cluster.instances['ch2'] - - with PartitionManager() as pm: - pm.drop_instance_zk_connections(kill_instance) - request = instance.get_query_request("DROP TABLE IF EXISTS test.__nope__ ON CLUSTER 'cluster'", timeout=20) - time.sleep(zk_timeout) - pm.restore_instance_zk_connections(kill_instance) - - test_cluster.check_all_hosts_successfully_executed(request.get_answer()) - - -def test_on_connection_loss(test_cluster): - _test_on_connection_losses(test_cluster, 5) # connection loss will occur only (3 sec ZK timeout in config) - - -def test_on_session_expired(test_cluster): - _test_on_connection_losses(test_cluster, 15) # session should be expired (3 sec ZK timeout in config) - - def test_simple_alters(test_cluster): instance = test_cluster.instances['ch2'] @@ -190,7 +170,7 @@ def test_implicit_macros(test_cluster): instance = test_cluster.instances['ch2'] - test_cluster.ddl_check_query(instance, "DROP DATABASE IF EXISTS test_db ON CLUSTER '{cluster}'") + test_cluster.ddl_check_query(instance, "DROP DATABASE IF EXISTS test_db ON CLUSTER '{cluster}' SYNC") test_cluster.ddl_check_query(instance, "CREATE DATABASE IF NOT EXISTS test_db ON CLUSTER '{cluster}'") test_cluster.ddl_check_query(instance, """ @@ -270,6 +250,15 @@ def test_create_reserved(test_cluster): def test_rename(test_cluster): instance = test_cluster.instances['ch1'] rules = test_cluster.pm_random_drops.pop_rules() + test_cluster.ddl_check_query(instance, + "DROP TABLE IF EXISTS rename_shard ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, + "DROP TABLE IF EXISTS rename_new ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, + "DROP TABLE IF EXISTS rename_old ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, + "DROP TABLE IF EXISTS rename ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, "CREATE TABLE rename_shard ON CLUSTER cluster (id Int64, sid String DEFAULT concat('old', toString(id))) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/staging/test_shard', '{replica}') ORDER BY (id)") test_cluster.ddl_check_query(instance, @@ -326,12 +315,15 @@ def test_socket_timeout(test_cluster): def test_replicated_without_arguments(test_cluster): rules = test_cluster.pm_random_drops.pop_rules() instance = test_cluster.instances['ch1'] + test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS test_atomic.rmt ON CLUSTER cluster SYNC") + 
test_cluster.ddl_check_query(instance, "DROP DATABASE IF EXISTS test_atomic ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, "CREATE DATABASE test_atomic ON CLUSTER cluster ENGINE=Atomic") assert "are supported only for ON CLUSTER queries with Atomic database engine" in \ instance.query_and_get_error("CREATE TABLE test_atomic.rmt (n UInt64, s String) ENGINE=ReplicatedMergeTree ORDER BY n") test_cluster.ddl_check_query(instance, "CREATE TABLE test_atomic.rmt ON CLUSTER cluster (n UInt64, s String) ENGINE=ReplicatedMergeTree() ORDER BY n") - test_cluster.ddl_check_query(instance, "DROP TABLE test_atomic.rmt ON CLUSTER cluster") + test_cluster.ddl_check_query(instance, "DROP TABLE test_atomic.rmt ON CLUSTER cluster SYNC") test_cluster.ddl_check_query(instance, "CREATE TABLE test_atomic.rmt UUID '12345678-0000-4000-8000-000000000001' ON CLUSTER cluster (n UInt64, s String) ENGINE=ReplicatedMergeTree ORDER BY n") assert instance.query("SHOW CREATE test_atomic.rmt FORMAT TSVRaw") == \ @@ -349,7 +341,7 @@ def test_replicated_without_arguments(test_cluster): "CREATE TABLE test_atomic.rsmt ON CLUSTER cluster (n UInt64, m UInt64, k UInt64) ENGINE=ReplicatedSummingMergeTree((m, k)) ORDER BY n") test_cluster.ddl_check_query(instance, "CREATE TABLE test_atomic.rvcmt ON CLUSTER cluster (n UInt64, m Int8, k UInt64) ENGINE=ReplicatedVersionedCollapsingMergeTree(m, k) ORDER BY n") - test_cluster.ddl_check_query(instance, "DROP DATABASE test_atomic ON CLUSTER cluster") + test_cluster.ddl_check_query(instance, "DROP DATABASE test_atomic ON CLUSTER cluster SYNC") test_cluster.ddl_check_query(instance, "CREATE DATABASE test_ordinary ON CLUSTER cluster ENGINE=Ordinary") assert "are supported only for ON CLUSTER queries with Atomic database engine" in \ @@ -359,7 +351,7 @@ def test_replicated_without_arguments(test_cluster): test_cluster.ddl_check_query(instance, "CREATE TABLE test_ordinary.rmt ON CLUSTER cluster (n UInt64, s String) ENGINE=ReplicatedMergeTree('/{shard}/{table}/', '{replica}') ORDER BY n") assert instance.query("SHOW CREATE test_ordinary.rmt FORMAT TSVRaw") == \ "CREATE TABLE test_ordinary.rmt\n(\n `n` UInt64,\n `s` String\n)\nENGINE = ReplicatedMergeTree('/{shard}/rmt/', '{replica}')\nORDER BY n\nSETTINGS index_granularity = 8192\n" - test_cluster.ddl_check_query(instance, "DROP DATABASE test_ordinary ON CLUSTER cluster") + test_cluster.ddl_check_query(instance, "DROP DATABASE test_ordinary ON CLUSTER cluster SYNC") test_cluster.pm_random_drops.push_rules(rules) diff --git a/tests/integration/test_distributed_ddl/test_replicated_alter.py b/tests/integration/test_distributed_ddl/test_replicated_alter.py index 148ad5fca5e..5e7989cb256 100644 --- a/tests/integration/test_distributed_ddl/test_replicated_alter.py +++ b/tests/integration/test_distributed_ddl/test_replicated_alter.py @@ -38,9 +38,9 @@ def test_cluster(request): def test_replicated_alters(test_cluster): instance = test_cluster.instances['ch2'] - test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS merge_for_alter ON CLUSTER cluster") - test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS all_merge_32 ON CLUSTER cluster") - test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS all_merge_64 ON CLUSTER cluster") + test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS merge_for_alter ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS all_merge_32 ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, "DROP TABLE IF EXISTS all_merge_64 ON 
CLUSTER cluster SYNC") # Temporarily disable random ZK packet drops, they might broke creation if ReplicatedMergeTree replicas firewall_drops_rules = test_cluster.pm_random_drops.pop_rules() @@ -90,10 +90,10 @@ ENGINE = Distributed(cluster, default, merge_for_alter, i) assert TSV(instance.query("SELECT i, s FROM all_merge_64 ORDER BY i")) == TSV( ''.join(['{}\t{}\n'.format(x, x) for x in range(4)])) - test_cluster.ddl_check_query(instance, "DROP TABLE merge_for_alter ON CLUSTER cluster") + test_cluster.ddl_check_query(instance, "DROP TABLE merge_for_alter ON CLUSTER cluster SYNC") # Enable random ZK packet drops test_cluster.pm_random_drops.push_rules(firewall_drops_rules) - test_cluster.ddl_check_query(instance, "DROP TABLE all_merge_32 ON CLUSTER cluster") - test_cluster.ddl_check_query(instance, "DROP TABLE all_merge_64 ON CLUSTER cluster") + test_cluster.ddl_check_query(instance, "DROP TABLE all_merge_32 ON CLUSTER cluster SYNC") + test_cluster.ddl_check_query(instance, "DROP TABLE all_merge_64 ON CLUSTER cluster SYNC") diff --git a/tests/integration/test_grant_and_revoke/test.py b/tests/integration/test_grant_and_revoke/test.py index c1be16fe17d..1124f072a06 100644 --- a/tests/integration/test_grant_and_revoke/test.py +++ b/tests/integration/test_grant_and_revoke/test.py @@ -151,7 +151,7 @@ def test_grant_all_on_table(): instance.query("GRANT ALL ON test.table TO A WITH GRANT OPTION") instance.query("GRANT ALL ON test.table TO B", user='A') assert instance.query( - "SHOW GRANTS FOR B") == "GRANT SHOW TABLES, SHOW COLUMNS, SHOW DICTIONARIES, SELECT, INSERT, ALTER, CREATE TABLE, CREATE VIEW, CREATE DICTIONARY, DROP TABLE, DROP VIEW, DROP DICTIONARY, TRUNCATE, OPTIMIZE, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON test.table TO B\n" + "SHOW GRANTS FOR B") == "GRANT SHOW TABLES, SHOW COLUMNS, SHOW DICTIONARIES, SELECT, INSERT, ALTER, CREATE TABLE, CREATE VIEW, CREATE DICTIONARY, DROP TABLE, DROP VIEW, DROP DICTIONARY, TRUNCATE, OPTIMIZE, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, SYSTEM RESTORE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON test.table TO B\n" instance.query("REVOKE ALL ON test.table FROM B", user='A') assert instance.query("SHOW GRANTS FOR B") == "" diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 4b30287ca1e..31df6dff374 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -6,7 +6,7 @@ import threading import os import pytest -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, get_instances_dir # By default the exceptions that was throwed in threads will be ignored @@ -30,7 +30,7 @@ class SafeThread(threading.Thread): SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/node/configs/config.d/storage_conf.xml') +CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/node/configs/config.d/storage_conf.xml'.format(get_instances_dir())) def replace_config(old, new): diff --git a/tests/integration/test_merge_tree_s3_restore/test.py b/tests/integration/test_merge_tree_s3_restore/test.py index b7543388791..809fff6695c 100644 --- a/tests/integration/test_merge_tree_s3_restore/test.py 
+++ b/tests/integration/test_merge_tree_s3_restore/test.py @@ -5,10 +5,10 @@ import string import time import pytest -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, get_instances_dir SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -NOT_RESTORABLE_CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/node_not_restorable/configs/config.d/storage_conf_not_restorable.xml') +NOT_RESTORABLE_CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/node_not_restorable/configs/config.d/storage_conf_not_restorable.xml'.format(get_instances_dir())) COMMON_CONFIGS = ["configs/config.d/bg_processing_pool_conf.xml", "configs/config.d/log_conf.xml", "configs/config.d/clusters.xml"] diff --git a/tests/integration/test_reload_max_table_size_to_drop/test.py b/tests/integration/test_reload_max_table_size_to_drop/test.py index 5f2083d742e..7e7219088b8 100644 --- a/tests/integration/test_reload_max_table_size_to_drop/test.py +++ b/tests/integration/test_reload_max_table_size_to_drop/test.py @@ -2,13 +2,14 @@ import os import time import pytest -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, get_instances_dir cluster = ClickHouseCluster(__file__) node = cluster.add_instance('node', main_configs=["configs/max_table_size_to_drop.xml"]) SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/node/configs/config.d/max_table_size_to_drop.xml') + +CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/node/configs/config.d/max_table_size_to_drop.xml'.format(get_instances_dir())) @pytest.fixture(scope="module") diff --git a/tests/integration/test_restore_replica/__init__.py b/tests/integration/test_restore_replica/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_attach_without_fetching/configs/remote_servers.xml b/tests/integration/test_restore_replica/configs/remote_servers.xml similarity index 79% rename from tests/integration/test_attach_without_fetching/configs/remote_servers.xml rename to tests/integration/test_restore_replica/configs/remote_servers.xml index 7978f921b2e..0709f97551a 100644 --- a/tests/integration/test_attach_without_fetching/configs/remote_servers.xml +++ b/tests/integration/test_restore_replica/configs/remote_servers.xml @@ -4,15 +4,15 @@ true - node_1_1 + replica1 9000 - node_1_2 + replica2 9000 - node_1_3 + replica3 9000 diff --git a/tests/integration/test_restore_replica/test.py b/tests/integration/test_restore_replica/test.py new file mode 100644 index 00000000000..4197c064243 --- /dev/null +++ b/tests/integration/test_restore_replica/test.py @@ -0,0 +1,156 @@ +import time +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseKiller +from helpers.test_tools import assert_eq_with_retry +from helpers.network import PartitionManager + +def fill_nodes(nodes): + for node in nodes: + node.query( + ''' + CREATE TABLE test(n UInt32) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/', '{replica}') + ORDER BY n PARTITION BY n % 10; + '''.format(replica=node.name)) + +cluster = ClickHouseCluster(__file__) +configs =["configs/remote_servers.xml"] + +node_1 = cluster.add_instance('replica1', with_zookeeper=True, main_configs=configs) +node_2 = cluster.add_instance('replica2', with_zookeeper=True, main_configs=configs) +node_3 = cluster.add_instance('replica3', with_zookeeper=True, main_configs=configs) +nodes = [node_1, node_2, node_3] + +def fill_table(): + 
node_1.query("TRUNCATE TABLE test") + + for node in nodes: + node.query("SYSTEM SYNC REPLICA test") + + check_data(0, 0) + + # it will create multiple parts in each partition and probably cause merges + node_1.query("INSERT INTO test SELECT number + 0 FROM numbers(200)") + node_1.query("INSERT INTO test SELECT number + 200 FROM numbers(200)") + node_1.query("INSERT INTO test SELECT number + 400 FROM numbers(200)") + node_1.query("INSERT INTO test SELECT number + 600 FROM numbers(200)") + node_1.query("INSERT INTO test SELECT number + 800 FROM numbers(200)") + check_data(499500, 1000) + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + fill_nodes(nodes) + yield cluster + + except Exception as ex: + print(ex) + + finally: + cluster.shutdown() + +def check_data(_sum: int, count: int) -> None: + res = "{}\t{}\n".format(_sum, count) + assert_eq_with_retry(node_1, "SELECT sum(n), count() FROM test", res) + assert_eq_with_retry(node_2, "SELECT sum(n), count() FROM test", res) + assert_eq_with_retry(node_3, "SELECT sum(n), count() FROM test", res) + +def check_after_restoration(): + check_data(1999000, 2000) + + for node in nodes: + node.query_and_get_error("SYSTEM RESTORE REPLICA test") + +def test_restore_replica_invalid_tables(start_cluster): + print("Checking the invocation on non-existent and non-replicated tables") + node_1.query_and_get_error("SYSTEM RESTORE REPLICA i_dont_exist_42") + node_1.query_and_get_error("SYSTEM RESTORE REPLICA no_db.i_dont_exist_42") + node_1.query_and_get_error("SYSTEM RESTORE REPLICA system.numbers") + +def test_restore_replica_sequential(start_cluster): + zk = cluster.get_kazoo_client('zoo1') + fill_table() + + print("Deleting root ZK path metadata") + zk.delete("/clickhouse/tables/test", recursive=True) + assert zk.exists("/clickhouse/tables/test") is None + + node_1.query("SYSTEM RESTART REPLICA test") + node_1.query_and_get_error("INSERT INTO test SELECT number AS num FROM numbers(1000,2000) WHERE num % 2 = 0") + + print("Restoring replica1") + + node_1.query("SYSTEM RESTORE REPLICA test") + assert zk.exists("/clickhouse/tables/test") + check_data(499500, 1000) + + node_1.query("INSERT INTO test SELECT number + 1000 FROM numbers(1000)") + + print("Restoring other replicas") + + node_2.query("SYSTEM RESTART REPLICA test") + node_2.query("SYSTEM RESTORE REPLICA test") + + node_3.query("SYSTEM RESTART REPLICA test") + node_3.query("SYSTEM RESTORE REPLICA test") + + node_2.query("SYSTEM SYNC REPLICA test") + node_3.query("SYSTEM SYNC REPLICA test") + + check_after_restoration() + +def test_restore_replica_parallel(start_cluster): + zk = cluster.get_kazoo_client('zoo1') + fill_table() + + print("Deleting root ZK path metadata") + zk.delete("/clickhouse/tables/test", recursive=True) + assert zk.exists("/clickhouse/tables/test") is None + + node_1.query("SYSTEM RESTART REPLICA test") + node_1.query_and_get_error("INSERT INTO test SELECT number AS num FROM numbers(1000,2000) WHERE num % 2 = 0") + + print("Restoring replicas in parallel") + + node_2.query("SYSTEM RESTART REPLICA test") + node_3.query("SYSTEM RESTART REPLICA test") + + node_1.query("SYSTEM RESTORE REPLICA test ON CLUSTER test_cluster") + + assert zk.exists("/clickhouse/tables/test") + check_data(499500, 1000) + + node_1.query("INSERT INTO test SELECT number + 1000 FROM numbers(1000)") + + check_after_restoration() + +def test_restore_replica_alive_replicas(start_cluster): + zk = cluster.get_kazoo_client('zoo1') + fill_table() + + print("Deleting replica2 path, trying to 
restore replica1") + zk.delete("/clickhouse/tables/test/replicas/replica2", recursive=True) + assert zk.exists("/clickhouse/tables/test/replicas/replica2") is None + node_1.query_and_get_error("SYSTEM RESTORE REPLICA test") + + print("Deleting replica1 path, trying to restore replica1") + zk.delete("/clickhouse/tables/test/replicas/replica1", recursive=True) + assert zk.exists("/clickhouse/tables/test/replicas/replica1") is None + + node_1.query("SYSTEM RESTART REPLICA test") + node_1.query("SYSTEM RESTORE REPLICA test") + + node_2.query("SYSTEM RESTART REPLICA test") + node_2.query("SYSTEM RESTORE REPLICA test") + + check_data(499500, 1000) + + node_1.query("INSERT INTO test SELECT number + 1000 FROM numbers(1000)") + + node_2.query("SYSTEM SYNC REPLICA test") + node_3.query("SYSTEM SYNC REPLICA test") + + check_after_restoration() diff --git a/tests/integration/test_storage_mysql/test.py b/tests/integration/test_storage_mysql/test.py index 2c993e3d696..a044528cacf 100644 --- a/tests/integration/test_storage_mysql/test.py +++ b/tests/integration/test_storage_mysql/test.py @@ -21,16 +21,27 @@ create_table_sql_template = """ PRIMARY KEY (`id`)) ENGINE=InnoDB; """ -def create_mysql_db(conn, name): - with conn.cursor() as cursor: - cursor.execute( - "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(name)) +drop_table_sql_template = """ + DROP TABLE IF EXISTS `clickhouse`.`{}`; + """ +def get_mysql_conn(started_cluster, host): + conn = pymysql.connect(user='root', password='clickhouse', host=host, port=started_cluster.mysql_port) + return conn def create_mysql_table(conn, tableName): with conn.cursor() as cursor: cursor.execute(create_table_sql_template.format(tableName)) +def drop_mysql_table(conn, tableName): + with conn.cursor() as cursor: + cursor.execute(drop_table_sql_template.format(tableName)) + +def create_mysql_db(conn, name): + with conn.cursor() as cursor: + cursor.execute("DROP DATABASE IF EXISTS {}".format(name)) + cursor.execute("CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(name)) + @pytest.fixture(scope="module") def started_cluster(): @@ -51,7 +62,10 @@ def started_cluster(): def test_many_connections(started_cluster): table_name = 'test_many_connections' + node1.query(f'DROP TABLE IF EXISTS {table_name}') + conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, table_name) create_mysql_table(conn, table_name) node1.query(''' @@ -66,14 +80,18 @@ CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL query += "SELECT id FROM {t})" assert node1.query(query.format(t=table_name)) == '250\n' + drop_mysql_table(conn, table_name) conn.close() def test_insert_select(started_cluster): table_name = 'test_insert_select' + node1.query(f'DROP TABLE IF EXISTS {table_name}') conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, table_name) create_mysql_table(conn, table_name) + node1.query(''' CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL('mysql57:3306', 'clickhouse', '{}', 'root', 'clickhouse'); '''.format(table_name, table_name)) @@ -87,7 +105,9 @@ CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL def test_replace_select(started_cluster): table_name = 'test_replace_select' + node1.query(f'DROP TABLE IF EXISTS {table_name}') conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, table_name) create_mysql_table(conn, table_name) node1.query(''' @@ -106,7 +126,9 @@ CREATE TABLE {}(id UInt32, name String, 
age UInt32, money UInt32) ENGINE = MySQL def test_insert_on_duplicate_select(started_cluster): table_name = 'test_insert_on_duplicate_select' + node1.query(f'DROP TABLE IF EXISTS {table_name}') conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, table_name) create_mysql_table(conn, table_name) node1.query(''' @@ -125,7 +147,10 @@ CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL def test_where(started_cluster): table_name = 'test_where' + node1.query(f'DROP TABLE IF EXISTS {table_name}') + conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, table_name) create_mysql_table(conn, table_name) node1.query(''' CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL('mysql57:3306', 'clickhouse', '{}', 'root', 'clickhouse'); @@ -146,6 +171,7 @@ CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL def test_table_function(started_cluster): conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, 'table_function') create_mysql_table(conn, 'table_function') table_function = "mysql('mysql57:3306', 'clickhouse', '{}', 'root', 'clickhouse')".format('table_function') assert node1.query("SELECT count() FROM {}".format(table_function)).rstrip() == '0' @@ -168,6 +194,8 @@ def test_table_function(started_cluster): def test_binary_type(started_cluster): conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, 'binary_type') + with conn.cursor() as cursor: cursor.execute("CREATE TABLE clickhouse.binary_type (id INT PRIMARY KEY, data BINARY(16) NOT NULL)") table_function = "mysql('mysql57:3306', 'clickhouse', '{}', 'root', 'clickhouse')".format('binary_type') @@ -177,7 +205,10 @@ def test_binary_type(started_cluster): def test_enum_type(started_cluster): table_name = 'test_enum_type' + node1.query(f'DROP TABLE IF EXISTS {table_name}') + conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, table_name) create_mysql_table(conn, table_name) node1.query(''' CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32, source Enum8('IP' = 1, 'URL' = 2)) ENGINE = MySQL('mysql57:3306', 'clickhouse', '{}', 'root', 'clickhouse', 1); @@ -186,20 +217,8 @@ CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32, source Enum8(' assert node1.query("SELECT source FROM {} LIMIT 1".format(table_name)).rstrip() == 'URL' conn.close() -def get_mysql_conn(started_cluster, host): - conn = pymysql.connect(user='root', password='clickhouse', host=host, port=started_cluster.mysql_port) - return conn -def create_mysql_db(conn, name): - with conn.cursor() as cursor: - cursor.execute("DROP DATABASE IF EXISTS {}".format(name)) - cursor.execute("CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(name)) - -def create_mysql_table(conn, tableName): - with conn.cursor() as cursor: - cursor.execute(create_table_sql_template.format(tableName)) - def test_mysql_distributed(started_cluster): table_name = 'test_replicas' @@ -218,6 +237,8 @@ def test_mysql_distributed(started_cluster): create_mysql_table(conn3, table_name) create_mysql_table(conn4, table_name) + node2.query('DROP TABLE IF EXISTS test_replicas') + # Storage with with 3 replicas node2.query(''' CREATE TABLE test_replicas @@ -227,6 +248,7 @@ def test_mysql_distributed(started_cluster): # Fill remote tables with different data to be able to check nodes = [node1, node2, node2, node2] for i in range(1, 5): + nodes[i-1].query('DROP TABLE IF EXISTS 
test_replica{}'.format(i)) nodes[i-1].query(''' CREATE TABLE test_replica{} (id UInt32, name String, age UInt32, money UInt32) @@ -249,6 +271,8 @@ def test_mysql_distributed(started_cluster): assert(result == 'host2\nhost3\nhost4\n') # Storage with with two shards, each has 2 replicas + node2.query('DROP TABLE IF EXISTS test_shards') + node2.query(''' CREATE TABLE test_shards (id UInt32, name String, age UInt32, money UInt32) @@ -275,9 +299,12 @@ def test_mysql_distributed(started_cluster): def test_external_settings(started_cluster): table_name = 'test_external_settings' + node1.query(f'DROP TABLE IF EXISTS {table_name}') conn = get_mysql_conn(started_cluster, started_cluster.mysql_ip) + drop_mysql_table(conn, table_name) create_mysql_table(conn, table_name) + node3.query(f'DROP TABLE IF EXISTS {table_name}') node3.query(''' CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL('mysql57:3306', 'clickhouse', '{}', 'root', 'clickhouse'); '''.format(table_name, table_name)) diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index f81033822c8..716f16c6211 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -308,6 +308,21 @@ def test_postgres_distributed(started_cluster): assert(result == 'host2\nhost4\n' or result == 'host3\nhost4\n') +def test_datetime_with_timezone(started_cluster): + conn = get_postgres_conn(started_cluster, started_cluster.postgres_ip, True) + cursor = conn.cursor() + cursor.execute("CREATE TABLE test_timezone (ts timestamp without time zone, ts_z timestamp with time zone)") + cursor.execute("insert into test_timezone select '2014-04-04 20:00:00', '2014-04-04 20:00:00'::timestamptz at time zone 'America/New_York';") + cursor.execute("select * from test_timezone") + result = cursor.fetchall()[0] + print(result[0], str(result[1])[:-6]) + node1.query("create table test_timezone ( ts DateTime, ts_z DateTime('America/New_York')) ENGINE PostgreSQL('postgres1:5432', 'clickhouse', 'test_timezone', 'postgres', 'mysecretpassword');") + assert(node1.query("select ts from test_timezone").strip() == str(result[0])) + # [:-6] because 2014-04-04 16:00:00+00:00 -> 2014-04-04 16:00:00 + assert(node1.query("select ts_z from test_timezone").strip() == str(result[1])[:-6]) + assert(node1.query("select * from test_timezone") == "2014-04-04 20:00:00\t2014-04-04 16:00:00\n") + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 545ca4256f3..52b021a07c5 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -9,12 +9,13 @@ import time import helpers.client import pytest -from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from helpers.cluster import ClickHouseCluster, ClickHouseInstance, get_instances_dir MINIO_INTERNAL_PORT = 9001 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/dummy/configs/config.d/defaultS3.xml') + +CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/dummy/configs/config.d/defaultS3.xml'.format(get_instances_dir())) # Creates S3 bucket for tests and allows anonymous read-write access to it. 
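One recurring pattern in the MySQL storage tests above is worth calling out: setup is now idempotent, with each test dropping the table on both the ClickHouse and the MySQL side before creating it, so leftovers from an aborted run cannot break the next one. A minimal sketch of the MySQL half (connection parameters taken from get_mysql_conn in this patch; the helper name is illustrative):

import pymysql

def recreate_mysql_table(host, port, table_name, create_sql):
    conn = pymysql.connect(user='root', password='clickhouse', host=host, port=port)
    with conn.cursor() as cursor:
        # Drop first so the test starts from a clean state even after a failed run.
        cursor.execute("DROP TABLE IF EXISTS `clickhouse`.`{}`;".format(table_name))
        cursor.execute(create_sql)
    conn.close()

On the ClickHouse side the same idea is a plain DROP TABLE IF EXISTS issued via node.query() before each CREATE TABLE.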
diff --git a/tests/integration/test_system_flush_logs/test.py b/tests/integration/test_system_flush_logs/test.py index 6dc843e101a..b69105710fb 100644 --- a/tests/integration/test_system_flush_logs/test.py +++ b/tests/integration/test_system_flush_logs/test.py @@ -20,10 +20,6 @@ system_logs = [ ('system.metric_log', 1), ] -# Default timeout for flush is 60 -# decrease timeout for the test to show possible issues. -timeout = pytest.mark.timeout(30) - @pytest.fixture(scope='module', autouse=True) def start_cluster(): @@ -39,7 +35,6 @@ def flush_logs(): node.query('SYSTEM FLUSH LOGS') -@timeout @pytest.mark.parametrize('table,exists', system_logs) def test_system_logs(flush_logs, table, exists): q = 'SELECT * FROM {}'.format(table) @@ -51,7 +46,6 @@ def test_system_logs(flush_logs, table, exists): # Logic is tricky, let's check that there is no hang in case of message queue # is not empty (this is another code path in the code). -@timeout def test_system_logs_non_empty_queue(): node.query('SELECT 1', settings={ # right now defaults are the same, diff --git a/tests/integration/test_zookeeper_config/test.py b/tests/integration/test_zookeeper_config/test.py index 732816ea224..95d9db27a7d 100644 --- a/tests/integration/test_zookeeper_config/test.py +++ b/tests/integration/test_zookeeper_config/test.py @@ -30,6 +30,7 @@ def started_cluster(): def test_chroot_with_same_root(started_cluster): for i, node in enumerate([node1, node2]): + node.query('DROP TABLE IF EXISTS simple SYNC') node.query(''' CREATE TABLE simple (date Date, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/0/simple', '{replica}', date, id, 8192); @@ -44,6 +45,7 @@ def test_chroot_with_same_root(started_cluster): def test_chroot_with_different_root(started_cluster): for i, node in [(1, node1), (3, node3)]: + node.query('DROP TABLE IF EXISTS simple_different SYNC') node.query(''' CREATE TABLE simple_different (date Date, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/0/simple_different', '{replica}', date, id, 8192); diff --git a/tests/integration/test_zookeeper_config/test_password.py b/tests/integration/test_zookeeper_config/test_password.py index c0ed4375978..09c15cfd0cf 100644 --- a/tests/integration/test_zookeeper_config/test_password.py +++ b/tests/integration/test_zookeeper_config/test_password.py @@ -22,6 +22,8 @@ def started_cluster(): cluster.shutdown() def test_identity(started_cluster): + node1.query('DROP TABLE IF EXISTS simple SYNC') + node1.query(''' CREATE TABLE simple (date Date, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/0/simple', '{replica}', date, id, 8192); diff --git a/tests/performance/parse_engine_file.xml b/tests/performance/parse_engine_file.xml index 2740b680b67..2b67c19a4f6 100644 --- a/tests/performance/parse_engine_file.xml +++ b/tests/performance/parse_engine_file.xml @@ -22,6 +22,9 @@ Native Avro MsgPack + ORC + Parquet + Arrow diff --git a/tests/performance/select_format.xml b/tests/performance/select_format.xml index 985ec0f2b52..982039102d0 100644 --- a/tests/performance/select_format.xml +++ b/tests/performance/select_format.xml @@ -36,6 +36,8 @@ Avro MsgPack ORC + Parquet + Arrow diff --git a/tests/queries/0_stateless/00900_long_parquet_load.reference b/tests/queries/0_stateless/00900_long_parquet_load.reference index 4bd699f40fe..621bca2ec0e 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load.reference @@ -298,24 +298,40 @@ Code: 33. 
DB::ParsingEx---tion: Error while reading Parquet data: IOError: Not y [[['a','b'],['c','d']],[[],['e']]] 1 [[['a','b'],['c','d'],['e']],[[],['f']]] 1 === Try load data from nested_maps.snappy.parquet -Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin - +{'a':{1:1,2:0}} 1 1 +{'b':{1:1}} 1 1 +{'c':{}} 1 1 +{'d':{}} 1 1 +{'e':{1:1}} 1 1 +{'f':{3:1,4:0,5:1}} 1 1 === Try load data from non_hadoop_lz4_compressed.parquet 1593604800 abc 42 1593604800 def 7.7 1593604801 abc 42.125 1593604801 def 7.7 === Try load data from nonnullable.impala.parquet -../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) +8 [-1] [[-1,-2],[]] {'k1':-1} [{},{'k1':1},{},{}] (-1,[-1],([[(-1)]]),{}) === Try load data from nullable.impala.parquet -../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) +1 [1,2,3] [[1,2],[3,4]] {'k1':1,'k2':100} [{'k1':1}] (1,[1],([[(10),(-10)],[(11)]]),{'foo':(([1.1]))}) +2 [NULL,1,2,NULL,3,NULL] [[NULL,1,2,NULL],[3,NULL,4],[],[]] {'k1':2,'k2':NULL} [{'k3':NULL,'k1':1},{},{}] (NULL,[NULL],([[(NULL),(10),(NULL),(-10),(NULL)],[(11),(NULL)],[],[]]),{'g1':(([2.2,NULL])),'g2':(([])),'g3':(([])),'g4':(([])),'g5':(([]))}) +3 [] [[]] {} [{},{}] (NULL,[],([]),{}) +4 [] [] {} [] (NULL,[],([]),{}) +5 [] [] {} [] (NULL,[],([]),{'foo':(([2.2,3.3]))}) +6 [] [] {} [] (NULL,[],([]),{}) +7 [] [[],[5,6]] {'k1':NULL,'k3':NULL} [] (7,[2,3,NULL],([[],[(NULL)],[]]),{}) === Try load data from nullable_list.parquet [1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42] [NULL] [NULL] [NULL] [] [] [] === Try load data from nulls.snappy.parquet -Code: 70. 
DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin - +(NULL) +(NULL) +(NULL) +(NULL) +(NULL) +(NULL) +(NULL) +(NULL) === Try load data from single_nan.parquet \N === Try load data from userdata1.parquet diff --git a/tests/queries/0_stateless/00900_long_parquet_load.sh b/tests/queries/0_stateless/00900_long_parquet_load.sh index 52213f066e1..1cfba22587a 100755 --- a/tests/queries/0_stateless/00900_long_parquet_load.sh +++ b/tests/queries/0_stateless/00900_long_parquet_load.sh @@ -55,7 +55,10 @@ for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_A COLUMNS=$(cat "$COLUMNS_FILE") || continue ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load" - ${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory" + $CLICKHOUSE_CLIENT --multiquery <&1 | sed 's/Exception/Ex---tion/' diff --git a/tests/queries/0_stateless/00900_orc_arrow_parquet_maps.reference b/tests/queries/0_stateless/00900_orc_arrow_parquet_maps.reference new file mode 100644 index 00000000000..d96eb672258 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_arrow_parquet_maps.reference @@ -0,0 +1,6 @@ +Arrow +{1:2,2:3} {'1':'a','2':'b'} {1:(1,2),2:(3,4)} {1:[1,2],2:[3,4]} [{1:2,2:3},{3:4,4:5}] ({1:2,2:3},{'a':'b','c':'d'}) [{1:[({1:2},(1)),({2:3},(2))]},{2:[({3:4},(3)),({4:5},(4))]}] +Parquet +{1:2,2:3} {'1':'a','2':'b'} {1:(1,2),2:(3,4)} {1:[1,2],2:[3,4]} [{1:2,2:3},{3:4,4:5}] ({1:2,2:3},{'a':'b','c':'d'}) [{1:[({1:2},(1)),({2:3},(2))]},{2:[({3:4},(3)),({4:5},(4))]}] +ORC +{1:2,2:3} {'1':'a','2':'b'} {1:(1,2),2:(3,4)} {1:[1,2],2:[3,4]} [{1:2,2:3},{3:4,4:5}] ({1:2,2:3},{'a':'b','c':'d'}) [{1:[({1:2},(1)),({2:3},(2))]},{2:[({3:4},(3)),({4:5},(4))]}] diff --git a/tests/queries/0_stateless/00900_orc_arrow_parquet_maps.sh b/tests/queries/0_stateless/00900_orc_arrow_parquet_maps.sh new file mode 100755 index 00000000000..9330a5924a9 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_arrow_parquet_maps.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS maps" +${CLICKHOUSE_CLIENT} --multiquery < "${CLICKHOUSE_TMP}"/maps + ${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE maps" + cat "${CLICKHOUSE_TMP}"/maps | ${CLICKHOUSE_CLIENT} -q "INSERT INTO maps FORMAT Parquet" + ${CLICKHOUSE_CLIENT} --query="SELECT * FROM maps" +done + +${CLICKHOUSE_CLIENT} --query="DROP TABLE maps" diff --git a/tests/queries/0_stateless/00900_orc_arrow_parquet_tuples.reference b/tests/queries/0_stateless/00900_orc_arrow_parquet_tuples.reference new file mode 100644 index 00000000000..12556ca2c7b --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_arrow_parquet_tuples.reference @@ -0,0 +1,6 @@ +Arrow +(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))] +Parquet +(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))] +ORC +(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))] diff --git a/tests/queries/0_stateless/00900_orc_arrow_parquet_tuples.sh b/tests/queries/0_stateless/00900_orc_arrow_parquet_tuples.sh new file mode 100755 index 00000000000..ff16d3205ad --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_arrow_parquet_tuples.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS tuples"; +${CLICKHOUSE_CLIENT} --query="CREATE TABLE tuples (t1 Tuple(UInt32, UInt32), t2 Tuple(String, String), t3 Tuple(Tuple(UInt32, String), UInt32), t4 Tuple(Tuple(UInt32, UInt32), Tuple(String, String)), t5 Tuple(Array(UInt32), UInt32), t6 Tuple(Tuple(Array(UInt32), Array(UInt32)), Tuple(Array(Array(UInt32)), UInt32)), t7 Array(Tuple(Array(Array(UInt32)), Tuple(Array(Tuple(UInt32, UInt32)), UInt32)))) ENGINE=Memory()" + +${CLICKHOUSE_CLIENT} --query="INSERT INTO tuples VALUES ((1, 2), ('1', '2'), ((1, '1'), 1), ((1, 2), ('1', '2')), ([1,2,3], 1), (([1,2,3], [1,2,3]), ([[1,2,3], [1,2,3]], 1)), [([[1,2,3], [1,2,3]], ([(1, 2), (1, 2)], 1))])" + +formats="Arrow Parquet ORC"; + +for format in ${formats}; do + echo $format + + ${CLICKHOUSE_CLIENT} --query="SELECT * FROM tuples FORMAT $format" > "${CLICKHOUSE_TMP}"/tuples + ${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE tuples" + cat "${CLICKHOUSE_TMP}"/tuples | ${CLICKHOUSE_CLIENT} -q "INSERT INTO tuples FORMAT $format" + ${CLICKHOUSE_CLIENT} --query="SELECT * FROM tuples" +done + +${CLICKHOUSE_CLIENT} --query="DROP TABLE tuples" diff --git a/tests/queries/0_stateless/01176_mysql_client_interactive.expect b/tests/queries/0_stateless/01176_mysql_client_interactive.expect index d592bbe1ce2..b2dc88a7795 100755 --- a/tests/queries/0_stateless/01176_mysql_client_interactive.expect +++ b/tests/queries/0_stateless/01176_mysql_client_interactive.expect @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 # A default timeout action is to do nothing, change it to fail expect_after { diff --git a/tests/queries/0_stateless/01179_insert_values_semicolon.expect b/tests/queries/0_stateless/01179_insert_values_semicolon.expect index 3814c71a062..0e65e5c4cbf 100755 --- a/tests/queries/0_stateless/01179_insert_values_semicolon.expect +++ b/tests/queries/0_stateless/01179_insert_values_semicolon.expect @@ 
-1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 # A default timeout action is to do nothing, change it to fail expect_after { diff --git a/tests/queries/0_stateless/01180_client_syntax_errors.expect b/tests/queries/0_stateless/01180_client_syntax_errors.expect index 267e8edba10..c20982b2991 100755 --- a/tests/queries/0_stateless/01180_client_syntax_errors.expect +++ b/tests/queries/0_stateless/01180_client_syntax_errors.expect @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 # A default timeout action is to do nothing, change it to fail expect_after { diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 0ab0d57ebcf..343d8ceeca3 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -103,6 +103,7 @@ SYSTEM REPLICATION QUEUES ['SYSTEM STOP REPLICATION QUEUES','SYSTEM START REPLIC SYSTEM DROP REPLICA ['DROP REPLICA'] TABLE SYSTEM SYSTEM SYNC REPLICA ['SYNC REPLICA'] TABLE SYSTEM SYSTEM RESTART REPLICA ['RESTART REPLICA'] TABLE SYSTEM +SYSTEM RESTORE REPLICA ['RESTORE REPLICA'] TABLE SYSTEM SYSTEM FLUSH DISTRIBUTED ['FLUSH DISTRIBUTED'] TABLE SYSTEM FLUSH SYSTEM FLUSH LOGS ['FLUSH LOGS'] GLOBAL SYSTEM FLUSH SYSTEM FLUSH [] \N SYSTEM diff --git a/tests/queries/0_stateless/01273_arrow_arrays_load.sh b/tests/queries/0_stateless/01273_arrow_arrays_load.sh index b8d1a85921d..bce653376a5 100755 --- a/tests/queries/0_stateless/01273_arrow_arrays_load.sh +++ b/tests/queries/0_stateless/01273_arrow_arrays_load.sh @@ -4,7 +4,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_arrays" +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_arrays" ${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_arrays (arr1 Array(Int8), arr2 Array(UInt8), arr3 Array(Int16), arr4 Array(UInt16), arr5 Array(Int32), arr6 Array(UInt32), arr7 Array(Int64), arr8 Array(UInt64), arr9 Array(String), arr10 Array(FixedString(4)), arr11 Array(Float32), arr12 Array(Float64), arr13 Array(Date), arr14 Array(Datetime)) ENGINE=Memory()" ${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_arrays VALUES ([1,-2,3],[1,2,3],[100,-200,300],[100,200,300],[10000000,-20000000,30000000],[10000000,2000000,3000000],[100000000000000,-200000000000,3000000000000],[100000000000000,20000000000000,3000000000000],['Some string','Some string','Some string'],['0000','1111','2222'],[42.42,424.2,0.4242],[424242.424242,4242042420.242424,42],['2000-01-01','2001-01-01','2002-01-01'],['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']),([],[],[],[],[],[],[],[],[],[],[],[],[],[])" diff --git a/tests/queries/0_stateless/01273_arrow_dictionaries_load.reference b/tests/queries/0_stateless/01273_arrow_dictionaries_load.reference new file mode 100644 index 00000000000..7321c396a59 --- /dev/null +++ b/tests/queries/0_stateless/01273_arrow_dictionaries_load.reference @@ -0,0 +1,2 @@ +1 ['a','b','c'] ('z','6') +2 ['d','e'] ('x','9') diff --git a/tests/queries/0_stateless/01273_arrow_dictionaries_load.sh b/tests/queries/0_stateless/01273_arrow_dictionaries_load.sh new file mode 100755 index 00000000000..38e6c2c1b01 --- /dev/null +++ b/tests/queries/0_stateless/01273_arrow_dictionaries_load.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_dicts" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_dicts (a LowCardinality(String), b Array(LowCardinality(String)), c Tuple(LowCardinality(String), LowCardinality(String))) ENGINE=Memory()" +${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_dicts VALUES ('1', ['a', 'b', 'c'], ('z', '6')), ('2', ['d', 'e'], ('x', '9'))" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_dicts FORMAT Arrow SETTINGS output_format_arrow_low_cardinality_as_dictionary=1" > "${CLICKHOUSE_TMP}"/dicts.arrow + +cat "${CLICKHOUSE_TMP}"/dicts.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_dicts FORMAT Arrow" + +${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_dicts" +${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_dicts" + +${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_dicts (a LowCardinality(String)) ENGINE=Memory()" +${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_dicts SELECT toString(number % 500) from numbers(10000000)" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_dicts FORMAT Arrow SETTINGS output_format_arrow_low_cardinality_as_dictionary=1" > "${CLICKHOUSE_TMP}"/dicts.arrow + +cat "${CLICKHOUSE_TMP}"/dicts.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_dicts FORMAT Arrow" + +${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_dicts" + diff --git a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline_long.expect b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline_long.expect index 34d62307938..85eb97fb6f2 100755 --- a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline_long.expect +++ b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline_long.expect @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 # A default timeout action is to do nothing, change it to fail diff --git a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect index a6d52b39918..e0d01d905bb 100755 --- a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect +++ b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 1 +set timeout 60 match_max 100000 # A default timeout action is to do nothing, change it to fail expect_after { diff --git a/tests/queries/0_stateless/01520_client_print_query_id.expect b/tests/queries/0_stateless/01520_client_print_query_id.expect index c1d8b90ed39..b0ff5d9d165 100755 --- a/tests/queries/0_stateless/01520_client_print_query_id.expect +++ b/tests/queries/0_stateless/01520_client_print_query_id.expect @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 # A default timeout action is to do nothing, change it to fail expect_after { diff --git a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect index a977265d531..712fe4ff64a 100755 --- a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect +++ b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect @@ -4,7 +4,7 @@ # https://github.com/ClickHouse/ClickHouse/issues/19353 log_user 0 -set timeout 5 +set timeout 60 match_max 100000 # A default timeout action is to fail diff --git a/tests/queries/0_stateless/01599_multiline_input_and_singleline_comments.sh 
b/tests/queries/0_stateless/01599_multiline_input_and_singleline_comments.sh index 9dc02a34592..248e8a06fb2 100755 --- a/tests/queries/0_stateless/01599_multiline_input_and_singleline_comments.sh +++ b/tests/queries/0_stateless/01599_multiline_input_and_singleline_comments.sh @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 if ![info exists env(CLICKHOUSE_PORT_TCP)] {set env(CLICKHOUSE_PORT_TCP) 9000} diff --git a/tests/queries/0_stateless/01610_client_spawn_editor.sh b/tests/queries/0_stateless/01610_client_spawn_editor.sh index 723fa761896..b372d82847c 100755 --- a/tests/queries/0_stateless/01610_client_spawn_editor.sh +++ b/tests/queries/0_stateless/01610_client_spawn_editor.sh @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 if ![info exists env(CLICKHOUSE_PORT_TCP)] {set env(CLICKHOUSE_PORT_TCP) 9000} diff --git a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh index 8e13f2dcaee..c8cfe0084cd 100755 --- a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh +++ b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh @@ -92,13 +92,4 @@ for w in "${compwords_positive[@]}"; do test_completion_word "$w" || echo "[FAIL] $w (positive)" done -# One negative is enough -compwords_negative=( - # system.clusters - test_shard_localhost_no_such_cluster -) -for w in "${compwords_negative[@]}"; do - test_completion_word "$w" && echo "[FAIL] $w (negative)" -done - exit 0 diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect index 3e6f4fd9715..5543af4dd05 100755 --- a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect +++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect @@ -1,7 +1,7 @@ #!/usr/bin/expect -f log_user 0 -set timeout 5 +set timeout 60 match_max 100000 # A default timeout action is to do nothing, change it to fail expect_after { diff --git a/tests/queries/0_stateless/01814_distributed_push_down_limit.reference b/tests/queries/0_stateless/01814_distributed_push_down_limit.reference new file mode 100644 index 00000000000..f879f2cbd21 --- /dev/null +++ b/tests/queries/0_stateless/01814_distributed_push_down_limit.reference @@ -0,0 +1,37 @@ +distributed_push_down_limit=0 +100 100 +distributed_push_down_limit=1 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +40 40 +auto-distributed_push_down_limit +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +40 40 +distributed_push_down_limit=1 with OFFSET +97 +96 +96 +95 +95 +94 +94 +93 +93 +92 diff --git a/tests/queries/0_stateless/01814_distributed_push_down_limit.sh b/tests/queries/0_stateless/01814_distributed_push_down_limit.sh new file mode 100755 index 00000000000..93321646037 --- /dev/null +++ b/tests/queries/0_stateless/01814_distributed_push_down_limit.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2206 + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# -- NOTE: this test cannot use 'current_database = $CLICKHOUSE_DATABASE', +# -- because it is not propagated via remote queries, +# -- hence it uses query_id/initial_query_id.
+ +function setup() +{ + $CLICKHOUSE_CLIENT -nm -q " + drop table if exists data_01814; + drop table if exists dist_01814; + + create table data_01814 (key Int) Engine=MergeTree() order by key settings index_granularity=10 as select * from numbers(100); + create table dist_01814 as data_01814 engine=Distributed('test_cluster_two_shards', $CLICKHOUSE_DATABASE, data_01814, key); + " +} + +function cleanup() +{ + $CLICKHOUSE_CLIENT -nm -q " + drop table data_01814; + drop table dist_01814; + " +} + +function make_query_id() +{ + echo "$(tr -cd '[:lower:]' < /dev/urandom | head -c10)-$CLICKHOUSE_DATABASE" +} + +function test_distributed_push_down_limit_with_query_log() +{ + local table=$1 && shift + local offset=$1 && shift + local query_id + + query_id="$(make_query_id)" + + # NOTES: + # - max_rows_to_read_leaf cannot be used since it does not know anything + # about optimize_aggregation_in_order, + # - limit push down can be checked only with optimize_aggregation_in_order, + # since otherwise the query will be canceled too early, and read_rows will be + # small. + local settings_and_opts=( + --query_id "$query_id" + + --max_block_size 20 + --optimize_aggregation_in_order 1 + --log_queries 1 + --log_queries_min_type 'QUERY_FINISH' + + # disable hedged requests to avoid excessive log entries + --use_hedged_requests 0 + + "$@" + ) + + $CLICKHOUSE_CLIENT "${settings_and_opts[@]}" -q "select * from $table group by key limit $offset, 10" + + $CLICKHOUSE_CLIENT -nm -q " + system flush logs; + select read_rows from system.query_log + where + event_date = today() + and query_kind = 'Select' /* exclude DESC TABLE */ + and initial_query_id = '$query_id' and initial_query_id != query_id; + " | xargs # convert new lines to spaces +} + +function test_distributed_push_down_limit_0() +{ + local args=( + "remote('127.{2,3}', $CLICKHOUSE_DATABASE, data_01814)" + 0 # offset + --distributed_push_down_limit 0 + ) + test_distributed_push_down_limit_with_query_log "${args[@]}" "$@" +} + +function test_distributed_push_down_limit_1() +{ + local args=( + "remote('127.{2,3}', $CLICKHOUSE_DATABASE, data_01814)" + 0 # offset + --distributed_push_down_limit 1 + ) + test_distributed_push_down_limit_with_query_log "${args[@]}" +} + +function test_distributed_push_down_limit_1_offset() +{ + local settings_and_opts=( + --distributed_push_down_limit 1 + ) + + $CLICKHOUSE_CLIENT "${settings_and_opts[@]}" -q "select * from remote('127.{2,3}', $CLICKHOUSE_DATABASE, data_01814) group by key order by key desc limit 5, 10" +} + +function test_auto_distributed_push_down_limit() +{ + local args=( + dist_01814 + 0 # offset + --optimize_skip_unused_shards 1 + --optimize_distributed_group_by_sharding_key 1 + --prefer_localhost_replica 0 + --distributed_push_down_limit 0 + ) + test_distributed_push_down_limit_with_query_log "${args[@]}" +} + +function main() +{ + setup + trap cleanup EXIT + + echo 'distributed_push_down_limit=0' + test_distributed_push_down_limit_0 --format Null + + # + # The following tests (tests with distributed_push_down_limit=1) requires + # retries, since the query may be canceled earlier due to LIMIT, and so + # only one shard will be processed, and it will get not 40 but 20 rows: + # + # 1.160920 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} executeQuery: (from [::ffff:127.0.0.1]:42778, initial_query_id: 66cf643c-b1b4-4f7e-942a-c4c3493029f6, using production parser) (comment: /usr/share/clickhouse-test/queries/0_stateless/01814_distributed_push_down_limit.sql) WITH CAST('test_31uut9', 'String') AS 
id_distributed_push_down_limit_1 SELECT key FROM test_31uut9.data_01814 GROUP BY key LIMIT 10 + # 1.214964 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} ContextAccess (default): Access granted: SELECT(key) ON test_31uut9.data_01814 + # 1.216790 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} test_31uut9.data_01814 (b484ad2e-0591-4faf-8110-1dcbd7cdd0db) (SelectExecutor): Key condition: unknown + # 1.227245 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} test_31uut9.data_01814 (b484ad2e-0591-4faf-8110-1dcbd7cdd0db) (SelectExecutor): Selected 1/1 parts by partition key, 1 parts by primary key, 10/11 marks by primary key, 10 marks to read from 1 ranges + # 1.228452 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} MergeTreeSelectProcessor: Reading 3 ranges from part all_1_1_0, approx. 100 rows starting from 0 + # 1.229104 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} InterpreterSelectQuery: FetchColumns -> WithMergeableStateAfterAggregationAndLimit + # 1.339085 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} TCPHandler: Query was cancelled. + # 1.416573 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} executeQuery: Read 20 rows, 80.00 B in 0.254374666 sec., 78 rows/sec., 314.50 B/sec. + # 1.419006 [ 291 ] {7ac5de70-c26c-4e3b-bdee-3873ad1b84f1} MemoryTracker: Peak memory usage (for query): 0.00 B. + # + + local out out_lines max_tries=20 + + echo 'distributed_push_down_limit=1' + for ((i = 0; i < max_tries; ++i)); do + out=$(test_distributed_push_down_limit_1) + out_lines=( $out ) + if [[ ${#out_lines[@]} -gt 2 ]] && [[ ${out_lines[-1]} = 40 ]] && [[ ${out_lines[-2]} = 40 ]]; then + break + fi + done + echo "$out" + + echo 'auto-distributed_push_down_limit' + for ((i = 0; i < max_tries; ++i)); do + out=$(test_auto_distributed_push_down_limit) + out_lines=( $out ) + if [[ ${#out_lines[@]} -gt 2 ]] && [[ ${out_lines[-1]} = 40 ]] && [[ ${out_lines[-2]} = 40 ]]; then + break + fi + done + echo "$out" + + echo 'distributed_push_down_limit=1 with OFFSET' + test_distributed_push_down_limit_1_offset +} +main "$@" diff --git a/tests/queries/0_stateless/01815_with_mergeable_state_after_aggregation_and_limit.reference b/tests/queries/0_stateless/01815_with_mergeable_state_after_aggregation_and_limit.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/01815_with_mergeable_state_after_aggregation_and_limit.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/01815_with_mergeable_state_after_aggregation_and_limit.sh b/tests/queries/0_stateless/01815_with_mergeable_state_after_aggregation_and_limit.sh new file mode 100755 index 00000000000..0efacc4ac31 --- /dev/null +++ b/tests/queries/0_stateless/01815_with_mergeable_state_after_aggregation_and_limit.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# with_mergeable_state_after_aggregation will not stop after 1 row, while with_mergeable_state_after_aggregation_and_limit should +$CLICKHOUSE_CLIENT -q 'select * from system.numbers limit 1' --stage with_mergeable_state_after_aggregation_and_limit diff --git a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect new file mode 100755 index 00000000000..55211dca979 --- /dev/null +++ b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect @@ -0,0 +1,34 @@ +#!/usr/bin/expect -f + +log_user 0 +set timeout 60 +match_max 100000 +# A default timeout action is to do nothing, change it to fail +expect_after { + timeout { + exit 1 + } +} +set basedir [file dirname $argv0] + +# history file is not required, in-memory history is enough +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --history_file=$basedir/01910_client_replxx_container_overflow_long.history.log" +expect ":) " + +# Make a query. +send -- "SELECT 1\r" +expect "1" +expect ":) " + +# Do reverse-search. +send -- "" +expect "(reverse-i-search)" +send -- "1" +expect "(reverse-i-search)" +# This will trigger the container-overflow under ASAN before the fix. +send -- "" +expect "(reverse-i-search)" + +# Exit. +send -- "\4" +expect eof diff --git a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.reference b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01917_distinct_on.sql b/tests/queries/0_stateless/01917_distinct_on.sql index 0940d8566bd..87dfc3ec626 100644 --- a/tests/queries/0_stateless/01917_distinct_on.sql +++ b/tests/queries/0_stateless/01917_distinct_on.sql @@ -5,5 +5,7 @@ INSERT INTO t1 VALUES (1, 1, 1), (1, 1, 2), (2, 2, 2), (1, 2, 2); SELECT DISTINCT ON (a, b) a, b, c FROM t1; +SELECT DISTINCT ON (a, b) a, b, c FROM t1 LIMIT BY 1; -- { serverError 588 } + DROP TABLE IF EXISTS t1; diff --git a/tests/queries/0_stateless/01920_not_chain_format.reference b/tests/queries/0_stateless/01920_not_chain_format.reference new file mode 100644 index 00000000000..22abfd17dc7 --- /dev/null +++ b/tests/queries/0_stateless/01920_not_chain_format.reference @@ -0,0 +1,5 @@ +-- { echo } +EXPLAIN SYNTAX SELECT NOT NOT (NOT (NOT (NULL))); +SELECT NOT (NOT (NOT NOT NULL)) +EXPLAIN SYNTAX SELECT NOT (NOT (NOT NOT NULL)); +SELECT NOT (NOT (NOT NOT NULL)) diff --git a/tests/queries/0_stateless/01920_not_chain_format.sql b/tests/queries/0_stateless/01920_not_chain_format.sql new file mode 100644 index 00000000000..79a5050432d --- /dev/null +++ b/tests/queries/0_stateless/01920_not_chain_format.sql @@ -0,0 +1,3 @@ +-- { echo } +EXPLAIN SYNTAX SELECT NOT NOT (NOT (NOT (NULL))); +EXPLAIN SYNTAX SELECT NOT (NOT (NOT NOT NULL)); diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index 05ae06fc069..f146913a2e8 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -234,6 +234,7 @@ 01801_distinct_group_by_shard 01804_dictionary_decimal256_type 01801_s3_distributed +01814_distributed_push_down_limit 01833_test_collation_alvarotuso 01850_dist_INSERT_preserve_error 01870_modulo_partition_key @@ -245,3 +246,4 @@ 01901_test_attach_partition_from 01910_view_dictionary 
01824_prefer_global_in_and_join +01576_alias_column_rewrite diff --git a/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns index d9e51028f22..c6bb5057cc2 100644 --- a/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns @@ -1 +1 @@ -`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Nullable(Int32) +`a` Nullable(String), `b` Array(Nullable(Int32)), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32)) diff --git a/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns index d5e9599431b..eef66ae66c7 100644 --- a/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns @@ -1 +1 @@ -`a` Tuple(Nullable(String), Nullable(Int32), Nullable(UInt8)), `b` Nullable(Int32), `c` Nullable(Float64) +`a` Map(String, Map(Int32, Nullable(UInt8))), `b` Nullable(Int32), `c` Nullable(Float64) diff --git a/tests/queries/0_stateless/data_parquet/nonnullable.impala.parquet.columns b/tests/queries/0_stateless/data_parquet/nonnullable.impala.parquet.columns index 6d724200aec..299ec3b6af2 100644 --- a/tests/queries/0_stateless/data_parquet/nonnullable.impala.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nonnullable.impala.parquet.columns @@ -1 +1 @@ -`ID` Nullable(Int64), `Int_Array` Nullable(Int32), `int_array_array` Nullable(Int32), `Int_Map` Tuple(Nullable(String), Nullable(Int32)), `int_map_array` Tuple(Nullable(String), Nullable(Int32)), `nested_Struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64)) +`ID` Nullable(Int64), `Int_Array` Array(Nullable(Int32)), `int_array_array` Array(Array(Nullable(Int32))), `Int_Map` Map(String, Nullable(Int32)), `int_map_array` Array(Map(String, Nullable(Int32))), `nested_Struct` Tuple(Nullable(Int32), Array(Nullable(Int32)), Tuple(Array(Array(Tuple(Nullable(Int32))))), Map(String, Tuple(Tuple(Array(Nullable(Float64)))))) diff --git a/tests/queries/0_stateless/data_parquet/nullable.impala.parquet.columns b/tests/queries/0_stateless/data_parquet/nullable.impala.parquet.columns index b5e122585d7..6fcbcdd4a0b 100644 --- a/tests/queries/0_stateless/data_parquet/nullable.impala.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nullable.impala.parquet.columns @@ -1 +1 @@ -`id` Nullable(Int64), `int_array` Nullable(Int32), `int_array_Array` Nullable(Int32), `int_map` Tuple(Nullable(String), Nullable(Int32)), `int_Map_Array` Tuple(Nullable(String), Nullable(Int32)), `nested_struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64)) +`id` Nullable(Int64), `int_array` Array(Nullable(Int32)), `int_array_Array` Array(Array(Nullable(Int32))), `int_map` Map(String, Nullable(Int32)), `int_Map_Array` Array(Map(String, Nullable(Int32))), `nested_struct` Tuple(Nullable(Int32), Array(Nullable(Int32)), Tuple(Array(Array(Tuple(Nullable(Int32))))), Map(String, Tuple(Tuple(Array(Nullable(Float64)))))) diff --git a/tests/queries/0_stateless/data_parquet/nulls.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/nulls.snappy.parquet.columns index a99b8b80eac..6e723ef72c3 100644 --- 
a/tests/queries/0_stateless/data_parquet/nulls.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nulls.snappy.parquet.columns @@ -1 +1 @@ -`b_struct` Nullable(Int32) +`b_struct` Tuple(Nullable(Int32)) diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 45d3dbf56d6..e38089230f4 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -110,7 +110,9 @@ "00609_mv_index_in_in", "00510_materizlized_view_and_deduplication_zookeeper", "00738_lock_for_inner_table", - "01153_attach_mv_uuid" + "01153_attach_mv_uuid", + /// Sometimes cannot lock file most likely due to concurrent or adjacent tests, but we don't care how it works in Ordinary database. + "rocksdb" ], "database-replicated": [ /// Unclassified diff --git a/tests/testflows/extended_precision_data_types/regression.py b/tests/testflows/extended_precision_data_types/regression.py index a0c3186c961..8fea6f68e5c 100755 --- a/tests/testflows/extended_precision_data_types/regression.py +++ b/tests/testflows/extended_precision_data_types/regression.py @@ -22,7 +22,7 @@ xflags = { @XFlags(xflags) @Name("extended precision data types") @Specifications( - QA_SRS020_ClickHouse_Extended_Precision_Data_Types + SRS020_ClickHouse_Extended_Precision_Data_Types ) @Requirements( RQ_SRS_020_ClickHouse_Extended_Precision("1.0"), diff --git a/tests/testflows/extended_precision_data_types/requirements/requirements.md b/tests/testflows/extended_precision_data_types/requirements/requirements.md index 9bbd59a14d5..232eb4d7aba 100644 --- a/tests/testflows/extended_precision_data_types/requirements/requirements.md +++ b/tests/testflows/extended_precision_data_types/requirements/requirements.md @@ -1,4 +1,4 @@ -# QA-SRS020 ClickHouse Extended Precision Data Types +# SRS020 ClickHouse Extended Precision Data Types # Software Requirements Specification ## Table of Contents diff --git a/tests/testflows/extended_precision_data_types/requirements/requirements.py b/tests/testflows/extended_precision_data_types/requirements/requirements.py index 3fcf7798651..6069b98a551 100644 --- a/tests/testflows/extended_precision_data_types/requirements/requirements.py +++ b/tests/testflows/extended_precision_data_types/requirements/requirements.py @@ -754,8 +754,8 @@ RQ_SRS_020_ClickHouse_Extended_Precision_Create_Table = Requirement( level=3, num='4.13.1') -QA_SRS020_ClickHouse_Extended_Precision_Data_Types = Specification( - name='QA-SRS020 ClickHouse Extended Precision Data Types', +SRS020_ClickHouse_Extended_Precision_Data_Types = Specification( + name='SRS020 ClickHouse Extended Precision Data Types', description=None, author=None, date=None, @@ -855,7 +855,7 @@ QA_SRS020_ClickHouse_Extended_Precision_Data_Types = Specification( RQ_SRS_020_ClickHouse_Extended_Precision_Create_Table, ), content=''' -# QA-SRS020 ClickHouse Extended Precision Data Types +# SRS020 ClickHouse Extended Precision Data Types # Software Requirements Specification ## Table of Contents diff --git a/tests/testflows/ldap/authentication/tests/common.py b/tests/testflows/ldap/authentication/tests/common.py index 0f36879ef62..ec6a66c0257 100644 --- a/tests/testflows/ldap/authentication/tests/common.py +++ b/tests/testflows/ldap/authentication/tests/common.py @@ -92,7 +92,7 @@ def add_config(config, timeout=300, restart=False, modify=False): """Check that preprocessed config is updated. 
""" started = time.time() - command = f"cat /var/lib/clickhouse/preprocessed_configs/_{config.preprocessed_name} | grep {config.uid}{' > /dev/null' if not settings.debug else ''}" + command = f"cat /var/lib/clickhouse/preprocessed_configs/{config.preprocessed_name} | grep {config.uid}{' > /dev/null' if not settings.debug else ''}" while time.time() - started < timeout: exitcode = node.command(command, steps=False).exitcode @@ -105,7 +105,7 @@ def add_config(config, timeout=300, restart=False, modify=False): time.sleep(1) if settings.debug: - node.command(f"cat /var/lib/clickhouse/preprocessed_configs/_{config.preprocessed_name}") + node.command(f"cat /var/lib/clickhouse/preprocessed_configs/{config.preprocessed_name}") if after_removal: assert exitcode == 1, error() diff --git a/tests/testflows/ldap/external_user_directory/tests/common.py b/tests/testflows/ldap/external_user_directory/tests/common.py index f23356bd061..f6d1654efd6 100644 --- a/tests/testflows/ldap/external_user_directory/tests/common.py +++ b/tests/testflows/ldap/external_user_directory/tests/common.py @@ -138,7 +138,7 @@ def invalid_ldap_external_user_directory_config(server, roles, message, tail=30, with Then(f"{config.preprocessed_name} should be updated", description=f"timeout {timeout}"): started = time.time() - command = f"cat /var/lib/clickhouse/preprocessed_configs/_{config.preprocessed_name} | grep {config.uid}{' > /dev/null' if not settings.debug else ''}" + command = f"cat /var/lib/clickhouse/preprocessed_configs/{config.preprocessed_name} | grep {config.uid}{' > /dev/null' if not settings.debug else ''}" while time.time() - started < timeout: exitcode = node.command(command, steps=False).exitcode if exitcode == 0: diff --git a/tests/testflows/rbac/regression.py b/tests/testflows/rbac/regression.py index 549ccdf80d8..145865b2fa9 100755 --- a/tests/testflows/rbac/regression.py +++ b/tests/testflows/rbac/regression.py @@ -30,6 +30,7 @@ issue_18110 = "https://github.com/ClickHouse/ClickHouse/issues/18110" issue_18206 = "https://github.com/ClickHouse/ClickHouse/issues/18206" issue_21083 = "https://github.com/ClickHouse/ClickHouse/issues/21083" issue_21084 = "https://github.com/ClickHouse/ClickHouse/issues/21084" +issue_25413 = "https://github.com/ClickHouse/ClickHouse/issues/25413" xfails = { "syntax/show create quota/I show create quota current": @@ -144,6 +145,12 @@ xfails = { [(Fail, "new bug")], "privileges/show dictionaries/:/check privilege/check privilege=DROP DICTIONARY/show dict/SHOW DICTIONARIES with privilege": [(Fail, "new bug")], + "privileges/kill mutation/:/:/KILL ALTER : without privilege": + [(Fail, issue_25413)], + "privileges/kill mutation/:/:/KILL ALTER : with revoked privilege": + [(Fail, issue_25413)], + "privileges/kill mutation/:/:/KILL ALTER : with revoked ALL privilege": + [(Fail, issue_25413)] } xflags = { diff --git a/tests/testflows/rbac/tests/privileges/alter/alter_column.py b/tests/testflows/rbac/tests/privileges/alter/alter_column.py index 91b3ab1629a..4e8bfb0b53d 100755 --- a/tests/testflows/rbac/tests/privileges/alter/alter_column.py +++ b/tests/testflows/rbac/tests/privileges/alter/alter_column.py @@ -202,7 +202,7 @@ def check_modify_column_when_privilege_is_granted(table, user, node, column=None column = 'modify' with Given(f"I add the column {column}"): - node.query(f"ALTER TABLE {table} ADD COLUMN {column} String") + node.query(f"ALTER TABLE {table} ADD COLUMN {column} String DEFAULT '0'") with When(f"I insert some data into column {column}"): node.query(f"INSERT INTO {table} 
({column}) VALUES ('3.4')") diff --git a/tests/testflows/regression.py b/tests/testflows/regression.py index 5ad529c1b5a..661bc03509b 100755 --- a/tests/testflows/regression.py +++ b/tests/testflows/regression.py @@ -22,15 +22,15 @@ def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): tasks = [] with Pool(8) as pool: try: - run_scenario(pool, tasks, Feature(test=load("example.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("ldap.regression", "regression")), args) + #run_scenario(pool, tasks, Feature(test=load("example.regression", "regression")), args) + #run_scenario(pool, tasks, Feature(test=load("ldap.regression", "regression")), args) #run_scenario(pool, tasks, Feature(test=load("rbac.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("aes_encryption.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("map_type.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("window_functions.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("datetime64_extended_range.regression", "regression")), args) + #run_scenario(pool, tasks, Feature(test=load("aes_encryption.regression", "regression")), args) + #run_scenario(pool, tasks, Feature(test=load("map_type.regression", "regression")), args) + #run_scenario(pool, tasks, Feature(test=load("window_functions.regression", "regression")), args) + #run_scenario(pool, tasks, Feature(test=load("datetime64_extended_range.regression", "regression")), args) #run_scenario(pool, tasks, Feature(test=load("kerberos.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("extended_precision_data_types.regression", "regression")), args) + #run_scenario(pool, tasks, Feature(test=load("extended_precision_data_types.regression", "regression")), args) finally: join(tasks) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index c15925c1030..f7111cc28e4 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v21.6.5.37-stable 2021-06-19 v21.6.4.26-stable 2021-06-11 v21.6.3.14-stable 2021-06-04 v21.5.6.6-stable 2021-05-29
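
For illustration, here is a minimal stand-alone sketch of the system.query_log correlation technique that the new 01814_distributed_push_down_limit.sh test relies on: run the query under a known query_id, then read the per-shard sub-queries back as the rows whose initial_query_id matches but whose query_id differs. This is a distillation under assumptions, not part of the patch itself: it presumes a locally running server and the stock clickhouse-client, and the table name demo_push_down is hypothetical (the test itself uses data_01814 behind remote('127.{2,3}', ...)).

#!/usr/bin/env bash
# Small MergeTree table; fine-grained index_granularity so read_rows can stay well below 100.
clickhouse-client -q "CREATE TABLE IF NOT EXISTS demo_push_down (key Int) ENGINE = MergeTree ORDER BY key SETTINGS index_granularity = 10 AS SELECT * FROM numbers(100)"

# Run the query under a known query_id; every remote sub-query inherits it as initial_query_id.
query_id="demo-push-down-$RANDOM"
clickhouse-client --query_id "$query_id" \
    --log_queries 1 --log_queries_min_type QUERY_FINISH \
    --distributed_push_down_limit 1 --optimize_aggregation_in_order 1 --max_block_size 20 \
    -q "SELECT key FROM remote('127.{2,3}', currentDatabase(), demo_push_down) GROUP BY key LIMIT 10 FORMAT Null"

# Flush the logs, then inspect only the sub-queries (query_id differs, initial_query_id matches).
clickhouse-client -q "SYSTEM FLUSH LOGS"
clickhouse-client -q "
    SELECT read_rows
    FROM system.query_log
    WHERE event_date = today()
      AND type = 'QueryFinish'
      AND query_kind = 'Select'
      AND initial_query_id = '$query_id'
      AND query_id != initial_query_id"

With distributed_push_down_limit = 1 each sub-query should report read_rows far below the 100 rows in the table (the reference file above expects 40 per shard under these settings), while with the setting at 0 both shards read all 100 rows.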