diff --git a/.github/ISSUE_TEMPLATE/10_question.md b/.github/ISSUE_TEMPLATE/10_question.md deleted file mode 100644 index 08a05a844e0..00000000000 --- a/.github/ISSUE_TEMPLATE/10_question.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: Question -about: Ask a question about ClickHouse -title: '' -labels: question -assignees: '' - ---- - -> Make sure to check documentation https://clickhouse.com/docs/en/ first. If the question is concise and probably has a short answer, asking it in [community Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-1gh9ds7f4-PgDhJAaF8ad5RbWBAAjzFg) is probably the fastest way to find the answer. For more complicated questions, consider asking them on StackOverflow with "clickhouse" tag https://stackoverflow.com/questions/tagged/clickhouse - -> If you still prefer GitHub issues, remove all this text and ask your question here. - -**Company or project name** - -Put your company name or project description here - -**Question** - -Your question diff --git a/.github/ISSUE_TEMPLATE/10_question.yaml b/.github/ISSUE_TEMPLATE/10_question.yaml new file mode 100644 index 00000000000..39d4c27807a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/10_question.yaml @@ -0,0 +1,20 @@ +name: Question +description: Ask a question about ClickHouse +labels: ["question"] +body: + - type: markdown + attributes: + value: | + > Make sure to check documentation https://clickhouse.com/docs/en/ first. If the question is concise and probably has a short answer, asking it in [community Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-1gh9ds7f4-PgDhJAaF8ad5RbWBAAjzFg) is probably the fastest way to find the answer. For more complicated questions, consider asking them on StackOverflow with "clickhouse" tag https://stackoverflow.com/questions/tagged/clickhouse + - type: textarea + attributes: + label: Company or project name + description: Put your company name or project description here. + validations: + required: false + - type: textarea + attributes: + label: Question + description: Please put your question here. 
+ validations: + required: true diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 4d2c2e6f466..f94621ba092 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -285,7 +285,7 @@ stop_logs_replication # Try to get logs while server is running failed_to_save_logs=0 -for table in query_log zookeeper_log trace_log transactions_info_log metric_log +for table in query_log zookeeper_log trace_log transactions_info_log metric_log blob_storage_log do err=$(clickhouse-client -q "select * from system.$table into outfile '/test_output/$table.tsv.gz' format TSVWithNamesAndTypes") echo "$err" @@ -339,7 +339,7 @@ if [ $failed_to_save_logs -ne 0 ]; then # directly # - even though ci auto-compress some files (but not *.tsv) it does this only # for files >64MB, we want this files to be compressed explicitly - for table in query_log zookeeper_log trace_log transactions_info_log metric_log + for table in query_log zookeeper_log trace_log transactions_info_log metric_log blob_storage_log do clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index 1f2cc9903b2..a4c4c75e5b3 100644 --- a/docker/test/upgrade/run.sh +++ b/docker/test/upgrade/run.sh @@ -25,7 +25,8 @@ azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log & ./setup_minio.sh stateless # to have a proper environment echo "Get previous release tag" -previous_release_tag=$(dpkg --info package_folder/clickhouse-client*.deb | grep "Version: " | awk '{print $2}' | cut -f1 -d'+' | get_previous_release_tag) +# shellcheck disable=SC2016 +previous_release_tag=$(dpkg-deb --showformat='${Version}' --show package_folder/clickhouse-client*.deb | get_previous_release_tag) echo $previous_release_tag echo "Clone previous release repository" diff --git a/docs/en/engines/table-engines/integrations/mongodb.md b/docs/en/engines/table-engines/integrations/mongodb.md index f87e8da8b5b..5bb3bc752f5 100644 --- a/docs/en/engines/table-engines/integrations/mongodb.md +++ b/docs/en/engines/table-engines/integrations/mongodb.md @@ -34,10 +34,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name - `options` — MongoDB connection string options (optional parameter). :::tip -If you are using the MongoDB Atlas cloud offering please add these options: +If you are using the MongoDB Atlas cloud offering: ``` -'connectTimeoutMS=10000&ssl=true&authSource=admin' +- connection url can be obtained from 'Atlas SQL' option +- use options: 'connectTimeoutMS=10000&ssl=true&authSource=admin' ``` ::: diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 689c05a24af..803b753fe0a 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -39,8 +39,8 @@ If you need to update rows frequently, we recommend using the [`ReplacingMergeTr ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr1] [COMMENT ...] [CODEC(codec1)] [STATISTIC(stat1)] [TTL expr1] [PRIMARY KEY] [SETTINGS (name = value, ...)], - name2 [type2] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr2] [COMMENT ...] 
[CODEC(codec2)] [STATISTIC(stat2)] [TTL expr2] [PRIMARY KEY] [SETTINGS (name = value, ...)], + name1 [type1] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr1] [COMMENT ...] [CODEC(codec1)] [STATISTICS(stat1)] [TTL expr1] [PRIMARY KEY] [SETTINGS (name = value, ...)], + name2 [type2] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr2] [COMMENT ...] [CODEC(codec2)] [STATISTICS(stat2)] [TTL expr2] [PRIMARY KEY] [SETTINGS (name = value, ...)], ... INDEX index_name1 expr1 TYPE type1(...) [GRANULARITY value1], INDEX index_name2 expr2 TYPE type2(...) [GRANULARITY value2], @@ -1043,12 +1043,12 @@ ClickHouse versions 22.3 through 22.7 use a different cache configuration, see [ ## Column Statistics (Experimental) {#column-statistics} -The statistic declaration is in the columns section of the `CREATE` query for tables from the `*MergeTree*` Family when we enable `set allow_experimental_statistic = 1`. +The statistics declaration is in the columns section of the `CREATE` query for tables from the `*MergeTree*` family when we enable `set allow_experimental_statistics = 1`. ``` sql CREATE TABLE tab ( - a Int64 STATISTIC(tdigest), + a Int64 STATISTICS(TDigest, Uniq), b Float64 ) ENGINE = MergeTree @@ -1058,19 +1058,23 @@ ORDER BY a We can also manipulate statistics with `ALTER` statements. ```sql -ALTER TABLE tab ADD STATISTIC b TYPE tdigest; -ALTER TABLE tab DROP STATISTIC a TYPE tdigest; +ALTER TABLE tab ADD STATISTICS b TYPE TDigest, Uniq; +ALTER TABLE tab DROP STATISTICS a; ``` -These lightweight statistics aggregate information about distribution of values in columns. -They can be used for query optimization when we enable `set allow_statistic_optimize = 1`. +These lightweight statistics aggregate information about the distribution of values in columns. Statistics are stored in every part and updated on every insert. +They can be used for prewhere optimization only if we enable `set allow_statistics_optimize = 1`. #### Available Types of Column Statistics {#available-types-of-column-statistics} -- `tdigest` +- `TDigest` Stores distribution of values from numeric columns in [TDigest](https://github.com/tdunning/t-digest) sketch. +- `Uniq` + + Estimates the number of distinct values of a column using HyperLogLog. + ## Column-level Settings {#column-level-settings} Certain MergeTree settings can be overridden at the column level: diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index a5fe74fd0c6..ee3ec5126a2 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1206,6 +1206,16 @@ Expired time for HSTS in seconds. The default value is 0 means clickhouse disabl 600000 ``` +## mlock_executable {#mlock_executable} + +Perform mlockall after startup to lower the latency of first queries and to prevent the clickhouse executable from being paged out under high IO load. Enabling this option is recommended but will lead to increased startup time for up to a few seconds. +Keep in mind that this parameter does not work without the "CAP_IPC_LOCK" capability. + +**Example** + +``` xml +false +``` + ## include_from {#include_from} The path to the file with substitutions. Both XML and YAML formats are supported. @@ -1353,6 +1363,26 @@ Examples: 127.0.0.1 ``` +## listen_try {#listen_try} + +The server will not exit if IPv6 or IPv4 networks are unavailable while trying to listen.
+ +Examples: + +``` xml +0 +``` + +## listen_reuse_port {#listen_reuse_port} + +Allow multiple servers to listen on the same address:port. Requests will be routed to a random server by the operating system. Enabling this setting is not recommended. + +Examples: + +``` xml +0 +``` + +## listen_backlog {#listen_backlog} Backlog (queue size of pending connections) of the listen socket. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 0b905df21d4..ffaf53085c4 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3170,6 +3170,18 @@ Possible values: Default value: `0`. +## lightweight_deletes_sync {#lightweight_deletes_sync} + +The same as 'mutations_sync', but controls only the execution of lightweight deletes. + +Possible values: + +- 0 - Mutations execute asynchronously. +- 1 - The query waits for the lightweight deletes to complete on the current server. +- 2 - The query waits for the lightweight deletes to complete on all replicas (if they exist). + +Default value: `2`. + **See Also** - [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) @@ -5108,7 +5120,7 @@ a Tuple( ) ``` -## allow_experimental_statistic {#allow_experimental_statistic} +## allow_experimental_statistics {#allow_experimental_statistics} Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulating statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics). diff --git a/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md b/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md index 06443994dd9..4f73aadb8da 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md +++ b/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md @@ -24,6 +24,8 @@ Alias: `lttb`. - `x` — x coordinate. [Integer](../../../sql-reference/data-types/int-uint.md) , [Float](../../../sql-reference/data-types/float.md) , [Decimal](../../../sql-reference/data-types/decimal.md) , [Date](../../../sql-reference/data-types/date.md), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md). - `y` — y coordinate. [Integer](../../../sql-reference/data-types/int-uint.md) , [Float](../../../sql-reference/data-types/float.md) , [Decimal](../../../sql-reference/data-types/decimal.md) , [Date](../../../sql-reference/data-types/date.md), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md). +NaNs are ignored in the provided series, meaning that any NaN values will be excluded from the analysis. This ensures that the function operates only on valid numerical data. + **Parameters** - `n` — number of points in the resulting series. [UInt64](../../../sql-reference/data-types/int-uint.md).
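As a quick aside on the NaN note added above, a minimal sketch of the behaviour (the inline `values` data is an illustrative assumption, not part of this diff): a point whose `y` is NaN simply drops out of the downsampled series.

```sql
-- Illustration of the NaN handling described above: the point (2, nan)
-- is excluded before the buckets are formed, so only the four valid
-- points take part in the downsampling.
SELECT largestTriangleThreeBuckets(3)(x, y)
FROM values('x Float64, y Float64', (1, 10), (2, nan), (3, 15), (4, 40), (5, 70));
```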
@@ -61,7 +63,7 @@ Result: ``` text ┌────────largestTriangleThreeBuckets(4)(x, y)───────────┐ -│ [(1,10),(3,15),(5,40),(10,70)] │ +│ [(1,10),(3,15),(9,55),(10,70)] │ └───────────────────────────────────────────────────────┘ ``` diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 5dd1d5ceebe..2ec51d43c59 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -2423,11 +2423,7 @@ Result: ## toUnixTimestamp64Milli -## toUnixTimestamp64Micro - -## toUnixTimestamp64Nano - -Converts a `DateTime64` to a `Int64` value with fixed sub-second precision. Input value is scaled up or down appropriately depending on it precision. +Converts a `DateTime64` to an `Int64` value with fixed millisecond precision. The input value is scaled up or down appropriately depending on its precision. :::note The output value is a timestamp in UTC, not in the timezone of `DateTime64`. ::: **Syntax** ```sql toUnixTimestamp64Milli(value) -toUnixTimestamp64Micro(value) -toUnixTimestamp64Nano(value) ``` **Arguments** -- `value` — DateTime64 value with any precision. +- `value` — DateTime64 value with any precision. [DateTime64](../data-types/datetime64.md). **Returned value** -- `value` converted to the `Int64` data type. +- `value` converted to the `Int64` data type. [Int64](../data-types/int-uint.md). -**Examples** +**Example** Query: ```sql -WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 +WITH toDateTime64('2009-02-13 23:31:31.011', 3, 'UTC') AS dt64 SELECT toUnixTimestamp64Milli(dt64); ``` @@ -2462,14 +2456,77 @@ Result: ```response ┌─toUnixTimestamp64Milli(dt64)─┐ -│ 1568650812345 │ +│ 1234567891011 │ └──────────────────────────────┘ ``` +## toUnixTimestamp64Micro + +Converts a `DateTime64` to an `Int64` value with fixed microsecond precision. The input value is scaled up or down appropriately depending on its precision. + +:::note +The output value is a timestamp in UTC, not in the timezone of `DateTime64`. +::: + +**Syntax** + +```sql +toUnixTimestamp64Micro(value) +``` + +**Arguments** + +- `value` — DateTime64 value with any precision. [DateTime64](../data-types/datetime64.md). + +**Returned value** + +- `value` converted to the `Int64` data type. [Int64](../data-types/int-uint.md). + +**Example** + Query: -``` sql -WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 +```sql +WITH toDateTime64('1970-01-15 06:56:07.891011', 6, 'UTC') AS dt64 +SELECT toUnixTimestamp64Micro(dt64); +``` + +Result: + +```response +┌─toUnixTimestamp64Micro(dt64)─┐ +│ 1234567891011 │ +└──────────────────────────────┘ +``` + +## toUnixTimestamp64Nano + +Converts a `DateTime64` to an `Int64` value with fixed nanosecond precision. The input value is scaled up or down appropriately depending on its precision. + +:::note +The output value is a timestamp in UTC, not in the timezone of `DateTime64`. +::: + +**Syntax** + +```sql +toUnixTimestamp64Nano(value) +``` + +**Arguments** + +- `value` — DateTime64 value with any precision. [DateTime64](../data-types/datetime64.md). + +**Returned value** + +- `value` converted to the `Int64` data type. [Int64](../data-types/int-uint.md).
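Since all three functions share the same scaling rule, a side-by-side sketch may help (the sample value reuses the millisecond example above; the microsecond and nanosecond outputs are inferred from that rule):

```sql
-- One DateTime64(3) value through all three functions: the underlying
-- Int64 (1234567891011) is scaled up by 10^3 and 10^6 respectively.
WITH toDateTime64('2009-02-13 23:31:31.011', 3, 'UTC') AS dt64
SELECT
    toUnixTimestamp64Milli(dt64), -- 1234567891011
    toUnixTimestamp64Micro(dt64), -- 1234567891011000
    toUnixTimestamp64Nano(dt64);  -- 1234567891011000000
```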
+ +**Example** + +Query: + +```sql +WITH toDateTime64('1970-01-01 00:20:34.567891011', 9, 'UTC') AS dt64 SELECT toUnixTimestamp64Nano(dt64); ``` @@ -2477,34 +2534,32 @@ Result: ```response ┌─toUnixTimestamp64Nano(dt64)─┐ -│ 1568650812345678000 │ +│ 1234567891011 │ └─────────────────────────────┘ ``` ## fromUnixTimestamp64Milli -## fromUnixTimestamp64Micro +Converts an `Int64` to a `DateTime64` value with fixed millisecond precision and optional timezone. The input value is scaled up or down appropriately depending on its precision. -## fromUnixTimestamp64Nano - -Converts an `Int64` to a `DateTime64` value with fixed sub-second precision and optional timezone. Input value is scaled up or down appropriately depending on it’s precision. Please note that input value is treated as UTC timestamp, not timestamp at given (or implicit) timezone. +:::note +Please note that the input value is treated as a UTC timestamp, not as a timestamp at the given (or implicit) timezone. +::: **Syntax** ``` sql fromUnixTimestamp64Milli(value[, timezone]) -fromUnixTimestamp64Micro(value[, timezone]) -fromUnixTimestamp64Nano(value[, timezone]) ``` **Arguments** -- `value` — `Int64` value with any precision. -- `timezone` — `String` (optional) timezone name of the result. +- `value` — value with any precision. [Int64](../data-types/int-uint.md). +- `timezone` — (optional) timezone name of the result. [String](../data-types/string.md). **Returned value** -- `value` converted to the `DateTime64` data type. +- `value` converted to DateTime64 with precision `3`. [DateTime64](../data-types/datetime64.md). **Example** Query: ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC'); +SELECT + fromUnixTimestamp64Milli(i64, 'UTC') AS x, + toTypeName(x); ``` Result: ```response -┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ -│ 2009-02-13 23:31:31.011 │ -└──────────────────────────────────────┘ +┌───────────────────────x─┬─toTypeName(x)────────┐ +│ 2009-02-13 23:31:31.011 │ DateTime64(3, 'UTC') │ +└─────────────────────────┴──────────────────────┘ +``` + +## fromUnixTimestamp64Micro + +Converts an `Int64` to a `DateTime64` value with fixed microsecond precision and optional timezone. The input value is scaled up or down appropriately depending on its precision. + +:::note +Please note that the input value is treated as a UTC timestamp, not as a timestamp at the given (or implicit) timezone. +::: + +**Syntax** + +``` sql +fromUnixTimestamp64Micro(value[, timezone]) +``` + +**Arguments** + +- `value` — value with any precision. [Int64](../data-types/int-uint.md). +- `timezone` — (optional) timezone name of the result. [String](../data-types/string.md). + +**Returned value** + +- `value` converted to DateTime64 with precision `6`. [DateTime64](../data-types/datetime64.md). + +**Example** + +Query: + +``` sql +WITH CAST(1234567891011, 'Int64') AS i64 +SELECT + fromUnixTimestamp64Micro(i64, 'UTC') AS x, + toTypeName(x); +``` + +Result: + +```response +┌──────────────────────────x─┬─toTypeName(x)────────┐ +│ 1970-01-15 06:56:07.891011 │ DateTime64(6, 'UTC') │ +└────────────────────────────┴──────────────────────┘ +``` + +## fromUnixTimestamp64Nano + +Converts an `Int64` to a `DateTime64` value with fixed nanosecond precision and optional timezone. The input value is scaled up or down appropriately depending on its precision. + +:::note +Please note that the input value is treated as a UTC timestamp, not as a timestamp at the given (or implicit) timezone.
+::: + +**Syntax** + +``` sql +fromUnixTimestamp64Nano(value[, timezone]) +``` + +**Arguments** + +- `value` — value with any precision. [Int64](../data-types/int-uint.md). +- `timezone` — (optional) timezone name of the result. [String](../data-types/string.md). + +**Returned value** + +- `value` converted to DateTime64 with precision `9`. [DateTime64](../data-types/datetime64.md). + +**Example** + +Query: + +``` sql +WITH CAST(1234567891011, 'Int64') AS i64 +SELECT + fromUnixTimestamp64Nano(i64, 'UTC') AS x, + toTypeName(x); +``` + +Result: + +```response +┌─────────────────────────────x─┬─toTypeName(x)────────┐ +│ 1970-01-01 00:20:34.567891011 │ DateTime64(9, 'UTC') │ +└───────────────────────────────┴──────────────────────┘ ``` ## formatRow diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md index 3cfb99cff83..f81d4f02e0c 100644 --- a/docs/en/sql-reference/statements/alter/index.md +++ b/docs/en/sql-reference/statements/alter/index.md @@ -16,7 +16,7 @@ Most `ALTER TABLE` queries modify table settings or data: - [INDEX](/docs/en/sql-reference/statements/alter/skipping-index.md) - [CONSTRAINT](/docs/en/sql-reference/statements/alter/constraint.md) - [TTL](/docs/en/sql-reference/statements/alter/ttl.md) -- [STATISTIC](/docs/en/sql-reference/statements/alter/statistic.md) +- [STATISTICS](/docs/en/sql-reference/statements/alter/statistics.md) - [APPLY DELETED MASK](/docs/en/sql-reference/statements/alter/apply-deleted-mask.md) :::note diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index ce5cecf6fd6..0ed1e523669 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -139,7 +139,7 @@ For the query to run successfully, the following conditions must be met: ALTER TABLE table2 [ON CLUSTER cluster] REPLACE PARTITION partition_expr FROM table1 ``` -This query copies the data partition from the `table1` to `table2` and replaces existing partition in the `table2`. +This query copies the data partition from `table1` to `table2` and replaces the existing partition in `table2`. The operation is atomic. Note that: diff --git a/docs/en/sql-reference/statements/alter/statistic.md b/docs/en/sql-reference/statements/alter/statistic.md deleted file mode 100644 index 1c2e45b23fd..00000000000 --- a/docs/en/sql-reference/statements/alter/statistic.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -slug: /en/sql-reference/statements/alter/statistic -sidebar_position: 45 -sidebar_label: STATISTIC ---- - -# Manipulating Column Statistics - -The following operations are available: - -- `ALTER TABLE [db].table ADD STATISTIC (columns list) TYPE type` - Adds statistic description to tables metadata. - -- `ALTER TABLE [db].table DROP STATISTIC (columns list) TYPE type` - Removes statistic description from tables metadata and deletes statistic files from disk. - -- `ALTER TABLE [db].table CLEAR STATISTIC (columns list) TYPE type` - Deletes statistic files from disk. - -- `ALTER TABLE [db.]table MATERIALIZE STATISTIC (columns list) TYPE type` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). - -The first two commands are lightweight in a sense that they only change metadata or remove files. - -Also, they are replicated, syncing statistics metadata via ZooKeeper. 
- -:::note -Statistic manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants). -::: diff --git a/docs/en/sql-reference/statements/alter/statistics.md b/docs/en/sql-reference/statements/alter/statistics.md new file mode 100644 index 00000000000..80024781f88 --- /dev/null +++ b/docs/en/sql-reference/statements/alter/statistics.md @@ -0,0 +1,33 @@ +--- +slug: /en/sql-reference/statements/alter/statistics +sidebar_position: 45 +sidebar_label: STATISTICS +--- + +# Manipulating Column Statistics + +The following operations are available: + +- `ALTER TABLE [db].table ADD STATISTICS (columns list) TYPE (type list)` - Adds a statistics description to the table's metadata. + +- `ALTER TABLE [db].table MODIFY STATISTICS (columns list) TYPE (type list)` - Modifies the statistics description in the table's metadata. + +- `ALTER TABLE [db].table DROP STATISTICS (columns list)` - Removes statistics from the metadata of the specified columns and deletes all statistics objects in all parts for the specified columns. + +- `ALTER TABLE [db].table CLEAR STATISTICS (columns list)` - Deletes all statistics objects in all parts for the specified columns. Statistics objects can be rebuilt using `ALTER TABLE MATERIALIZE STATISTICS`. + +- `ALTER TABLE [db.]table MATERIALIZE STATISTICS (columns list)` - Rebuilds the statistics for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). + +The first two commands are lightweight in the sense that they only change metadata or remove files. + +Also, they are replicated, syncing statistics metadata via ZooKeeper. + +Here is an example that adds two statistics types to two columns: + +``` sql +ALTER TABLE t1 MODIFY STATISTICS c, d TYPE TDigest, Uniq; +``` + +:::note +Statistics manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants).
+::: diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 7f0eff2184b..e9f24a8c685 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -51,10 +51,11 @@ enum class AccessType : uint8_t M(ALTER_CLEAR_INDEX, "CLEAR INDEX", TABLE, ALTER_INDEX) \ M(ALTER_INDEX, "INDEX", GROUP, ALTER_TABLE) /* allows to execute ALTER ORDER BY or ALTER {ADD|DROP...} INDEX */\ \ - M(ALTER_ADD_STATISTIC, "ALTER ADD STATISTIC", TABLE, ALTER_STATISTIC) \ - M(ALTER_DROP_STATISTIC, "ALTER DROP STATISTIC", TABLE, ALTER_STATISTIC) \ - M(ALTER_MATERIALIZE_STATISTIC, "ALTER MATERIALIZE STATISTIC", TABLE, ALTER_STATISTIC) \ - M(ALTER_STATISTIC, "STATISTIC", GROUP, ALTER_TABLE) /* allows to execute ALTER STATISTIC */\ + M(ALTER_ADD_STATISTICS, "ALTER ADD STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_DROP_STATISTICS, "ALTER DROP STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_MODIFY_STATISTICS, "ALTER MODIFY STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_MATERIALIZE_STATISTICS, "ALTER MATERIALIZE STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_STATISTICS, "STATISTIC", GROUP, ALTER_TABLE) /* allows to execute ALTER STATISTIC */\ \ M(ALTER_ADD_PROJECTION, "ADD PROJECTION", TABLE, ALTER_PROJECTION) \ M(ALTER_DROP_PROJECTION, "DROP PROJECTION", TABLE, ALTER_PROJECTION) \ diff --git a/src/AggregateFunctions/QuantileTDigest.h b/src/AggregateFunctions/QuantileTDigest.h index d5a4f6b576a..4207ea587b1 100644 --- a/src/AggregateFunctions/QuantileTDigest.h +++ b/src/AggregateFunctions/QuantileTDigest.h @@ -334,6 +334,18 @@ public: compress(); // Allows reading/writing TDigests with different epsilon/max_centroids params } + Float64 getCountEqual(Float64 value) const + { + Float64 result = 0; + for (const auto & c : centroids) + { + /// std::cerr << "c "<< c.mean << " "<< c.count << std::endl; + if (value == c.mean) + result += c.count; + } + return result; + } + Float64 getCountLessThan(Float64 value) const { bool first = true; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index ea6f9510927..29993ed33e4 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -586,7 +586,7 @@ M(705, TABLE_NOT_EMPTY) \ M(706, LIBSSH_ERROR) \ M(707, GCP_ERROR) \ - M(708, ILLEGAL_STATISTIC) \ + M(708, ILLEGAL_STATISTICS) \ M(709, CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT) \ M(710, FAULT_INJECTED) \ M(711, FILECACHE_ACCESS_DENIED) \ diff --git a/src/Common/MemoryTrackerSwitcher.h b/src/Common/MemoryTrackerSwitcher.h index 3c99fd12353..796b5295a83 100644 --- a/src/Common/MemoryTrackerSwitcher.h +++ b/src/Common/MemoryTrackerSwitcher.h @@ -15,6 +15,7 @@ struct MemoryTrackerSwitcher return; auto * thread_tracker = CurrentThread::getMemoryTracker(); + prev_untracked_memory = current_thread->untracked_memory; prev_memory_tracker_parent = thread_tracker->getParent(); @@ -31,8 +32,10 @@ struct MemoryTrackerSwitcher CurrentThread::flushUntrackedMemory(); auto * thread_tracker = CurrentThread::getMemoryTracker(); - current_thread->untracked_memory = prev_untracked_memory; + /// It is important to set untracked memory after the call of + /// 'setParent' because it may flush untracked memory to the wrong parent. 
thread_tracker->setParent(prev_memory_tracker_parent); + current_thread->untracked_memory = prev_untracked_memory; } private: diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f4de7ee86fb..27ce54c03a7 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -160,8 +160,8 @@ class IColumn; M(Bool, enable_multiple_prewhere_read_steps, true, "Move more conditions from WHERE to PREWHERE and do reads from disk and filtering in multiple steps if there are multiple conditions combined with AND", 0) \ M(Bool, move_primary_key_columns_to_end_of_prewhere, true, "Move PREWHERE conditions containing primary key columns to the end of AND chain. It is likely that these conditions are taken into account during primary key analysis and thus will not contribute a lot to PREWHERE filtering.", 0) \ \ - M(Bool, allow_statistic_optimize, false, "Allows using statistic to optimize queries", 0) \ - M(Bool, allow_experimental_statistic, false, "Allows using statistic", 0) \ + M(Bool, allow_statistics_optimize, false, "Allows using statistics to optimize queries", 0) ALIAS(allow_statistic_optimize) \ + M(Bool, allow_experimental_statistics, false, "Allows using statistics", 0) ALIAS(allow_experimental_statistic) \ \ M(UInt64, alter_sync, 1, "Wait for actions to manipulate the partitions. 0 - do not wait, 1 - wait for execution only of itself, 2 - wait for everyone.", 0) ALIAS(replication_alter_partitions_sync) \ M(Int64, replication_wait_for_inactive_replica_timeout, 120, "Wait for inactive replica to execute ALTER/OPTIMIZE. Time in seconds, 0 - do not wait, negative - wait for unlimited time.", 0) \ @@ -891,6 +891,7 @@ class IColumn; M(Bool, geo_distance_returns_float64_on_float64_arguments, true, "If all four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32.", 0) \ M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \ M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \ + M(Bool, enable_blob_storage_log, true, "Write information about blob storage operations to system.blob_storage_log table", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 3ddacd06a06..814c08c5705 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -83,7 +83,7 @@ namespace SettingsChangesHistory /// For newly added setting choose the most appropriate previous_value (for example, if new setting /// controls new feature and it's 'true' by default, use 'false' as previous_value). 
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) -static std::map settings_changes_history = +static const std::map settings_changes_history = { {"24.6", {{"materialize_skip_indexes_on_insert", true, true, "Added new setting to allow to disable materialization of skip indexes on insert"}, {"materialize_statistics_on_insert", true, true, "Added new setting to allow to disable materialization of statistics on insert"}, @@ -96,6 +96,11 @@ static std::map sett {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, + {"enable_blob_storage_log", true, true, "Write information about blob storage operations to system.blob_storage_log table"}, + {"allow_statistic_optimize", false, false, "Old setting which popped up here being renamed."}, + {"allow_experimental_statistic", false, false, "Old setting which popped up here being renamed."}, + {"allow_statistics_optimize", false, false, "The setting was renamed. The previous name is `allow_statistic_optimize`."}, + {"allow_experimental_statistics", false, false, "The setting was renamed. The previous name is `allow_experimental_statistic`."} }}, {"24.5", {{"allow_deprecated_error_prone_window_functions", true, false, "Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference)"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. 
t1.y < t2.y."}, diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 46c30240ef8..85fce671cbb 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -543,6 +543,7 @@ template constexpr bool IsDataTypeNumber = false; template constexpr bool IsDataTypeDateOrDateTime = false; template constexpr bool IsDataTypeDate = false; template constexpr bool IsDataTypeEnum = false; +template constexpr bool IsDataTypeStringOrFixedString = false; template constexpr bool IsDataTypeDecimalOrNumber = IsDataTypeDecimal || IsDataTypeNumber; @@ -556,6 +557,8 @@ class DataTypeDate; class DataTypeDate32; class DataTypeDateTime; class DataTypeDateTime64; +class DataTypeString; +class DataTypeFixedString; template constexpr bool IsDataTypeDecimal> = true; @@ -572,6 +575,9 @@ template <> inline constexpr bool IsDataTypeDateOrDateTime = tru template <> inline constexpr bool IsDataTypeDateOrDateTime = true; template <> inline constexpr bool IsDataTypeDateOrDateTime = true; +template <> inline constexpr bool IsDataTypeStringOrFixedString = true; +template <> inline constexpr bool IsDataTypeStringOrFixedString = true; + template class DataTypeEnum; diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 5cb4198e1a2..025ac585593 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -670,7 +670,7 @@ void DatabaseOnDisk::iterateMetadataFiles(ContextPtr local_context, const Iterat for (auto it = metadata_files.begin(); it < metadata_files.end(); std::advance(it, batch_size)) { std::span batch{it, std::min(std::next(it, batch_size), metadata_files.end())}; - pool.scheduleOrThrowOnError( + pool.scheduleOrThrow( [batch, &process_metadata_file, &process_tmp_drop_metadata_file]() mutable { setThreadName("DatabaseOnDisk"); @@ -679,7 +679,7 @@ void DatabaseOnDisk::iterateMetadataFiles(ContextPtr local_context, const Iterat process_metadata_file(file.first); else process_tmp_drop_metadata_file(file.first); - }); + }, Priority{}, getContext()->getSettingsRef().lock_acquire_timeout.totalMicroseconds()); } pool.wait(); } diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 5d36f1cc3d6..10a8e06e8f0 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -44,6 +44,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int UNKNOWN_DATABASE_ENGINE; extern const int NOT_IMPLEMENTED; + extern const int UNEXPECTED_NODE_IN_ZOOKEEPER; } static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; @@ -76,6 +77,20 @@ static void setReplicatedEngine(ASTCreateQuery * create_query, ContextPtr contex String replica_path = server_settings.default_replica_path; String replica_name = server_settings.default_replica_name; + /// Check that replica path doesn't exist + Macros::MacroExpansionInfo info; + StorageID table_id = StorageID(create_query->getDatabase(), create_query->getTable(), create_query->uuid); + info.table_id = table_id; + info.expand_special_macros_only = false; + + String zookeeper_path = context->getMacros()->expand(replica_path, info); + if (context->getZooKeeper()->exists(zookeeper_path)) + throw Exception( + ErrorCodes::UNEXPECTED_NODE_IN_ZOOKEEPER, + "Found existing ZooKeeper path {} while trying to convert table {} to replicated. 
Table will not be converted.", + zookeeper_path, backQuote(table_id.getFullTableName()) + ); + auto args = std::make_shared(); args->children.push_back(std::make_shared(replica_path)); args->children.push_back(std::make_shared(replica_name)); diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index 0420de0f8dd..a249789df4b 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -36,30 +36,24 @@ void IObjectStorageIteratorAsync::deactivate() void IObjectStorageIteratorAsync::nextBatch() { std::lock_guard lock(mutex); + if (is_finished) { current_batch.clear(); current_batch_iterator = current_batch.begin(); + return; } - else - { - if (!is_initialized) - { - outcome_future = scheduleBatch(); - is_initialized = true; - } + if (!is_initialized) + { + outcome_future = scheduleBatch(); + is_initialized = true; + } + + try + { chassert(outcome_future.valid()); - BatchAndHasNext result; - try - { - result = outcome_future.get(); - } - catch (...) - { - is_finished = true; - throw; - } + BatchAndHasNext result = outcome_future.get(); current_batch = std::move(result.batch); current_batch_iterator = current_batch.begin(); @@ -71,6 +65,11 @@ void IObjectStorageIteratorAsync::nextBatch() else is_finished = true; } + catch (...) + { + is_finished = true; + throw; + } } void IObjectStorageIteratorAsync::next() @@ -95,35 +94,39 @@ std::future IObjectStorageIterator bool IObjectStorageIteratorAsync::isValid() { + std::lock_guard lock(mutex); + if (!is_initialized) nextBatch(); - std::lock_guard lock(mutex); return current_batch_iterator != current_batch.end(); } RelativePathWithMetadataPtr IObjectStorageIteratorAsync::current() { + std::lock_guard lock(mutex); + if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); - std::lock_guard lock(mutex); return *current_batch_iterator; } RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch() { + std::lock_guard lock(mutex); + if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); - std::lock_guard lock(mutex); return current_batch; } std::optional IObjectStorageIteratorAsync::getCurrentBatchAndScheduleNext() { std::lock_guard lock(mutex); + if (!is_initialized) nextBatch(); diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 44d0b750af9..2a0b2f1d075 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -709,7 +709,7 @@ bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateL else return tryReadFloatTextFast(x, rb); } - else /*if constexpr (is_integer_v)*/ + else /*if constexpr (is_integral_v)*/ return tryReadIntText(x, rb); } @@ -814,6 +814,16 @@ enum class ConvertFromStringParsingMode : uint8_t BestEffortUS }; +struct AccurateConvertStrategyAdditions +{ + UInt32 scale { 0 }; +}; + +struct AccurateOrNullConvertStrategyAdditions +{ + UInt32 scale { 0 }; +}; + template struct ConvertThroughParsing @@ -1020,7 +1030,13 @@ struct ConvertThroughParsing break; } } - parseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); + if constexpr (std::is_same_v) + { + if (!tryParseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing)) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Cannot parse string to type {}", TypeName); + } + else + parseImpl(vec_to[i], read_buffer, 
local_time_zone, precise_float_parsing); } while (false); } } @@ -1120,16 +1136,6 @@ struct ConvertThroughParsing /// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type. struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; }; -struct AccurateConvertStrategyAdditions -{ - UInt32 scale { 0 }; -}; - -struct AccurateOrNullConvertStrategyAdditions -{ - UInt32 scale { 0 }; -}; - enum class BehaviourOnErrorFromString : uint8_t { ConvertDefaultBehaviorTag, @@ -3174,8 +3180,11 @@ private: { TypeIndex from_type_index = from_type->getTypeId(); WhichDataType which(from_type_index); + TypeIndex to_type_index = to_type->getTypeId(); + WhichDataType to(to_type_index); bool can_apply_accurate_cast = (cast_type == CastType::accurate || cast_type == CastType::accurateOrNull) && (which.isInt() || which.isUInt() || which.isFloat()); + can_apply_accurate_cast |= cast_type == CastType::accurate && which.isStringOrFixedString() && to.isNativeInteger(); FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior = default_date_time_overflow_behavior; if (context) @@ -3260,6 +3269,20 @@ private: return true; } } + else if constexpr (IsDataTypeStringOrFixedString) + { + if constexpr (IsDataTypeNumber) + { + chassert(wrapper_cast_type == CastType::accurate); + result_column = ConvertImpl::execute( + arguments, + result_type, + input_rows_count, + BehaviourOnErrorFromString::ConvertDefaultBehaviorTag, + AccurateConvertStrategyAdditions()); + } + return true; + } return false; }); diff --git a/src/IO/S3/BlobStorageLogWriter.cpp b/src/IO/S3/BlobStorageLogWriter.cpp index aaf4aea5a8e..c2f0cb86928 100644 --- a/src/IO/S3/BlobStorageLogWriter.cpp +++ b/src/IO/S3/BlobStorageLogWriter.cpp @@ -23,6 +23,9 @@ void BlobStorageLogWriter::addEvent( if (!log) return; + if (log->shouldIgnorePath(local_path_.empty() ? local_path : local_path_)) + return; + if (!time_now.time_since_epoch().count()) time_now = std::chrono::system_clock::now(); diff --git a/src/Interpreters/Access/InterpreterGrantQuery.cpp b/src/Interpreters/Access/InterpreterGrantQuery.cpp index a137404a669..6a46ac9c330 100644 --- a/src/Interpreters/Access/InterpreterGrantQuery.cpp +++ b/src/Interpreters/Access/InterpreterGrantQuery.cpp @@ -438,6 +438,12 @@ BlockIO InterpreterGrantQuery::execute() RolesOrUsersSet roles_to_revoke; collectRolesToGrantOrRevoke(access_control, query, roles_to_grant, roles_to_revoke); + /// Replacing empty database with the default. This step must be done before replication to avoid privilege escalation. + String current_database = getContext()->getCurrentDatabase(); + elements_to_grant.replaceEmptyDatabase(current_database); + elements_to_revoke.replaceEmptyDatabase(current_database); + query.access_rights_elements.replaceEmptyDatabase(current_database); + /// Executing on cluster. if (!query.cluster.empty()) { @@ -453,9 +459,6 @@ BlockIO InterpreterGrantQuery::execute() } /// Check if the current user has corresponding access rights granted with grant option. 
- String current_database = getContext()->getCurrentDatabase(); - elements_to_grant.replaceEmptyDatabase(current_database); - elements_to_revoke.replaceEmptyDatabase(current_database); bool need_check_grantees_are_allowed = true; if (!query.current_grants) checkGrantOption(access_control, *current_user_access, grantees, need_check_grantees_are_allowed, elements_to_grant, elements_to_revoke); diff --git a/src/Interpreters/BlobStorageLog.cpp b/src/Interpreters/BlobStorageLog.cpp index 0324ef8713c..f20ac9165ac 100644 --- a/src/Interpreters/BlobStorageLog.cpp +++ b/src/Interpreters/BlobStorageLog.cpp @@ -9,6 +9,8 @@ #include #include +#include +#include namespace DB { @@ -69,4 +71,32 @@ void BlobStorageLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(error_message); } +void BlobStorageLog::addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const +{ + SystemLog::addSettingsForQuery(mutable_context, query_kind); + + if (query_kind == IAST::QueryKind::Insert) + mutable_context->setSetting("enable_blob_storage_log", false); +} + +static std::string_view normalizePath(std::string_view path) +{ + if (path.starts_with("./")) + path.remove_prefix(2); + if (path.ends_with("/")) + path.remove_suffix(1); + return path; +} + +void BlobStorageLog::prepareTable() +{ + SystemLog::prepareTable(); + if (auto merge_tree_table = std::dynamic_pointer_cast(getStorage())) + { + std::unique_lock lock{prepare_mutex}; + const auto & relative_data_path = merge_tree_table->getRelativeDataPath(); + prefix_to_ignore = normalizePath(relative_data_path); + } +} + } diff --git a/src/Interpreters/BlobStorageLog.h b/src/Interpreters/BlobStorageLog.h index 15e15be4f87..cf8f37299f7 100644 --- a/src/Interpreters/BlobStorageLog.h +++ b/src/Interpreters/BlobStorageLog.h @@ -1,11 +1,14 @@ #pragma once -#include -#include -#include -#include -#include #include +#include + +#include + +#include +#include +#include +#include namespace DB { @@ -51,7 +54,23 @@ struct BlobStorageLogElement class BlobStorageLog : public SystemLog { +public: using SystemLog::SystemLog; + + /// We should not log events for table itself to avoid infinite recursion + bool shouldIgnorePath(const String & path) const + { + std::shared_lock lock{prepare_mutex}; + return !prefix_to_ignore.empty() && path.starts_with(prefix_to_ignore); + } + +protected: + void prepareTable() override; + void addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const override; + +private: + mutable std::shared_mutex prepare_mutex; + String prefix_to_ignore; }; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 5c9ae4716b9..06b3adb328d 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -4103,6 +4103,13 @@ std::shared_ptr Context::getBackupLog() const std::shared_ptr Context::getBlobStorageLog() const { + bool enable_blob_storage_log = settings.enable_blob_storage_log; + if (hasQueryContext()) + enable_blob_storage_log = getQueryContext()->getSettingsRef().enable_blob_storage_log; + + if (!enable_blob_storage_log) + return {}; + SharedLockGuard lock(shared->mutex); if (!shared->system_logs) diff --git a/src/Interpreters/ITokenExtractor.cpp b/src/Interpreters/ITokenExtractor.cpp index 1c5d0d4b6d4..f0bf90fcb5c 100644 --- a/src/Interpreters/ITokenExtractor.cpp +++ b/src/Interpreters/ITokenExtractor.cpp @@ -240,4 +240,34 @@ bool SplitTokenExtractor::nextInStringLike(const char * data, size_t length, siz return !bad_token && 
!token.empty(); } +void SplitTokenExtractor::substringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter, bool is_prefix, bool is_suffix) const +{ + size_t cur = 0; + size_t token_start = 0; + size_t token_len = 0; + + while (cur < length && nextInString(data, length, &cur, &token_start, &token_len)) + // In order to avoid filter updates with incomplete tokens, + // first token is ignored, unless substring is prefix and + // last token is ignored, unless substring is suffix + if ((token_start > 0 || is_prefix) && (token_start + token_len < length || is_suffix)) + bloom_filter.add(data + token_start, token_len); +} + +void SplitTokenExtractor::substringToGinFilter(const char * data, size_t length, GinFilter & gin_filter, bool is_prefix, bool is_suffix) const +{ + gin_filter.setQueryString(data, length); + + size_t cur = 0; + size_t token_start = 0; + size_t token_len = 0; + + while (cur < length && nextInString(data, length, &cur, &token_start, &token_len)) + // In order to avoid filter updates with incomplete tokens, + // first token is ignored, unless substring is prefix and + // last token is ignored, unless substring is suffix + if ((token_start > 0 || is_prefix) && (token_start + token_len < length || is_suffix)) + gin_filter.addTerm(data + token_start, token_len); +} + } diff --git a/src/Interpreters/ITokenExtractor.h b/src/Interpreters/ITokenExtractor.h index 2423ef12311..76711606d09 100644 --- a/src/Interpreters/ITokenExtractor.h +++ b/src/Interpreters/ITokenExtractor.h @@ -28,8 +28,22 @@ struct ITokenExtractor /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight. virtual bool nextInStringLike(const char * data, size_t length, size_t * pos, String & out) const = 0; + /// Updates Bloom filter from exact-match string filter value virtual void stringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const = 0; + /// Updates Bloom filter from substring-match string filter value. + /// An `ITokenExtractor` implementation may decide to skip certain + /// tokens depending on whether the substring is a prefix or a suffix. + virtual void substringToBloomFilter( + const char * data, + size_t length, + BloomFilter & bloom_filter, + bool is_prefix [[maybe_unused]], + bool is_suffix [[maybe_unused]]) const + { + stringToBloomFilter(data, length, bloom_filter); + } + virtual void stringPaddedToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const { stringToBloomFilter(data, length, bloom_filter); @@ -37,8 +51,22 @@ struct ITokenExtractor virtual void stringLikeToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const = 0; + /// Updates GIN filter from exact-match string filter value virtual void stringToGinFilter(const char * data, size_t length, GinFilter & gin_filter) const = 0; + /// Updates GIN filter from substring-match string filter value. + /// An `ITokenExtractor` implementation may decide to skip certain + /// tokens depending on whether the substring is a prefix or a suffix. 
+ virtual void substringToGinFilter( + const char * data, + size_t length, + GinFilter & gin_filter, + bool is_prefix [[maybe_unused]], + bool is_suffix [[maybe_unused]]) const + { + stringToGinFilter(data, length, gin_filter); + } + virtual void stringPaddedToGinFilter(const char * data, size_t length, GinFilter & gin_filter) const { stringToGinFilter(data, length, gin_filter); @@ -148,6 +176,11 @@ struct SplitTokenExtractor final : public ITokenExtractorHelpergetSettings().allow_experimental_statistic && ( - command_ast->type == ASTAlterCommand::ADD_STATISTIC || - command_ast->type == ASTAlterCommand::DROP_STATISTIC || - command_ast->type == ASTAlterCommand::MATERIALIZE_STATISTIC)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Alter table with statistic is now disabled. Turn on allow_experimental_statistic"); + if (!getContext()->getSettings().allow_experimental_statistics && ( + command_ast->type == ASTAlterCommand::ADD_STATISTICS || + command_ast->type == ASTAlterCommand::DROP_STATISTICS || + command_ast->type == ASTAlterCommand::MATERIALIZE_STATISTICS)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Alter table with statistics is now disabled. Turn on allow_experimental_statistics"); } if (typeid_cast(database.get())) @@ -343,19 +343,24 @@ AccessRightsElements InterpreterAlterQuery::getRequiredAccessForCommand(const AS required_access.emplace_back(AccessType::ALTER_SAMPLE_BY, database, table); break; } - case ASTAlterCommand::ADD_STATISTIC: + case ASTAlterCommand::ADD_STATISTICS: { - required_access.emplace_back(AccessType::ALTER_ADD_STATISTIC, database, table); + required_access.emplace_back(AccessType::ALTER_ADD_STATISTICS, database, table); break; } - case ASTAlterCommand::DROP_STATISTIC: + case ASTAlterCommand::MODIFY_STATISTICS: { - required_access.emplace_back(AccessType::ALTER_DROP_STATISTIC, database, table); + required_access.emplace_back(AccessType::ALTER_MODIFY_STATISTICS, database, table); break; } - case ASTAlterCommand::MATERIALIZE_STATISTIC: + case ASTAlterCommand::DROP_STATISTICS: { - required_access.emplace_back(AccessType::ALTER_MATERIALIZE_STATISTIC, database, table); + required_access.emplace_back(AccessType::ALTER_DROP_STATISTICS, database, table); + break; + } + case ASTAlterCommand::MATERIALIZE_STATISTICS: + { + required_access.emplace_back(AccessType::ALTER_MATERIALIZE_STATISTICS, database, table); break; } case ASTAlterCommand::ADD_INDEX: diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f2e03ca41bd..66936dc25d7 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -448,9 +448,9 @@ ASTPtr InterpreterCreateQuery::formatColumns(const ColumnsDescription & columns) column_declaration->children.push_back(column_declaration->codec); } - if (column.stat) + if (!column.statistics.empty()) { - column_declaration->stat_type = column.stat->ast; + column_declaration->stat_type = column.statistics.getAST(); column_declaration->children.push_back(column_declaration->stat_type); } @@ -675,11 +675,13 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_deflate_qpl_codec, enable_zstd_qat_codec); } + column.statistics.column_name = column.name; /// We assign column name here for better exception error message. 
if (col_decl.stat_type) { - if (!skip_checks && !context_->getSettingsRef().allow_experimental_statistic) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Create table with statistic is now disabled. Turn on allow_experimental_statistic"); - column.stat = StatisticDescription::getStatisticFromColumnDeclaration(col_decl); + if (!skip_checks && !context_->getSettingsRef().allow_experimental_statistics) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Create table with statistics is now disabled. Turn on allow_experimental_statistics"); + column.statistics = ColumnStatisticsDescription::fromColumnDeclaration(col_decl); + column.statistics.data_type = column.type; } if (col_decl.ttl) @@ -754,7 +756,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Experimental full-text index feature is not enabled (the setting 'allow_experimental_full_text_index')"); /// ---- /// Temporary check during a transition period. Please remove at the end of 2024. - if (index_desc.type == INVERTED_INDEX_NAME && settings.allow_experimental_inverted_index) /// The funny condition is not a mistake, see 02346_fulltext_index_old_name.sql + if (index_desc.type == INVERTED_INDEX_NAME && !settings.allow_experimental_inverted_index) throw Exception(ErrorCodes::ILLEGAL_INDEX, "Please use index type 'full_text' instead of 'inverted'"); /// ---- if (index_desc.type == "annoy" && !settings.allow_experimental_annoy_index) diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index 458be843b59..3a06e1b2301 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -67,8 +67,8 @@ namespace static void visit(ASTSelectQuery & select, ASTPtr & node, Data & data) { - /// we need to read statistic when `allow_statistic_optimize` is enabled. - bool only_analyze = !data.getContext()->getSettings().allow_statistic_optimize; + /// we need to read statistic when `allow_statistics_optimize` is enabled. 
+ bool only_analyze = !data.getContext()->getSettings().allow_statistics_optimize; InterpreterSelectQuery interpreter( node, data.getContext(), SelectQueryOptions(QueryProcessingStage::FetchColumns).analyze(only_analyze).modify()); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index e72cf670f69..09f987a1c24 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -657,7 +657,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( MergeTreeWhereOptimizer where_optimizer{ std::move(column_compressed_sizes), metadata_snapshot, - storage->getConditionEstimatorByPredicate(storage_snapshot, nullptr, context), + storage->getConditionSelectivityEstimatorByPredicate(storage_snapshot, nullptr, context), queried_columns, supported_prewhere_columns, log}; diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 4f6c1c5f18b..ba33b70b59c 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -55,7 +55,7 @@ namespace ErrorCodes extern const int CANNOT_UPDATE_COLUMN; extern const int UNEXPECTED_EXPRESSION; extern const int THERE_IS_NO_COLUMN; - extern const int ILLEGAL_STATISTIC; + extern const int ILLEGAL_STATISTICS; } @@ -781,7 +781,7 @@ void MutationsInterpreter::prepare(bool dry_run) } else if (command.type == MutationCommand::MATERIALIZE_INDEX) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); auto it = std::find_if( std::cbegin(indices_desc), std::end(indices_desc), [&](const IndexDescription & index) @@ -801,20 +801,20 @@ void MutationsInterpreter::prepare(bool dry_run) materialized_indices.emplace(command.index_name); } } - else if (command.type == MutationCommand::MATERIALIZE_STATISTIC) + else if (command.type == MutationCommand::MATERIALIZE_STATISTICS) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); - for (const auto & stat_column_name: command.statistic_columns) + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); + for (const auto & stat_column_name: command.statistics_columns) { - if (!columns_desc.has(stat_column_name) || !columns_desc.get(stat_column_name).stat) - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Unknown statistic column: {}", stat_column_name); - dependencies.emplace(stat_column_name, ColumnDependency::STATISTIC); + if (!columns_desc.has(stat_column_name) || columns_desc.get(stat_column_name).statistics.empty()) + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Unknown statistics column: {}", stat_column_name); + dependencies.emplace(stat_column_name, ColumnDependency::STATISTICS); materialized_statistics.emplace(stat_column_name); } } else if (command.type == MutationCommand::MATERIALIZE_PROJECTION) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); const auto & projection = projections_desc.get(command.projection_name); if (!source.hasProjection(projection.name) || source.hasBrokenProjection(projection.name)) { @@ -825,18 +825,18 @@ void MutationsInterpreter::prepare(bool dry_run) } else if (command.type == MutationCommand::DROP_INDEX) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); materialized_indices.erase(command.index_name); } - else if 
(command.type == MutationCommand::DROP_STATISTIC) + else if (command.type == MutationCommand::DROP_STATISTICS) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); - for (const auto & stat_column_name: command.statistic_columns) + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); + for (const auto & stat_column_name: command.statistics_columns) materialized_statistics.erase(stat_column_name); } else if (command.type == MutationCommand::DROP_PROJECTION) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); materialized_projections.erase(command.projection_name); } else if (command.type == MutationCommand::MATERIALIZE_TTL) @@ -888,7 +888,7 @@ void MutationsInterpreter::prepare(bool dry_run) { if (dependency.kind == ColumnDependency::SKIP_INDEX || dependency.kind == ColumnDependency::PROJECTION - || dependency.kind == ColumnDependency::STATISTIC) + || dependency.kind == ColumnDependency::STATISTICS) dependencies.insert(dependency); } } @@ -1360,7 +1360,7 @@ QueryPipelineBuilder MutationsInterpreter::execute() Block MutationsInterpreter::getUpdatedHeader() const { // If it's an index/projection materialization, we don't write any data columns, thus empty header is used - return mutation_kind.mutation_kind == MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION ? Block{} : *updated_header; + return mutation_kind.mutation_kind == MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION ? Block{} : *updated_header; } const ColumnDependencies & MutationsInterpreter::getColumnDependencies() const diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 2d01c7154c8..6aaa233cda3 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -102,7 +102,7 @@ public: enum MutationKindEnum { MUTATE_UNKNOWN, - MUTATE_INDEX_STATISTIC_PROJECTION, + MUTATE_INDEX_STATISTICS_PROJECTION, MUTATE_OTHER, } mutation_kind = MUTATE_UNKNOWN; diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 3af8761ff8e..5e0ce2cb0de 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -519,8 +519,7 @@ void SystemLog::flushImpl(const std::vector & to_flush, // we need query context to do inserts to target table with MV containing subqueries or joins auto insert_context = Context::createCopy(context); insert_context->makeQueryContext(); - /// We always want to deliver the data to the original table regardless of the MVs - insert_context->setSetting("materialized_views_ignore_errors", true); + addSettingsForQuery(insert_context, IAST::QueryKind::Insert); InterpreterInsertQuery interpreter(query_ptr, insert_context); BlockIO io = interpreter.execute(); @@ -541,13 +540,18 @@ void SystemLog::flushImpl(const std::vector & to_flush, LOG_TRACE(log, "Flushed system log up to offset {}", to_flush_end); } +template +StoragePtr SystemLog::getStorage() const +{ + return DatabaseCatalog::instance().tryGetTable(table_id, getContext()); +} template void SystemLog::prepareTable() { String description = table_id.getNameForLogs(); - auto table = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); + auto table = getStorage(); if (table) { if (old_create_query.empty()) @@ -596,10 +600,9 @@ void SystemLog::prepareTable() merges_lock = table->getActionLock(ActionLocks::PartsMerge); auto query_context = Context::createCopy(context); - /// As this operation is performed automatically we 
don't want it to fail because of user dependencies on log tables - query_context->setSetting("check_table_dependencies", Field{false}); - query_context->setSetting("check_referential_table_dependencies", Field{false}); query_context->makeQueryContext(); + addSettingsForQuery(query_context, IAST::QueryKind::Rename); + InterpreterRenameQuery(rename, query_context).execute(); /// The required table will be created. @@ -616,6 +619,7 @@ void SystemLog::prepareTable() auto query_context = Context::createCopy(context); query_context->makeQueryContext(); + addSettingsForQuery(query_context, IAST::QueryKind::Create); auto create_query_ast = getCreateTableQuery(); InterpreterCreateQuery interpreter(create_query_ast, query_context); @@ -630,6 +634,22 @@ void SystemLog::prepareTable() is_prepared = true; } +template +void SystemLog::addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const +{ + if (query_kind == IAST::QueryKind::Insert) + { + /// We always want to deliver the data to the original table regardless of the MVs + mutable_context->setSetting("materialized_views_ignore_errors", true); + } + else if (query_kind == IAST::QueryKind::Rename) + { + /// As this operation is performed automatically we don't want it to fail because of user dependencies on log tables + mutable_context->setSetting("check_table_dependencies", Field{false}); + mutable_context->setSetting("check_referential_table_dependencies", Field{false}); + } +} + template ASTPtr SystemLog::getCreateTableQuery() { diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index e5b79585701..af635ca1bdb 100644 --- a/src/Interpreters/SystemLog.h +++ b/src/Interpreters/SystemLog.h @@ -2,6 +2,7 @@ #include #include +#include #include @@ -139,6 +140,17 @@ protected: using ISystemLog::thread_mutex; using Base::queue; + StoragePtr getStorage() const; + + /** Creates new table if it does not exist. + * Renames old table if its structure is not suitable. + * This cannot be done in constructor to avoid deadlock while renaming a table under locked Context when SystemLog object is created. + */ + void prepareTable() override; + + /// Some tables can override settings for internal queries + virtual void addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const; + private: /* Saving thread data */ const StorageID table_id; @@ -147,12 +159,6 @@ private: String old_create_query; bool is_prepared = false; - /** Creates new table if it does not exist. - * Renames old table if its structure is not suitable. - * This cannot be done in constructor to avoid deadlock while renaming a table under locked Context when SystemLog object is created. - */ - void prepareTable() override; - void savingThreadFunction() override; /// flushImpl can be executed only in saving_thread. 
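The parser and AST hunks that follow pluralize the statistics grammar (ADD/DROP/CLEAR/MATERIALIZE STATISTICS) and introduce a new MODIFY STATISTICS verb. As a rough sketch of the SQL surface these changes imply (the table `t`, columns `a`, `b`, and partition `p` are hypothetical; `tdigest` is the only statistics type named in this diff, so exact type spellings may differ):

```sql
-- ADD and MODIFY take a column list plus a TYPE list (ParserStatisticsDeclaration);
-- DROP, CLEAR and MATERIALIZE take the column list only
-- (ParserStatisticsDeclarationWithoutTypes).
ALTER TABLE t ADD STATISTICS IF NOT EXISTS a, b TYPE tdigest;
ALTER TABLE t MODIFY STATISTICS a, b TYPE tdigest;
ALTER TABLE t DROP STATISTICS IF EXISTS a, b;
ALTER TABLE t CLEAR STATISTICS IF EXISTS a, b IN PARTITION p;
ALTER TABLE t MATERIALIZE STATISTICS a, b IN PARTITION p;
```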
diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index f104e715452..90b63d2ce6f 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -42,8 +42,8 @@ ASTPtr ASTAlterCommand::clone() const res->projection_decl = res->children.emplace_back(projection_decl->clone()).get(); if (projection) res->projection = res->children.emplace_back(projection->clone()).get(); - if (statistic_decl) - res->statistic_decl = res->children.emplace_back(statistic_decl->clone()).get(); + if (statistics_decl) + res->statistics_decl = res->children.emplace_back(statistics_decl->clone()).get(); if (partition) res->partition = res->children.emplace_back(partition->clone()).get(); if (predicate) @@ -200,27 +200,33 @@ void ASTAlterCommand::formatImpl(const FormatSettings & settings, FormatState & partition->formatImpl(settings, state, frame); } } - else if (type == ASTAlterCommand::ADD_STATISTIC) + else if (type == ASTAlterCommand::ADD_STATISTICS) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << "ADD STATISTIC " << (if_not_exists ? "IF NOT EXISTS " : "") + settings.ostr << (settings.hilite ? hilite_keyword : "") << "ADD STATISTICS " << (if_not_exists ? "IF NOT EXISTS " : "") << (settings.hilite ? hilite_none : ""); - statistic_decl->formatImpl(settings, state, frame); + statistics_decl->formatImpl(settings, state, frame); } - else if (type == ASTAlterCommand::DROP_STATISTIC) + else if (type == ASTAlterCommand::MODIFY_STATISTICS) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << (clear_statistic ? "CLEAR " : "DROP ") << "STATISTIC " + settings.ostr << (settings.hilite ? hilite_keyword : "") << "MODIFY STATISTICS " + << (settings.hilite ? hilite_none : ""); + statistics_decl->formatImpl(settings, state, frame); + } + else if (type == ASTAlterCommand::DROP_STATISTICS) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << (clear_statistics ? "CLEAR " : "DROP ") << "STATISTICS " << (if_exists ? "IF EXISTS " : "") << (settings.hilite ? hilite_none : ""); - statistic_decl->formatImpl(settings, state, frame); + statistics_decl->formatImpl(settings, state, frame); if (partition) { settings.ostr << (settings.hilite ? hilite_keyword : "") << " IN PARTITION " << (settings.hilite ? hilite_none : ""); partition->formatImpl(settings, state, frame); } } - else if (type == ASTAlterCommand::MATERIALIZE_STATISTIC) + else if (type == ASTAlterCommand::MATERIALIZE_STATISTICS) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << "MATERIALIZE STATISTIC " << (settings.hilite ? hilite_none : ""); - statistic_decl->formatImpl(settings, state, frame); + settings.ostr << (settings.hilite ? hilite_keyword : "") << "MATERIALIZE STATISTICS " << (settings.hilite ? hilite_none : ""); + statistics_decl->formatImpl(settings, state, frame); if (partition) { settings.ostr << (settings.hilite ? hilite_keyword : "") << " IN PARTITION " << (settings.hilite ? 
hilite_none : ""); @@ -507,7 +513,7 @@ void ASTAlterCommand::forEachPointerToChild(std::function f) f(reinterpret_cast(&constraint)); f(reinterpret_cast(&projection_decl)); f(reinterpret_cast(&projection)); - f(reinterpret_cast(&statistic_decl)); + f(reinterpret_cast(&statistics_decl)); f(reinterpret_cast(&partition)); f(reinterpret_cast(&predicate)); f(reinterpret_cast(&update_assignments)); diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index a3cab1688c2..d7269bed2da 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -55,9 +55,10 @@ public: DROP_PROJECTION, MATERIALIZE_PROJECTION, - ADD_STATISTIC, - DROP_STATISTIC, - MATERIALIZE_STATISTIC, + ADD_STATISTICS, + DROP_STATISTICS, + MODIFY_STATISTICS, + MATERIALIZE_STATISTICS, DROP_PARTITION, DROP_DETACHED_PARTITION, @@ -135,7 +136,7 @@ public: */ IAST * projection = nullptr; - IAST * statistic_decl = nullptr; + IAST * statistics_decl = nullptr; /** Used in DROP PARTITION, ATTACH PARTITION FROM, FORGET PARTITION, UPDATE, DELETE queries. * The value or ID of the partition is stored here. @@ -180,7 +181,7 @@ public: bool clear_index = false; /// for CLEAR INDEX (do not drop index from metadata) - bool clear_statistic = false; /// for CLEAR STATISTIC (do not drop statistic from metadata) + bool clear_statistics = false; /// for CLEAR STATISTICS (do not drop statistics from metadata) bool clear_projection = false; /// for CLEAR PROJECTION (do not drop projection from metadata) diff --git a/src/Parsers/ASTStatisticDeclaration.cpp b/src/Parsers/ASTStatisticDeclaration.cpp deleted file mode 100644 index 0e20b020ab3..00000000000 --- a/src/Parsers/ASTStatisticDeclaration.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include - -#include -#include -#include - - -namespace DB -{ - -ASTPtr ASTStatisticDeclaration::clone() const -{ - auto res = std::make_shared(); - - res->set(res->columns, columns->clone()); - res->type = type; - - return res; -} - -std::vector ASTStatisticDeclaration::getColumnNames() const -{ - std::vector result; - result.reserve(columns->children.size()); - for (const ASTPtr & column_ast : columns->children) - { - result.push_back(column_ast->as().name()); - } - return result; - -} - -void ASTStatisticDeclaration::formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const -{ - columns->formatImpl(s, state, frame); - s.ostr << (s.hilite ? hilite_keyword : "") << " TYPE " << (s.hilite ? 
hilite_none : ""); - s.ostr << backQuoteIfNeed(type); -} - -} - diff --git a/src/Parsers/ASTStatisticsDeclaration.cpp b/src/Parsers/ASTStatisticsDeclaration.cpp new file mode 100644 index 00000000000..f9b7a9e29db --- /dev/null +++ b/src/Parsers/ASTStatisticsDeclaration.cpp @@ -0,0 +1,60 @@ +#include +#include + +#include +#include +#include + + +namespace DB +{ + +ASTPtr ASTStatisticsDeclaration::clone() const +{ + auto res = std::make_shared(); + + res->set(res->columns, columns->clone()); + if (types) + res->set(res->types, types->clone()); + + return res; +} + +std::vector ASTStatisticsDeclaration::getColumnNames() const +{ + std::vector result; + result.reserve(columns->children.size()); + for (const ASTPtr & column_ast : columns->children) + { + result.push_back(column_ast->as().name()); + } + return result; + +} + +std::vector ASTStatisticsDeclaration::getTypeNames() const +{ + chassert(types != nullptr); + std::vector result; + result.reserve(types->children.size()); + for (const ASTPtr & column_ast : types->children) + { + result.push_back(column_ast->as().name); + } + return result; + +} + +void ASTStatisticsDeclaration::formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const +{ + columns->formatImpl(s, state, frame); + s.ostr << (s.hilite ? hilite_keyword : ""); + if (types) + { + s.ostr << " TYPE " << (s.hilite ? hilite_none : ""); + types->formatImpl(s, state, frame); + } +} + +} + diff --git a/src/Parsers/ASTStatisticDeclaration.h b/src/Parsers/ASTStatisticsDeclaration.h similarity index 74% rename from src/Parsers/ASTStatisticDeclaration.h rename to src/Parsers/ASTStatisticsDeclaration.h index f936c93f2ba..f43567b3c70 100644 --- a/src/Parsers/ASTStatisticDeclaration.h +++ b/src/Parsers/ASTStatisticsDeclaration.h @@ -9,17 +9,17 @@ class ASTFunction; /** name BY columns TYPE typename(args) in create query */ -class ASTStatisticDeclaration : public IAST +class ASTStatisticsDeclaration : public IAST { public: IAST * columns; - /// TODO type should be a list of ASTFunction, for example, 'tdigest(256), hyperloglog(128)', etc. - String type; + IAST * types; /** Get the text that identifies this element. 
*/ String getID(char) const override { return "Stat"; } std::vector getColumnNames() const; + std::vector getTypeNames() const; ASTPtr clone() const override; void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/CommonParsers.h b/src/Parsers/CommonParsers.h index 97094b00bc6..f0cbe42da80 100644 --- a/src/Parsers/CommonParsers.h +++ b/src/Parsers/CommonParsers.h @@ -13,7 +13,7 @@ namespace DB MR_MACROS(ADD_CONSTRAINT, "ADD CONSTRAINT") \ MR_MACROS(ADD_INDEX, "ADD INDEX") \ MR_MACROS(ADD_PROJECTION, "ADD PROJECTION") \ - MR_MACROS(ADD_STATISTIC, "ADD STATISTIC") \ + MR_MACROS(ADD_STATISTICS, "ADD STATISTICS") \ MR_MACROS(ADD, "ADD") \ MR_MACROS(ADMIN_OPTION_FOR, "ADMIN OPTION FOR") \ MR_MACROS(AFTER, "AFTER") \ @@ -83,7 +83,7 @@ namespace DB MR_MACROS(CLEAR_COLUMN, "CLEAR COLUMN") \ MR_MACROS(CLEAR_INDEX, "CLEAR INDEX") \ MR_MACROS(CLEAR_PROJECTION, "CLEAR PROJECTION") \ - MR_MACROS(CLEAR_STATISTIC, "CLEAR STATISTIC") \ + MR_MACROS(CLEAR_STATISTICS, "CLEAR STATISTICS") \ MR_MACROS(CLUSTER, "CLUSTER") \ MR_MACROS(CLUSTERS, "CLUSTERS") \ MR_MACROS(CN, "CN") \ @@ -150,7 +150,7 @@ namespace DB MR_MACROS(DROP_PART, "DROP PART") \ MR_MACROS(DROP_PARTITION, "DROP PARTITION") \ MR_MACROS(DROP_PROJECTION, "DROP PROJECTION") \ - MR_MACROS(DROP_STATISTIC, "DROP STATISTIC") \ + MR_MACROS(DROP_STATISTICS, "DROP STATISTICS") \ MR_MACROS(DROP_TABLE, "DROP TABLE") \ MR_MACROS(DROP_TEMPORARY_TABLE, "DROP TEMPORARY TABLE") \ MR_MACROS(DROP, "DROP") \ @@ -279,7 +279,7 @@ namespace DB MR_MACROS(MATERIALIZE_COLUMN, "MATERIALIZE COLUMN") \ MR_MACROS(MATERIALIZE_INDEX, "MATERIALIZE INDEX") \ MR_MACROS(MATERIALIZE_PROJECTION, "MATERIALIZE PROJECTION") \ - MR_MACROS(MATERIALIZE_STATISTIC, "MATERIALIZE STATISTIC") \ + MR_MACROS(MATERIALIZE_STATISTICS, "MATERIALIZE STATISTICS") \ MR_MACROS(MATERIALIZE_TTL, "MATERIALIZE TTL") \ MR_MACROS(MATERIALIZE, "MATERIALIZE") \ MR_MACROS(MATERIALIZED, "MATERIALIZED") \ @@ -304,6 +304,7 @@ namespace DB MR_MACROS(MODIFY_QUERY, "MODIFY QUERY") \ MR_MACROS(MODIFY_REFRESH, "MODIFY REFRESH") \ MR_MACROS(MODIFY_SAMPLE_BY, "MODIFY SAMPLE BY") \ + MR_MACROS(MODIFY_STATISTICS, "MODIFY STATISTICS") \ MR_MACROS(MODIFY_SETTING, "MODIFY SETTING") \ MR_MACROS(MODIFY_SQL_SECURITY, "MODIFY SQL SECURITY") \ MR_MACROS(MODIFY_TTL, "MODIFY TTL") \ @@ -447,7 +448,7 @@ namespace DB MR_MACROS(SQL_SECURITY, "SQL SECURITY") \ MR_MACROS(SS, "SS") \ MR_MACROS(START_TRANSACTION, "START TRANSACTION") \ - MR_MACROS(STATISTIC, "STATISTIC") \ + MR_MACROS(STATISTICS, "STATISTICS") \ MR_MACROS(STEP, "STEP") \ MR_MACROS(STORAGE, "STORAGE") \ MR_MACROS(STRICT, "STRICT") \ diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 416f696323c..5997452bcf3 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -703,7 +703,7 @@ bool ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return true; } -bool ParserStatisticType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +bool ParserStatisticsType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserList stat_type_parser(std::make_unique(), std::make_unique(TokenType::Comma), false); @@ -722,7 +722,7 @@ bool ParserStatisticType::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte ++pos; auto function_node = std::make_shared(); - function_node->name = "STATISTIC"; + function_node->name = "STATISTICS"; function_node->arguments = stat_type; 
function_node->children.push_back(function_node->arguments); diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index 14d501e50da..4e3f29bfe0c 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -202,11 +202,11 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -/// STATISTIC(tdigest(200)) -class ParserStatisticType : public IParserBase +/// STATISTICS(tdigest(200)) +class ParserStatisticsType : public IParserBase { protected: - const char * getName() const override { return "statistic"; } + const char * getName() const override { return "statistics"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index 5f2bd50524c..b4601389696 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -59,9 +59,6 @@ Token quotedStringWithUnicodeQuotes(const char *& pos, const char * const token_ pos = find_first_symbols<'\xE2'>(pos, end); if (pos + 2 >= end) return Token(error_token, token_begin, end); - /// Empty identifiers are not allowed, while empty strings are. - if (success_token == TokenType::QuotedIdentifier && pos + 3 >= end) - return Token(error_token, token_begin, end); if (pos[0] == '\xE2' && pos[1] == '\x80' && pos[2] == expected_end_byte) { diff --git a/src/Parsers/ParserAlterQuery.cpp b/src/Parsers/ParserAlterQuery.cpp index 6f48f79d942..28dbf781011 100644 --- a/src/Parsers/ParserAlterQuery.cpp +++ b/src/Parsers/ParserAlterQuery.cpp @@ -49,10 +49,11 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_clear_index(Keyword::CLEAR_INDEX); ParserKeyword s_materialize_index(Keyword::MATERIALIZE_INDEX); - ParserKeyword s_add_statistic(Keyword::ADD_STATISTIC); - ParserKeyword s_drop_statistic(Keyword::DROP_STATISTIC); - ParserKeyword s_clear_statistic(Keyword::CLEAR_STATISTIC); - ParserKeyword s_materialize_statistic(Keyword::MATERIALIZE_STATISTIC); + ParserKeyword s_add_statistics(Keyword::ADD_STATISTICS); + ParserKeyword s_drop_statistics(Keyword::DROP_STATISTICS); + ParserKeyword s_modify_statistics(Keyword::MODIFY_STATISTICS); + ParserKeyword s_clear_statistics(Keyword::CLEAR_STATISTICS); + ParserKeyword s_materialize_statistics(Keyword::MATERIALIZE_STATISTICS); ParserKeyword s_add_constraint(Keyword::ADD_CONSTRAINT); ParserKeyword s_drop_constraint(Keyword::DROP_CONSTRAINT); @@ -126,7 +127,8 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserIdentifier parser_remove_property; ParserCompoundColumnDeclaration parser_col_decl; ParserIndexDeclaration parser_idx_decl; - ParserStatisticDeclaration parser_stat_decl; + ParserStatisticsDeclaration parser_stat_decl; + ParserStatisticsDeclarationWithoutTypes parser_stat_decl_without_types; ParserConstraintDeclaration parser_constraint_decl; ParserProjectionDeclaration parser_projection_decl; ParserCompoundColumnDeclaration parser_modify_col_decl(false, false, true); @@ -154,7 +156,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ASTPtr command_constraint; ASTPtr command_projection_decl; ASTPtr command_projection; - ASTPtr command_statistic_decl; + ASTPtr command_statistics_decl; ASTPtr command_partition; ASTPtr command_predicate; ASTPtr command_update_assignments; @@ -368,36 +370,43 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; } } - else if 
(s_add_statistic.ignore(pos, expected)) + else if (s_add_statistics.ignore(pos, expected)) { if (s_if_not_exists.ignore(pos, expected)) command->if_not_exists = true; - if (!parser_stat_decl.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::ADD_STATISTIC; + command->type = ASTAlterCommand::ADD_STATISTICS; } - else if (s_drop_statistic.ignore(pos, expected)) + else if (s_modify_statistics.ignore(pos, expected)) + { + if (!parser_stat_decl.parse(pos, command_statistics_decl, expected)) + return false; + + command->type = ASTAlterCommand::MODIFY_STATISTICS; + } + else if (s_drop_statistics.ignore(pos, expected)) { if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::DROP_STATISTIC; + command->type = ASTAlterCommand::DROP_STATISTICS; } - else if (s_clear_statistic.ignore(pos, expected)) + else if (s_clear_statistics.ignore(pos, expected)) { if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::DROP_STATISTIC; - command->clear_statistic = true; + command->type = ASTAlterCommand::DROP_STATISTICS; + command->clear_statistics = true; command->detach = false; if (s_in_partition.ignore(pos, expected)) @@ -406,15 +415,15 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; } } - else if (s_materialize_statistic.ignore(pos, expected)) + else if (s_materialize_statistics.ignore(pos, expected)) { if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::MATERIALIZE_STATISTIC; + command->type = ASTAlterCommand::MATERIALIZE_STATISTICS; command->detach = false; if (s_in_partition.ignore(pos, expected)) @@ -931,8 +940,8 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected command->projection_decl = command->children.emplace_back(std::move(command_projection_decl)).get(); if (command_projection) command->projection = command->children.emplace_back(std::move(command_projection)).get(); - if (command_statistic_decl) - command->statistic_decl = command->children.emplace_back(std::move(command_statistic_decl)).get(); + if (command_statistics_decl) + command->statistics_decl = command->children.emplace_back(std::move(command_statistics_decl)).get(); if (command_partition) command->partition = command->children.emplace_back(std::move(command_partition)).get(); if (command_predicate) diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index c1b45871577..014dc7bd3bf 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -225,15 +225,15 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe return true; } -bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +bool 
ParserStatisticsDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserKeyword s_type(Keyword::TYPE); ParserList columns_p(std::make_unique(), std::make_unique(TokenType::Comma), false); - ParserIdentifier type_p; + ParserList types_p(std::make_unique(), std::make_unique(TokenType::Comma), false); ASTPtr columns; - ASTPtr type; + ASTPtr types; if (!columns_p.parse(pos, columns, expected)) return false; @@ -241,12 +241,29 @@ bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & if (!s_type.ignore(pos, expected)) return false; - if (!type_p.parse(pos, type, expected)) + if (!types_p.parse(pos, types, expected)) return false; - auto stat = std::make_shared(); + auto stat = std::make_shared(); + stat->set(stat->columns, columns); + stat->set(stat->types, types); + node = stat; + + return true; +} + +bool ParserStatisticsDeclarationWithoutTypes::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + + ParserList columns_p(std::make_unique(), std::make_unique(TokenType::Comma), false); + + ASTPtr columns; + + if (!columns_p.parse(pos, columns, expected)) + return false; + + auto stat = std::make_shared(); stat->set(stat->columns, columns); - stat->type = type->as().name(); node = stat; return true; diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index d001c097114..27bb524970d 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -138,7 +138,7 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ParserKeyword s_auto_increment{Keyword::AUTO_INCREMENT}; ParserKeyword s_comment{Keyword::COMMENT}; ParserKeyword s_codec{Keyword::CODEC}; - ParserKeyword s_stat{Keyword::STATISTIC}; + ParserKeyword s_stat{Keyword::STATISTICS}; ParserKeyword s_ttl{Keyword::TTL}; ParserKeyword s_remove{Keyword::REMOVE}; ParserKeyword s_modify_setting(Keyword::MODIFY_SETTING); @@ -155,7 +155,7 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ParserLiteral literal_parser; ParserCodec codec_parser; ParserCollation collation_parser; - ParserStatisticType stat_type_parser; + ParserStatisticsType stat_type_parser; ParserExpression expression_parser; ParserSetQuery settings_parser(true); @@ -452,16 +452,27 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -class ParserStatisticDeclaration : public IParserBase +class ParserStatisticsDeclaration : public IParserBase { public: - ParserStatisticDeclaration() = default; + ParserStatisticsDeclaration() = default; protected: const char * getName() const override { return "statistics declaration"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +class ParserStatisticsDeclarationWithoutTypes : public IParserBase +{ +public: + ParserStatisticsDeclarationWithoutTypes() = default; + +protected: + const char * getName() const override { return "statistics declaration"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + + class ParserConstraintDeclaration : public IParserBase { protected: diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 26e78ea69ac..6ec9a1e2f0d 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -1592,6 +1592,8 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ left_join_tree_query_plan.actions_dags.emplace_back(std::move(join_clauses_and_actions.left_join_expressions_actions)); if 
(join_clauses_and_actions.right_join_expressions_actions) left_join_tree_query_plan.actions_dags.emplace_back(std::move(join_clauses_and_actions.right_join_expressions_actions)); + if (join_clauses_and_actions.mixed_join_expressions_actions) + left_join_tree_query_plan.actions_dags.push_back(join_clauses_and_actions.mixed_join_expressions_actions); auto mapping = std::move(left_join_tree_query_plan.query_node_to_plan_step_mapping); auto & r_mapping = right_join_tree_query_plan.query_node_to_plan_step_mapping; diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp index fbd9b451ddc..e374d8811db 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp @@ -83,7 +83,7 @@ void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) MergeTreeWhereOptimizer where_optimizer{ std::move(column_compressed_sizes), storage_metadata, - storage.getConditionEstimatorByPredicate(storage_snapshot, source_step_with_filter->getFilterActionsDAG(), context), + storage.getConditionSelectivityEstimatorByPredicate(storage_snapshot, source_step_with_filter->getFilterActionsDAG(), context), queried_columns, storage.supportedPrewhereColumns(), getLogger("QueryPlanOptimizePrewhere")}; diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index df9e095af30..136d474751a 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -118,7 +118,7 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s optimizePrimaryKeyCondition(stack); /// NOTE: optimizePrewhere can modify the stack. 
- /// Prewhere optimization relies on PK optimization (getConditionEstimatorByPredicate) + /// Prewhere optimization relies on PK optimization (getConditionSelectivityEstimatorByPredicate) if (optimization_settings.optimize_prewhere) optimizePrewhere(stack, nodes); diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index d1db4cb3951..02d0959ff50 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -67,6 +67,8 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; + extern const int CANNOT_COMPILE_REGEXP; + extern const int CANNOT_OPEN_FILE; extern const int CANNOT_PARSE_TEXT; extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; extern const int CANNOT_PARSE_QUOTED_STRING; @@ -78,8 +80,7 @@ namespace ErrorCodes extern const int CANNOT_PARSE_IPV6; extern const int CANNOT_PARSE_UUID; extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; - extern const int CANNOT_OPEN_FILE; - extern const int CANNOT_COMPILE_REGEXP; + extern const int CANNOT_SCHEDULE_TASK; extern const int DUPLICATE_COLUMN; extern const int ILLEGAL_COLUMN; extern const int THERE_IS_NO_COLUMN; @@ -267,6 +268,10 @@ static Poco::Net::HTTPResponse::HTTPStatus exceptionCodeToHTTPStatus(int excepti { return HTTPResponse::HTTP_REQUEST_TIMEOUT; } + else if (exception_code == ErrorCodes::CANNOT_SCHEDULE_TASK) + { + return HTTPResponse::HTTP_SERVICE_UNAVAILABLE; + } return HTTPResponse::HTTP_INTERNAL_SERVER_ERROR; } diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 4879d1a16dc..3a4c1d94750 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -44,7 +44,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; - extern const int ILLEGAL_STATISTIC; + extern const int ILLEGAL_STATISTICS; extern const int BAD_ARGUMENTS; extern const int NOT_FOUND_COLUMN_IN_BLOCK; extern const int LOGICAL_ERROR; @@ -263,17 +263,32 @@ std::optional AlterCommand::parse(const ASTAlterCommand * command_ return command; } - else if (command_ast->type == ASTAlterCommand::ADD_STATISTIC) + else if (command_ast->type == ASTAlterCommand::ADD_STATISTICS) { AlterCommand command; command.ast = command_ast->clone(); - command.statistic_decl = command_ast->statistic_decl->clone(); - command.type = AlterCommand::ADD_STATISTIC; + command.statistics_decl = command_ast->statistics_decl->clone(); + command.type = AlterCommand::ADD_STATISTICS; - const auto & ast_stat_decl = command_ast->statistic_decl->as(); + const auto & ast_stat_decl = command_ast->statistics_decl->as(); - command.statistic_columns = ast_stat_decl.getColumnNames(); - command.statistic_type = ast_stat_decl.type; + command.statistics_columns = ast_stat_decl.getColumnNames(); + command.statistics_types = ast_stat_decl.getTypeNames(); + command.if_not_exists = command_ast->if_not_exists; + + return command; + } + else if (command_ast->type == ASTAlterCommand::MODIFY_STATISTICS) + { + AlterCommand command; + command.ast = command_ast->clone(); + command.statistics_decl = command_ast->statistics_decl->clone(); + command.type = AlterCommand::MODIFY_STATISTICS; + + const auto & ast_stat_decl = command_ast->statistics_decl->as(); + + command.statistics_columns = ast_stat_decl.getColumnNames(); + command.statistics_types = ast_stat_decl.getTypeNames(); command.if_not_exists = command_ast->if_not_exists; return command; @@ -337,17 +352,17 @@ std::optional AlterCommand::parse(const 
ASTAlterCommand * command_ return command; } - else if (command_ast->type == ASTAlterCommand::DROP_STATISTIC) + else if (command_ast->type == ASTAlterCommand::DROP_STATISTICS) { AlterCommand command; command.ast = command_ast->clone(); - command.type = AlterCommand::DROP_STATISTIC; - const auto & ast_stat_decl = command_ast->statistic_decl->as(); + command.statistics_decl = command_ast->statistics_decl->clone(); + command.type = AlterCommand::DROP_STATISTICS; + const auto & ast_stat_decl = command_ast->statistics_decl->as(); - command.statistic_columns = ast_stat_decl.getColumnNames(); - command.statistic_type = ast_stat_decl.type; + command.statistics_columns = ast_stat_decl.getColumnNames(); command.if_exists = command_ast->if_exists; - command.clear = command_ast->clear_statistic; + command.clear = command_ast->clear_statistics; if (command_ast->partition) command.partition = command_ast->partition->clone(); @@ -676,41 +691,56 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) metadata.secondary_indices.erase(erase_it); } } - else if (type == ADD_STATISTIC) + else if (type == ADD_STATISTICS) { - for (const auto & statistic_column_name : statistic_columns) + for (const auto & statistics_column_name : statistics_columns) { - if (!metadata.columns.has(statistic_column_name)) + if (!metadata.columns.has(statistics_column_name)) { - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic {} with type {}: this column is not found", statistic_column_name, statistic_type); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Cannot add statistics for column {}: this column is not found", statistics_column_name); } - if (!if_exists && metadata.columns.get(statistic_column_name).stat) - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic {} with type {}: statistic on this column with this type already exists", statistic_column_name, statistic_type); } - auto stats = StatisticDescription::getStatisticsFromAST(statistic_decl, metadata.columns); - for (auto && stat : stats) + auto stats_vec = ColumnStatisticsDescription::fromAST(statistics_decl, metadata.columns); + for (const auto & stats : stats_vec) { - metadata.columns.modify(stat.column_name, - [&](ColumnDescription & column) { column.stat = std::move(stat); }); + metadata.columns.modify(stats.column_name, + [&](ColumnDescription & column) { column.statistics.merge(stats, column.name, column.type, if_not_exists); }); } } - else if (type == DROP_STATISTIC) + else if (type == DROP_STATISTICS) { - for (const auto & stat_column_name : statistic_columns) + for (const auto & statistics_column_name : statistics_columns) { - if (!metadata.columns.has(stat_column_name) || !metadata.columns.get(stat_column_name).stat) + if (!metadata.columns.has(statistics_column_name) + || metadata.columns.get(statistics_column_name).statistics.empty()) { if (if_exists) return; - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Wrong statistic name. Cannot find statistic {} with type {} to drop", backQuote(stat_column_name), statistic_type); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Wrong statistics name. 
Cannot find statistics {} to drop", backQuote(statistics_column_name)); } - if (!partition && !clear) + + if (!clear && !partition) + metadata.columns.modify(statistics_column_name, + [&](ColumnDescription & column) { column.statistics.clear(); }); + } + } + else if (type == MODIFY_STATISTICS) + { + for (const auto & statistics_column_name : statistics_columns) + { + if (!metadata.columns.has(statistics_column_name)) { - metadata.columns.modify(stat_column_name, - [&](ColumnDescription & column) { column.stat = std::nullopt; }); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Cannot modify statistics for column {}: this column is not found", statistics_column_name); } } + + auto stats_vec = ColumnStatisticsDescription::fromAST(statistics_decl, metadata.columns); + for (const auto & stats : stats_vec) + { + metadata.columns.modify(stats.column_name, + [&](ColumnDescription & column) { column.statistics.assign(stats); }); + } } else if (type == ADD_CONSTRAINT) { @@ -833,8 +863,8 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) rename_visitor.visit(column_to_modify.default_desc.expression); if (column_to_modify.ttl) rename_visitor.visit(column_to_modify.ttl); - if (column_to_modify.name == column_name && column_to_modify.stat) - column_to_modify.stat->column_name = rename_to; + if (column_to_modify.name == column_name && !column_to_modify.statistics.empty()) + column_to_modify.statistics.column_name = rename_to; }); } if (metadata.table_ttl.definition_ast) @@ -958,7 +988,7 @@ bool AlterCommand::isRequireMutationStage(const StorageInMemoryMetadata & metada if (isRemovingProperty() || type == REMOVE_TTL || type == REMOVE_SAMPLE_BY) return false; - if (type == DROP_INDEX || type == DROP_PROJECTION || type == RENAME_COLUMN || type == DROP_STATISTIC) + if (type == DROP_INDEX || type == DROP_PROJECTION || type == RENAME_COLUMN || type == DROP_STATISTICS) return true; /// Drop alias is metadata alter, in other case mutation is required. @@ -1065,10 +1095,10 @@ std::optional AlterCommand::tryConvertToMutationCommand(Storage result.predicate = nullptr; } - else if (type == DROP_STATISTIC) + else if (type == DROP_STATISTICS) { - result.type = MutationCommand::Type::DROP_STATISTIC; - result.statistic_columns = statistic_columns; + result.type = MutationCommand::Type::DROP_STATISTICS; + result.statistics_columns = statistics_columns; if (clear) result.clear = true; diff --git a/src/Storages/AlterCommands.h b/src/Storages/AlterCommands.h index 46abffab8ad..a91bac10214 100644 --- a/src/Storages/AlterCommands.h +++ b/src/Storages/AlterCommands.h @@ -38,8 +38,9 @@ struct AlterCommand DROP_CONSTRAINT, ADD_PROJECTION, DROP_PROJECTION, - ADD_STATISTIC, - DROP_STATISTIC, + ADD_STATISTICS, + DROP_STATISTICS, + MODIFY_STATISTICS, MODIFY_TTL, MODIFY_SETTING, RESET_SETTING, @@ -123,9 +124,9 @@ struct AlterCommand /// For ADD/DROP PROJECTION String projection_name; - ASTPtr statistic_decl = nullptr; - std::vector statistic_columns; - String statistic_type; + ASTPtr statistics_decl = nullptr; + std::vector statistics_columns; + std::vector statistics_types; /// For MODIFY TTL ASTPtr ttl = nullptr; diff --git a/src/Storages/ColumnDependency.h b/src/Storages/ColumnDependency.h index b9088dd0227..dcbda7a4b86 100644 --- a/src/Storages/ColumnDependency.h +++ b/src/Storages/ColumnDependency.h @@ -26,8 +26,8 @@ struct ColumnDependency /// TTL is set for @column_name. 
TTL_TARGET, - /// Exists any statistic, that requires @column_name - STATISTIC, + /// Any statistics exist that require @column_name + STATISTICS, }; ColumnDependency(const String & column_name_, Kind kind_) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index a8869970300..69e39323219 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -72,7 +72,7 @@ ColumnDescription & ColumnDescription::operator=(const ColumnDescription & other codec = other.codec ? other.codec->clone() : nullptr; settings = other.settings; ttl = other.ttl ? other.ttl->clone() : nullptr; - stat = other.stat; + statistics = other.statistics; return *this; } @@ -95,7 +95,7 @@ ColumnDescription & ColumnDescription::operator=(ColumnDescription && other) noe ttl = other.ttl ? other.ttl->clone() : nullptr; other.ttl.reset(); - stat = std::move(other.stat); + statistics = std::move(other.statistics); return *this; } @@ -107,7 +107,7 @@ bool ColumnDescription::operator==(const ColumnDescription & other) const return name == other.name && type->equals(*other.type) && default_desc == other.default_desc - && stat == other.stat + && statistics == other.statistics && ast_to_str(codec) == ast_to_str(other.codec) && settings == other.settings && ast_to_str(ttl) == ast_to_str(other.ttl); @@ -154,10 +154,10 @@ void ColumnDescription::writeText(WriteBuffer & buf) const DB::writeText(")", buf); } - if (stat) + if (!statistics.empty()) { writeChar('\t', buf); - writeEscapedString(queryToString(stat->ast), buf); + writeEscapedString(queryToString(statistics.getAST()), buf); } if (ttl) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 79e43d0a4e4..f0760160f0a 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -89,7 +89,7 @@ struct ColumnDescription ASTPtr codec; SettingsChanges settings; ASTPtr ttl; - std::optional stat; + ColumnStatisticsDescription statistics; ColumnDescription() = default; ColumnDescription(const ColumnDescription & other) { *this = other; } diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 9afafe9f52b..1f7ac23eb82 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -236,7 +236,7 @@ StorageID IStorage::getStorageID() const return storage_id; } -ConditionEstimator IStorage::getConditionEstimatorByPredicate(const StorageSnapshotPtr &, const ActionsDAGPtr &, ContextPtr) const +ConditionSelectivityEstimator IStorage::getConditionSelectivityEstimatorByPredicate(const StorageSnapshotPtr &, const ActionsDAGPtr &, ContextPtr) const { return {}; } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 9d6b3457a24..98afd844046 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -68,7 +68,7 @@ using DatabaseAndTableName = std::pair; class BackupEntriesCollector; class RestorerFromBackup; -class ConditionEstimator; +class ConditionSelectivityEstimator; struct ColumnSize { @@ -135,7 +135,7 @@ public: /// Returns true if the storage supports queries with the PREWHERE section. 
virtual bool supportsPrewhere() const { return false; } - virtual ConditionEstimator getConditionEstimatorByPredicate(const StorageSnapshotPtr &, const ActionsDAGPtr &, ContextPtr) const; + virtual ConditionSelectivityEstimator getConditionSelectivityEstimatorByPredicate(const StorageSnapshotPtr &, const ActionsDAGPtr &, ContextPtr) const; /// Returns which columns supports PREWHERE, or empty std::nullopt if all columns is supported. /// This is needed for engines whose aggregates data from multiple tables, like Merge. diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index c276361559c..4c8f1240cf5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -673,16 +673,16 @@ String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(bool with_subc return *minimum_size_column; } -Statistics IMergeTreeDataPart::loadStatistics() const +ColumnsStatistics IMergeTreeDataPart::loadStatistics() const { const auto & metadata_snaphost = storage.getInMemoryMetadata(); auto total_statistics = MergeTreeStatisticsFactory::instance().getMany(metadata_snaphost.getColumns()); - Statistics result; + ColumnsStatistics result; for (auto & stat : total_statistics) { - String file_name = stat->getFileName() + STAT_FILE_SUFFIX; + String file_name = stat->getFileName() + STATS_FILE_SUFFIX; String file_path = fs::path(getDataPartStorage().getRelativePath()) / file_name; if (!metadata_manager->exists(file_name)) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index bd3814bf415..c9b3ec26df0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -171,7 +171,7 @@ public: void remove(); - Statistics loadStatistics() const; + ColumnsStatistics loadStatistics() const; /// Initialize columns (from columns.txt if exists, or create from column files if not). /// Load various metadata into memory: checksums from checksums.txt, index if required, etc. 
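The storage-layer hunks above and below wire per-column statistics into the renamed ConditionSelectivityEstimator, gated by the renamed settings. A minimal end-to-end sketch, assuming a hypothetical MergeTree table (the `STATISTICS(tdigest(200))` column syntax is the one named in the parser comments above):

```sql
SET allow_experimental_statistics = 1;  -- required to declare statistics (InterpreterCreateQuery check)
SET allow_statistics_optimize = 1;      -- lets the selectivity estimator read them

CREATE TABLE t
(
    a UInt64 STATISTICS(tdigest(200)),
    b String
)
ENGINE = MergeTree
ORDER BY a;

-- PREWHERE/condition estimation can now consult the per-part statistics
-- loaded by IMergeTreeDataPart::loadStatistics().
SELECT count() FROM t WHERE a < 1000;
```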
diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index 891ba1b9660..6152da78395 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -119,7 +119,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, @@ -136,7 +136,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, @@ -156,7 +156,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index f04beb37ebb..d9e9a433827 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -84,7 +84,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index f1f856da3a2..62bc3b42d1f 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -658,7 +658,7 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const /// because all of them were already recalculated and written /// as key part of vertical merge std::vector{}, - std::vector{}, /// TODO: think about it + ColumnsStatistics{}, /// TODO(hanfei) &global_ctx->written_offset_columns, global_ctx->to->getIndexGranularity()); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 32d08d1562e..14a310364dc 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include #include #include @@ -471,10 +471,10 @@ StoragePolicyPtr MergeTreeData::getStoragePolicy() const return storage_policy; } -ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate( +ConditionSelectivityEstimator MergeTreeData::getConditionSelectivityEstimatorByPredicate( const StorageSnapshotPtr & storage_snapshot, const ActionsDAGPtr & filter_dag, ContextPtr local_context) const { - if 
(!local_context->getSettings().allow_statistic_optimize) + if (!local_context->getSettings().allow_statistics_optimize) return {}; const auto & parts = assert_cast(*storage_snapshot->data).parts; @@ -486,23 +486,29 @@ ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate( ASTPtr expression_ast; - ConditionEstimator result; + ConditionSelectivityEstimator result; PartitionPruner partition_pruner(storage_snapshot->metadata, filter_dag, local_context); if (partition_pruner.isUseless()) { /// Read all partitions. for (const auto & part : parts) + try { auto stats = part->loadStatistics(); /// TODO: We only have one stats file for every part. for (const auto & stat : stats) result.merge(part->info.getPartNameV1(), part->rows_count, stat); } + catch (...) + { + tryLogCurrentException(log, fmt::format("while loading statistics on part {}", part->info.getPartNameV1())); + } } else { for (const auto & part : parts) + try { if (!partition_pruner.canBePruned(*part)) { @@ -511,6 +517,10 @@ ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate( result.merge(part->info.getPartNameV1(), part->rows_count, stat); } } + catch (...) + { + tryLogCurrentException(log, fmt::format("while loading statistics on part {}", part->info.getPartNameV1())); + } } return result; @@ -691,8 +701,8 @@ void MergeTreeData::checkProperties( for (const auto & col : new_metadata.columns) { - if (col.stat) - MergeTreeStatisticsFactory::instance().validate(*col.stat, col.type); + if (!col.statistics.empty()) + MergeTreeStatisticsFactory::instance().validate(col.statistics, col.type); } checkKeyExpression(*new_sorting_key.expression, new_sorting_key.sample_block, "Sorting", allow_nullable_key_); @@ -3469,13 +3479,13 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context new_metadata.getColumns().getPhysical(command.column_name)); const auto & old_column = old_metadata.getColumns().get(command.column_name); - if (old_column.stat) + if (!old_column.statistics.empty()) { const auto & new_column = new_metadata.getColumns().get(command.column_name); if (!old_column.type->equals(*new_column.type)) throw Exception(ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, - "ALTER types of column {} with statistic is not not safe " - "because it can change the representation of statistic", + "ALTER types of column {} with statistics is not safe " + "because it can change the representation of statistics", backQuoteIfNeed(command.column_name)); } } @@ -8514,7 +8524,7 @@ std::pair MergeTreeData::createE const auto & index_factory = MergeTreeIndexFactory::instance(); MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), - Statistics{}, + ColumnsStatistics{}, compression_codec, txn ? 
txn->tid : Tx::PrehistoricTID); bool sync_on_insert = settings->fsync_after_insert; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 440daaf6ced..c6f736a4afd 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -426,7 +426,7 @@ public: bool supportsPrewhere() const override { return true; } - ConditionEstimator getConditionEstimatorByPredicate(const StorageSnapshotPtr &, const ActionsDAGPtr &, ContextPtr) const override; + ConditionSelectivityEstimator getConditionSelectivityEstimatorByPredicate(const StorageSnapshotPtr &, const ActionsDAGPtr &, ContextPtr) const override; bool supportsFinal() const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 4a160e5e229..d628fd6b529 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -59,7 +59,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 149f86cef00..379c465a409 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -64,7 +64,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index fb0f0ba9154..21d046c76f2 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -20,7 +20,7 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const StorageMetadataPtr & metadata_snapshot_, const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc_, - const Statistics & stats_to_recalc, + const ColumnsStatistics & stats_to_recalc, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index a5527b74e69..b440a37222d 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -21,7 +21,7 @@ public: const StorageMetadataPtr & metadata_snapshot_, const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc, + const ColumnsStatistics & stats_to_recalc, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, diff --git 
a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index bcf51bfcd3d..a576720294f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -150,7 +150,7 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const StorageMetadataPtr & metadata_snapshot_, const VirtualsDescriptionPtr & virtual_columns_, const MergeTreeIndices & indices_to_recalc_, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, @@ -265,7 +265,7 @@ void MergeTreeDataPartWriterOnDisk::initStatistics() stats_streams.emplace_back(std::make_unique>( stats_name, data_part_storage, - stats_name, STAT_FILE_SUFFIX, + stats_name, STATS_FILE_SUFFIX, default_codec, settings.max_compress_block_size, settings.query_write_settings)); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 0c31cabc8c4..bdf0fdb7f32 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -111,7 +111,7 @@ public: const StorageMetadataPtr & metadata_snapshot_, const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, @@ -155,7 +155,7 @@ protected: const MergeTreeIndices skip_indices; - const Statistics stats; + const ColumnsStatistics stats; std::vector stats_streams; const String marks_file_extension; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index afa14d8a98a..5ba326cef0c 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -86,7 +86,7 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const StorageMetadataPtr & metadata_snapshot_, const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc_, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index 9d18ac76880..ab86ed27c7e 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -31,7 +31,7 @@ public: const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 11058c542a6..2e287ff3042 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -760,9 +760,16 @@ RangesInDataParts 
MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd CurrentMetrics::MergeTreeDataSelectExecutorThreadsScheduled, num_threads); + + /// Instances of ThreadPool "borrow" threads from the global thread pool. + /// We intentionally use scheduleOrThrow here to avoid a deadlock. + /// For example, queries can already be running with threads from the + /// global pool, and if we saturate max_thread_pool_size whilst requesting + /// more in this loop, queries will block infinitely. + /// So we wait until lock_acquire_timeout, and then raise an exception. for (size_t part_index = 0; part_index < parts.size(); ++part_index) { - pool.scheduleOrThrowOnError([&, part_index, thread_group = CurrentThread::getGroup()] + pool.scheduleOrThrow([&, part_index, thread_group = CurrentThread::getGroup()] { setThreadName("MergeTreeIndex"); @@ -774,7 +781,7 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd CurrentThread::attachToGroupIfDetached(thread_group); process_part(part_index); - }); + }, Priority{}, context->getSettingsRef().lock_acquire_timeout.totalMicroseconds()); } pool.wait(); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 04182062b12..8e304936747 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -469,7 +469,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( if (context->getSettingsRef().materialize_skip_indexes_on_insert) indices = MergeTreeIndexFactory::instance().getMany(metadata_snapshot->getSecondaryIndices()); - Statistics statistics; + ColumnsStatistics statistics; if (context->getSettingsRef().materialize_statistics_on_insert) statistics = MergeTreeStatisticsFactory::instance().getMany(metadata_snapshot->getColumns()); @@ -754,7 +754,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( metadata_snapshot, columns, MergeTreeIndices{}, - Statistics{}, /// TODO(hanfei): It should be helpful to write statistics for projection result. + /// TODO(hanfei): It should be helpful to write statistics for projection result. 
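The comment above captures a classic thread-pool deadlock: tasks that block waiting for pool capacity can starve the very workers that would free it, so the wait is bounded by `lock_acquire_timeout` and converted into an exception. A minimal sketch of that bounded-wait pattern, built on plain standard-library primitives rather than ClickHouse's actual ThreadPool API (all names below are illustrative):

```cpp
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <thread>
#include <vector>

/// Illustrative only: a fixed-size worker pool whose submit() waits at most
/// `timeout` for a free queue slot and then throws, mirroring the
/// scheduleOrThrow-with-timeout pattern in the hunk above.
class BoundedPool
{
public:
    BoundedPool(size_t workers, size_t max_queued) : max_queued_(max_queued)
    {
        for (size_t i = 0; i < workers; ++i)
            threads_.emplace_back([this] { workerLoop(); });
    }

    void submit(std::function<void()> task, std::chrono::milliseconds timeout)
    {
        std::unique_lock lock(mutex_);
        /// Wait for capacity, but only up to `timeout`: blocking forever here
        /// could deadlock if the submitters themselves hold all the workers.
        if (!has_capacity_.wait_for(lock, timeout, [this] { return queue_.size() < max_queued_; }))
            throw std::runtime_error("cannot schedule a task: timeout exceeded");
        queue_.push(std::move(task));
        has_task_.notify_one();
    }

    ~BoundedPool()
    {
        {
            std::lock_guard lock(mutex_);
            done_ = true;
        }
        has_task_.notify_all();
        for (auto & thread : threads_)
            thread.join();
    }

private:
    void workerLoop()
    {
        while (true)
        {
            std::function<void()> task;
            {
                std::unique_lock lock(mutex_);
                has_task_.wait(lock, [this] { return done_ || !queue_.empty(); });
                if (done_ && queue_.empty())
                    return;
                task = std::move(queue_.front());
                queue_.pop();
                has_capacity_.notify_one();  /// a slot was freed
            }
            task();  /// run outside the lock
        }
    }

    std::mutex mutex_;
    std::condition_variable has_task_;
    std::condition_variable has_capacity_;
    std::queue<std::function<void()>> queue_;
    const size_t max_queued_;
    bool done_ = false;
    std::vector<std::thread> threads_;
};
```

The key choice mirrors the diff: `wait_for` bounds the blocking, just as `scheduleOrThrow` with a timeout does above, so a saturated pool surfaces as an error instead of an indefinite hang.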
+ ColumnsStatistics{}, compression_codec, Tx::PrehistoricTID, false, false, data.getContext()->getWriteSettings()); diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp index 6f46ee0c184..8cf58687125 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp @@ -566,7 +566,7 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals( out.function = RPNElement::FUNCTION_EQUALS; out.bloom_filter = std::make_unique(params); const auto & value = const_value.get(); - token_extractor->stringToBloomFilter(value.data(), value.size(), *out.bloom_filter); + token_extractor->substringToBloomFilter(value.data(), value.size(), *out.bloom_filter, true, false); return true; } else if (function_name == "endsWith") @@ -575,7 +575,7 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals( out.function = RPNElement::FUNCTION_EQUALS; out.bloom_filter = std::make_unique(params); const auto & value = const_value.get(); - token_extractor->stringToBloomFilter(value.data(), value.size(), *out.bloom_filter); + token_extractor->substringToBloomFilter(value.data(), value.size(), *out.bloom_filter, false, true); return true; } else if (function_name == "multiSearchAny" @@ -596,7 +596,15 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals( bloom_filters.back().emplace_back(params); const auto & value = element.get(); - token_extractor->stringToBloomFilter(value.data(), value.size(), bloom_filters.back().back()); + + if (function_name == "multiSearchAny") + { + token_extractor->substringToBloomFilter(value.data(), value.size(), bloom_filters.back().back(), false, false); + } + else + { + token_extractor->stringToBloomFilter(value.data(), value.size(), bloom_filters.back().back()); + } } out.set_bloom_filters = std::move(bloom_filters); return true; @@ -625,12 +633,12 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals( for (const auto & alternative : alternatives) { bloom_filters.back().emplace_back(params); - token_extractor->stringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back()); + token_extractor->substringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back(), false, false); } out.set_bloom_filters = std::move(bloom_filters); } else - token_extractor->stringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter); + token_extractor->substringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter, false, false); return true; } diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index af9ee710f88..47ce24b91eb 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -595,7 +595,7 @@ bool MergeTreeConditionFullText::traverseASTEquals( out.function = RPNElement::FUNCTION_EQUALS; out.gin_filter = std::make_unique(params); const auto & value = const_value.get(); - token_extractor->stringToGinFilter(value.data(), value.size(), *out.gin_filter); + token_extractor->substringToGinFilter(value.data(), value.size(), *out.gin_filter, true, false); return true; } else if (function_name == "endsWith") @@ -604,7 +604,7 @@ bool MergeTreeConditionFullText::traverseASTEquals( out.function = RPNElement::FUNCTION_EQUALS; out.gin_filter = std::make_unique(params); const auto & value = const_value.get(); - 
token_extractor->stringToGinFilter(value.data(), value.size(), *out.gin_filter); + token_extractor->substringToGinFilter(value.data(), value.size(), *out.gin_filter, false, true); return true; } else if (function_name == "multiSearchAny") @@ -622,7 +622,7 @@ bool MergeTreeConditionFullText::traverseASTEquals( gin_filters.back().emplace_back(params); const auto & value = element.get(); - token_extractor->stringToGinFilter(value.data(), value.size(), gin_filters.back().back()); + token_extractor->substringToGinFilter(value.data(), value.size(), gin_filters.back().back(), false, false); } out.set_gin_filters = std::move(gin_filters); return true; @@ -650,14 +650,14 @@ bool MergeTreeConditionFullText::traverseASTEquals( for (const auto & alternative : alternatives) { gin_filters.back().emplace_back(params); - token_extractor->stringToGinFilter(alternative.data(), alternative.size(), gin_filters.back().back()); + token_extractor->substringToGinFilter(alternative.data(), alternative.size(), gin_filters.back().back(), false, false); } out.set_gin_filters = std::move(gin_filters); } else { out.gin_filter = std::make_unique(params); - token_extractor->stringToGinFilter(required_substring.data(), required_substring.size(), *out.gin_filter); + token_extractor->substringToGinFilter(required_substring.data(), required_substring.size(), *out.gin_filter, false, false); } return true; @@ -742,6 +742,7 @@ bool MergeTreeConditionFullText::tryPrepareSetGinFilter( MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const { + /// ------ /// Index type 'inverted' was renamed to 'full_text' in May 2024. /// Tables with old indexes can be loaded during a transition period. We still want to let users know that they should drop existing /// indexes and re-create them. Function `createIndexGranule` is called whenever the index is used by queries. Reject the query if we @@ -749,6 +750,7 @@ MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const /// TODO: remove this at the end of 2024. if (index.type == INVERTED_INDEX_NAME) throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'inverted' are no longer supported. 
Please drop and recreate the index as type 'full_text'"); + /// ------ return std::make_shared(index.name, index.column_names.size(), params); } diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 3844ac18268..a9a5fddace4 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -53,7 +53,7 @@ static Int64 findMinPosition(const NameSet & condition_table_columns, const Name MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( std::unordered_map column_sizes_, const StorageMetadataPtr & metadata_snapshot, - const ConditionEstimator & estimator_, + const ConditionSelectivityEstimator & estimator_, const Names & queried_columns_, const std::optional & supported_columns_, LoggerPtr log_) @@ -92,7 +92,7 @@ void MergeTreeWhereOptimizer::optimize(SelectQueryInfo & select_query_info, cons where_optimizer_context.move_all_conditions_to_prewhere = context->getSettingsRef().move_all_conditions_to_prewhere; where_optimizer_context.move_primary_key_columns_to_end_of_prewhere = context->getSettingsRef().move_primary_key_columns_to_end_of_prewhere; where_optimizer_context.is_final = select.final(); - where_optimizer_context.use_statistic = context->getSettingsRef().allow_statistic_optimize; + where_optimizer_context.use_statistics = context->getSettingsRef().allow_statistics_optimize; RPNBuilderTreeContext tree_context(context, std::move(block_with_constants), {} /*prepared_sets*/); RPNBuilderTreeNode node(select.where().get(), tree_context); @@ -123,7 +123,7 @@ MergeTreeWhereOptimizer::FilterActionsOptimizeResult MergeTreeWhereOptimizer::op where_optimizer_context.move_all_conditions_to_prewhere = context->getSettingsRef().move_all_conditions_to_prewhere; where_optimizer_context.move_primary_key_columns_to_end_of_prewhere = context->getSettingsRef().move_primary_key_columns_to_end_of_prewhere; where_optimizer_context.is_final = is_final; - where_optimizer_context.use_statistic = context->getSettingsRef().allow_statistic_optimize; + where_optimizer_context.use_statistics = context->getSettingsRef().allow_statistics_optimize; RPNBuilderTreeContext tree_context(context); RPNBuilderTreeNode node(&filter_dag->findInOutputs(filter_column_name), tree_context); @@ -273,15 +273,17 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const RPNBuilderTree /// Do not move conditions involving all queried columns. 
&& cond.table_columns.size() < queried_columns.size(); - if (where_optimizer_context.use_statistic) + if (cond.viable) + cond.good = isConditionGood(node, table_columns); + + if (where_optimizer_context.use_statistics) { cond.good = cond.viable; - cond.selectivity = estimator.estimateSelectivity(node); - LOG_TEST(log, "Condition {} has selectivity {}", node.getColumnName(), cond.selectivity); - } - else if (cond.viable) - { - cond.good = isConditionGood(node, table_columns); + + cond.estimated_row_count = estimator.estimateRowCount(node); + + if (node.getASTNode() != nullptr) + LOG_DEBUG(log, "Condition {} has estimated row count {}", node.getASTNode()->dumpTree(), cond.estimated_row_count); } if (where_optimizer_context.move_primary_key_columns_to_end_of_prewhere) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 6c5ff29bc76..ba6b4660924 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include @@ -38,7 +38,7 @@ public: MergeTreeWhereOptimizer( std::unordered_map column_sizes_, const StorageMetadataPtr & metadata_snapshot, - const ConditionEstimator & estimator_, + const ConditionSelectivityEstimator & estimator_, const Names & queried_columns_, const std::optional & supported_columns_, LoggerPtr log_); @@ -76,7 +76,7 @@ private: bool good = false; /// the lower the better - Float64 selectivity = 1.0; + Float64 estimated_row_count = 0; /// Does the condition contain primary key column? /// If so, it is better to move it further to the end of PREWHERE chain depending on minimal position in PK of any @@ -85,7 +85,7 @@ private: auto tuple() const { - return std::make_tuple(!viable, !good, -min_position_in_primary_key, selectivity, columns_size, table_columns.size()); + return std::make_tuple(!viable, !good, -min_position_in_primary_key, estimated_row_count, columns_size, table_columns.size()); } /// Is condition a better candidate for moving to PREWHERE? 
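The `tuple()` method above ranks PREWHERE candidates lexicographically, so each field only breaks ties left by the fields before it, and swapping `selectivity` for `estimated_row_count` preserves the ordering because both sort ascending ("the lower the better"). A simplified sketch of the ranking idea; the field names follow the diff, the struct itself is hypothetical and the final table-column-count tie-breaker is omitted:

```cpp
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

/// Candidates for moving to PREWHERE, ranked by a lexicographic tuple:
/// viable ones first, then "good" ones, then the remaining fields as
/// successive tie-breakers.
struct Candidate
{
    bool viable = false;
    bool good = false;
    int64_t min_position_in_primary_key = 0;
    double estimated_row_count = 0;  /// the lower the better
    uint64_t columns_size = 0;

    auto rank() const
    {
        /// Negating the bools puts `true` first; the primary-key position is
        /// negated as in the diff so that it sorts in the intended direction.
        return std::make_tuple(!viable, !good, -min_position_in_primary_key, estimated_row_count, columns_size);
    }
};

/// Returns the best PREWHERE candidate (assumes `candidates` is non-empty).
inline const Candidate & best(const std::vector<Candidate> & candidates)
{
    return *std::min_element(candidates.begin(), candidates.end(),
        [](const Candidate & a, const Candidate & b) { return a.rank() < b.rank(); });
}
```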
@@ -104,7 +104,7 @@ private: bool move_all_conditions_to_prewhere = false; bool move_primary_key_columns_to_end_of_prewhere = false; bool is_final = false; - bool use_statistic = false; + bool use_statistics = false; }; struct OptimizeResult @@ -147,7 +147,7 @@ private: static NameSet determineArrayJoinedNames(const ASTSelectQuery & select); - const ConditionEstimator estimator; + const ConditionSelectivityEstimator estimator; const NameSet table_columns; const Names queried_columns; diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index c5799fab09f..164658c914e 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -20,7 +20,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, - const Statistics & statistics, + const ColumnsStatistics & statistics, CompressionCodecPtr default_codec_, TransactionID tid, bool reset_columns_, diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index c1e3d75fefc..e212fe5bb5a 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -20,7 +20,7 @@ public: const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, - const Statistics & statistics, + const ColumnsStatistics & statistics, CompressionCodecPtr default_codec_, TransactionID tid, bool reset_columns_ = false, diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 674a9bd498f..54d177943d0 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -16,7 +16,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( const Block & header_, CompressionCodecPtr default_codec, const MergeTreeIndices & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, WrittenOffsetColumns * offset_columns_, const MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularityInfo * index_granularity_info) diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h index ad3cabe459e..16a54ff33b6 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h @@ -20,7 +20,7 @@ public: const Block & header_, CompressionCodecPtr default_codec_, const MergeTreeIndices & indices_to_recalc_, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, WrittenOffsetColumns * offset_columns_ = nullptr, const MergeTreeIndexGranularity & index_granularity = {}, const MergeTreeIndexGranularityInfo * index_granularity_info_ = nullptr); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 1828b8a7eeb..1318563e469 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -130,7 +130,7 @@ static void splitAndModifyMutationCommands( } } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC + || command.type == MutationCommand::Type::MATERIALIZE_STATISTICS || command.type 
== MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -143,7 +143,7 @@ static void splitAndModifyMutationCommands( } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION - || command.type == MutationCommand::Type::DROP_STATISTIC) + || command.type == MutationCommand::Type::DROP_STATISTICS) { for_file_renames.push_back(command); } @@ -258,7 +258,7 @@ static void splitAndModifyMutationCommands( for_interpreter.push_back(command); } else if (command.type == MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC + || command.type == MutationCommand::Type::MATERIALIZE_STATISTICS || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -269,7 +269,7 @@ static void splitAndModifyMutationCommands( } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION - || command.type == MutationCommand::Type::DROP_STATISTIC) + || command.type == MutationCommand::Type::DROP_STATISTICS) { for_file_renames.push_back(command); } @@ -532,16 +532,16 @@ static ExecuteTTLType shouldExecuteTTL(const StorageMetadataPtr & metadata_snaps return has_ttl_expression ? ExecuteTTLType::RECALCULATE : ExecuteTTLType::NONE; } -static std::set getStatisticsToRecalculate(const StorageMetadataPtr & metadata_snapshot, const NameSet & materialized_stats) +static std::set getStatisticsToRecalculate(const StorageMetadataPtr & metadata_snapshot, const NameSet & materialized_stats) { const auto & stats_factory = MergeTreeStatisticsFactory::instance(); - std::set stats_to_recalc; + std::set stats_to_recalc; const auto & columns = metadata_snapshot->getColumns(); for (const auto & col_desc : columns) { - if (col_desc.stat && materialized_stats.contains(col_desc.name)) + if (!col_desc.statistics.empty() && materialized_stats.contains(col_desc.name)) { - stats_to_recalc.insert(stats_factory.get(*col_desc.stat)); + stats_to_recalc.insert(stats_factory.get(col_desc.statistics)); } } return stats_to_recalc; @@ -655,7 +655,7 @@ static NameSet collectFilesToSkip( const std::set & indices_to_recalc, const String & mrk_extension, const std::set & projections_to_recalc, - const std::set & stats_to_recalc) + const std::set & stats_to_recalc) { NameSet files_to_skip = source_part->getFileNamesWithoutChecksums(); @@ -683,7 +683,7 @@ static NameSet collectFilesToSkip( files_to_skip.insert(projection->getDirectoryName()); for (const auto & stat : stats_to_recalc) - files_to_skip.insert(stat->getFileName() + STAT_FILE_SUFFIX); + files_to_skip.insert(stat->getFileName() + STATS_FILE_SUFFIX); if (isWidePart(source_part)) { @@ -772,11 +772,11 @@ static NameToNameVector collectFilesForRenames( if (source_part->checksums.has(command.column_name + ".proj")) add_rename(command.column_name + ".proj", ""); } - else if (command.type == MutationCommand::Type::DROP_STATISTIC) + else if (command.type == MutationCommand::Type::DROP_STATISTICS) { - for (const auto & statistic_column_name : command.statistic_columns) - if (source_part->checksums.has(STAT_FILE_PREFIX + statistic_column_name + STAT_FILE_SUFFIX)) - add_rename(STAT_FILE_PREFIX + statistic_column_name + STAT_FILE_SUFFIX, ""); + for (const auto & statistics_column_name : command.statistics_columns) + if 
(source_part->checksums.has(STATS_FILE_PREFIX + statistics_column_name + STATS_FILE_SUFFIX)) + add_rename(STATS_FILE_PREFIX + statistics_column_name + STATS_FILE_SUFFIX, ""); } else if (isWidePart(source_part)) { @@ -797,9 +797,9 @@ static NameToNameVector collectFilesForRenames( if (auto serialization = source_part->tryGetSerialization(command.column_name)) serialization->enumerateStreams(callback); - /// if we drop a column with statistic, we should also drop the stat file. - if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) - add_rename(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX, ""); + /// if we drop a column with statistics, we should also drop the stat file. + if (source_part->checksums.has(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX)) + add_rename(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX, ""); } else if (command.type == MutationCommand::Type::RENAME_COLUMN) { @@ -833,9 +833,9 @@ static NameToNameVector collectFilesForRenames( if (auto serialization = source_part->tryGetSerialization(command.column_name)) serialization->enumerateStreams(callback); - /// if we rename a column with statistic, we should also rename the stat file. - if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) - add_rename(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX, STAT_FILE_PREFIX + command.rename_to + STAT_FILE_SUFFIX); + /// if we rename a column with statistics, we should also rename the stat file. + if (source_part->checksums.has(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX)) + add_rename(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX, STATS_FILE_PREFIX + command.rename_to + STATS_FILE_SUFFIX); } else if (command.type == MutationCommand::Type::READ_COLUMN) { @@ -1022,7 +1022,7 @@ struct MutationContext IMergeTreeDataPart::MinMaxIndexPtr minmax_idx; std::set indices_to_recalc; - std::set stats_to_recalc; + std::set stats_to_recalc; std::set projections_to_recalc; MergeTreeData::DataPart::Checksums existing_indices_stats_checksums; NameSet files_to_skip; @@ -1473,12 +1473,12 @@ private: { if (command.type == MutationCommand::DROP_INDEX) removed_indices.insert(command.column_name); - else if (command.type == MutationCommand::DROP_STATISTIC) - for (const auto & column_name : command.statistic_columns) + else if (command.type == MutationCommand::DROP_STATISTICS) + for (const auto & column_name : command.statistics_columns) removed_stats.insert(column_name); else if (command.type == MutationCommand::RENAME_COLUMN - && ctx->source_part->checksums.files.contains(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) - renamed_stats[STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX] = STAT_FILE_PREFIX + command.rename_to + STAT_FILE_SUFFIX; + && ctx->source_part->checksums.files.contains(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX)) + renamed_stats[STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX] = STATS_FILE_PREFIX + command.rename_to + STATS_FILE_SUFFIX; } bool is_full_part_storage = isFullPartStorage(ctx->new_data_part->getDataPartStorage()); @@ -1514,23 +1514,23 @@ private: } } - Statistics stats_to_rewrite; + ColumnsStatistics stats_to_rewrite; const auto & columns = ctx->metadata_snapshot->getColumns(); for (const auto & col : columns) { - if (!col.stat || removed_stats.contains(col.name)) + if (col.statistics.empty() || removed_stats.contains(col.name)) continue; if 
(ctx->materialized_statistics.contains(col.name)) { - stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(*col.stat)); + stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(col.statistics)); } else { /// We do not hard-link statistics which - /// 1. In `DROP STATISTIC` statement. It is filtered by `removed_stats` + /// 1. In `DROP STATISTICS` statement. It is filtered by `removed_stats` /// 2. Not in column list anymore, including `DROP COLUMN`. It is not touched by this loop. - String stat_file_name = STAT_FILE_PREFIX + col.name + STAT_FILE_SUFFIX; + String stat_file_name = STATS_FILE_PREFIX + col.name + STATS_FILE_SUFFIX; auto it = ctx->source_part->checksums.files.find(stat_file_name); if (it != ctx->source_part->checksums.files.end()) { @@ -1904,7 +1904,7 @@ private: ctx->updated_header, ctx->compression_codec, std::vector(ctx->indices_to_recalc.begin(), ctx->indices_to_recalc.end()), - Statistics(ctx->stats_to_recalc.begin(), ctx->stats_to_recalc.end()), + ColumnsStatistics(ctx->stats_to_recalc.begin(), ctx->stats_to_recalc.end()), nullptr, ctx->source_part->index_granularity, &ctx->source_part->index_granularity_info diff --git a/src/Storages/MutationCommands.cpp b/src/Storages/MutationCommands.cpp index aaf5c1b5d87..f736c863eee 100644 --- a/src/Storages/MutationCommands.cpp +++ b/src/Storages/MutationCommands.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -83,15 +83,15 @@ std::optional MutationCommand::parse(ASTAlterCommand * command, res.index_name = command->index->as().name(); return res; } - else if (command->type == ASTAlterCommand::MATERIALIZE_STATISTIC) + else if (command->type == ASTAlterCommand::MATERIALIZE_STATISTICS) { MutationCommand res; res.ast = command->ptr(); - res.type = MATERIALIZE_STATISTIC; + res.type = MATERIALIZE_STATISTICS; if (command->partition) res.partition = command->partition->clone(); res.predicate = nullptr; - res.statistic_columns = command->statistic_decl->as().getColumnNames(); + res.statistics_columns = command->statistics_decl->as().getColumnNames(); return res; } else if (command->type == ASTAlterCommand::MATERIALIZE_PROJECTION) @@ -150,16 +150,16 @@ std::optional MutationCommand::parse(ASTAlterCommand * command, res.clear = true; return res; } - else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_STATISTIC) + else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_STATISTICS) { MutationCommand res; res.ast = command->ptr(); - res.type = MutationCommand::Type::DROP_STATISTIC; + res.type = MutationCommand::Type::DROP_STATISTICS; if (command->partition) res.partition = command->partition->clone(); if (command->clear_index) res.clear = true; - res.statistic_columns = command->statistic_decl->as().getColumnNames(); + res.statistics_columns = command->statistics_decl->as().getColumnNames(); return res; } else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_PROJECTION) diff --git a/src/Storages/MutationCommands.h b/src/Storages/MutationCommands.h index 6e10f7d9b2d..f999aab1f4d 100644 --- a/src/Storages/MutationCommands.h +++ b/src/Storages/MutationCommands.h @@ -30,12 +30,12 @@ struct MutationCommand UPDATE, MATERIALIZE_INDEX, MATERIALIZE_PROJECTION, - MATERIALIZE_STATISTIC, + MATERIALIZE_STATISTICS, READ_COLUMN, /// Read column and apply conversions (MODIFY COLUMN alter query). 
DROP_COLUMN, DROP_INDEX, DROP_PROJECTION, - DROP_STATISTIC, + DROP_STATISTICS, MATERIALIZE_TTL, RENAME_COLUMN, MATERIALIZE_COLUMN, @@ -51,10 +51,11 @@ struct MutationCommand /// Columns with corresponding actions std::unordered_map column_to_update_expression = {}; - /// For MATERIALIZE INDEX and PROJECTION and STATISTIC + /// For MATERIALIZE INDEX and PROJECTION and STATISTICS String index_name = {}; String projection_name = {}; - std::vector statistic_columns = {}; + std::vector statistics_columns = {}; + std::vector statistics_types = {}; /// For MATERIALIZE INDEX, UPDATE and DELETE. ASTPtr partition = {}; diff --git a/src/Storages/S3Queue/S3QueueMetadata.cpp b/src/Storages/S3Queue/S3QueueMetadata.cpp index f4c8c5c5ef2..9c77bb2d24c 100644 --- a/src/Storages/S3Queue/S3QueueMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueMetadata.cpp @@ -167,7 +167,7 @@ S3QueueMetadata::FileMetadataPtr S3QueueMetadata::getFileMetadata( S3QueueOrderedFileMetadata::BucketInfoPtr bucket_info) { auto file_status = local_file_statuses->get(path, /* create */true); - switch (settings.mode) + switch (settings.mode.value) { case S3QueueMode::ORDERED: return std::make_shared( diff --git a/src/Storages/S3Queue/S3QueueMetadata.h b/src/Storages/S3Queue/S3QueueMetadata.h index ef4a9808c68..25d01fb52b9 100644 --- a/src/Storages/S3Queue/S3QueueMetadata.h +++ b/src/Storages/S3Queue/S3QueueMetadata.h @@ -82,7 +82,6 @@ private: const fs::path zookeeper_path; const size_t buckets_num; - bool initialized = false; LoggerPtr log; std::atomic_bool shutdown_called = false; diff --git a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp index d1298b8c4fa..bac87c95cc9 100644 --- a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp @@ -371,7 +371,6 @@ void S3QueueOrderedFileMetadata::setProcessedImpl() }; const auto zk_client = getZooKeeper(); - const auto node_metadata_str = node_metadata.toString(); std::string failure_reason; while (true) diff --git a/src/Storages/S3Queue/S3QueueSettings.h b/src/Storages/S3Queue/S3QueueSettings.h index c486a7fbb5d..4a92d99c411 100644 --- a/src/Storages/S3Queue/S3QueueSettings.h +++ b/src/Storages/S3Queue/S3QueueSettings.h @@ -13,7 +13,7 @@ class ASTStorage; #define S3QUEUE_RELATED_SETTINGS(M, ALIAS) \ M(S3QueueMode, \ mode, \ - S3QueueMode::UNORDERED, \ + S3QueueMode::ORDERED, \ "With unordered mode, the set of all already processed files is tracked with persistent nodes in ZooKepeer." \ "With ordered mode, only the max name of the successfully consumed file stored.", \ 0) \ diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 0844d0a479e..afb75a21b21 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -71,8 +71,14 @@ namespace return zkutil::extractZooKeeperPath(result_zk_path, true); } - void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings) + void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings, bool is_attach) { + if (!is_attach && !s3queue_settings.mode.changed) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `mode` (Unordered/Ordered) is not specified, but is required."); + } + /// In case of ATTACH (is_attach == true), we leave Ordered mode as the default for compatibility. 
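The rule encoded above: CREATE must state `mode` explicitly, while ATTACH tolerates its absence and falls back to the (now Ordered) declared default so that existing tables keep loading. A compact sketch of the same decision; `QueueMode` and `effectiveMode` are hypothetical names, not part of the patch:

```cpp
#include <optional>
#include <stdexcept>

enum class QueueMode { Ordered, Unordered };

/// `declared` is empty when the user did not set `mode` in the query.
QueueMode effectiveMode(std::optional<QueueMode> declared, bool is_attach)
{
    if (!declared)
    {
        if (!is_attach)
            throw std::invalid_argument(
                "Setting `mode` (Unordered/Ordered) is not specified, but is required.");
        return QueueMode::Ordered;  /// compatibility default on ATTACH
    }
    return *declared;
}
```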
+ if (!s3queue_settings.s3queue_processing_threads_num) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `s3queue_processing_threads_num` cannot be set to zero"); @@ -125,15 +131,7 @@ StorageS3Queue::StorageS3Queue( throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue url must either end with '/' or contain globs"); } - if (mode == LoadingStrictnessLevel::CREATE - && !context_->getSettingsRef().s3queue_allow_experimental_sharded_mode - && s3queue_settings->mode == S3QueueMode::ORDERED - && (s3queue_settings->s3queue_buckets > 1 || s3queue_settings->s3queue_processing_threads_num > 1)) - { - throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue sharded mode is not allowed. To enable use `s3queue_allow_experimental_sharded_mode`"); - } - - checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef()); + checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), mode > LoadingStrictnessLevel::CREATE); object_storage = configuration->createObjectStorage(context_, /* is_readonly */true); FormatFactory::instance().checkFormatName(configuration->format); diff --git a/src/Storages/Statistics/ConditionSelectivityEstimator.cpp b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp new file mode 100644 index 00000000000..757136fdf42 --- /dev/null +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp @@ -0,0 +1,201 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +void ConditionSelectivityEstimator::ColumnSelectivityEstimator::merge(String part_name, ColumnStatisticsPtr stats) +{ + if (part_statistics.contains(part_name)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "part {} has been added in column {}", part_name, stats->columnName()); + part_statistics[part_name] = stats; +} + +Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(Float64 val, Float64 rows) const +{ + if (part_statistics.empty()) + return default_normal_cond_factor * rows; + Float64 result = 0; + Float64 part_rows = 0; + for (const auto & [key, estimator] : part_statistics) + { + result += estimator->estimateLess(val); + part_rows += estimator->rowCount(); + } + return result * rows / part_rows; +} + +Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreater(Float64 val, Float64 rows) const +{ + return rows - estimateLess(val, rows); +} + +Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(Float64 val, Float64 rows) const +{ + if (part_statistics.empty()) + { + if (val < - threshold || val > threshold) + return default_normal_cond_factor * rows; + else + return default_good_cond_factor * rows; + } + Float64 result = 0; + Float64 partial_cnt = 0; + for (const auto & [key, estimator] : part_statistics) + { + result += estimator->estimateEqual(val); + partial_cnt += estimator->rowCount(); + } + return result * rows / partial_cnt; +} + +/// second return value represents how many columns in the node. 
+static std::pair tryToExtractSingleColumn(const RPNBuilderTreeNode & node) +{ + if (node.isConstant()) + { + return {}; + } + + if (!node.isFunction()) + { + auto column_name = node.getColumnName(); + return {column_name, 1}; + } + + auto function_node = node.toFunctionNode(); + size_t arguments_size = function_node.getArgumentsSize(); + std::pair result; + for (size_t i = 0; i < arguments_size; ++i) + { + auto function_argument = function_node.getArgumentAt(i); + auto subresult = tryToExtractSingleColumn(function_argument); + if (subresult.second == 0) /// the subnode contains 0 column + continue; + else if (subresult.second > 1) /// the subnode contains more than 1 column + return subresult; + else if (result.second == 0 || result.first == subresult.first) /// subnodes contain same column. + result = subresult; + else + return {"", 2}; + } + return result; +} + +std::pair ConditionSelectivityEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const +{ + if (!node.isFunction()) + return {}; + + auto function_node = node.toFunctionNode(); + if (function_node.getArgumentsSize() != 2) + return {}; + + String function_name = function_node.getFunctionName(); + + auto lhs_argument = function_node.getArgumentAt(0); + auto rhs_argument = function_node.getArgumentAt(1); + + auto lhs_argument_column_name = lhs_argument.getColumnName(); + auto rhs_argument_column_name = rhs_argument.getColumnName(); + + bool lhs_argument_is_column = column_name == (lhs_argument_column_name); + bool rhs_argument_is_column = column_name == (rhs_argument_column_name); + + bool lhs_argument_is_constant = lhs_argument.isConstant(); + bool rhs_argument_is_constant = rhs_argument.isConstant(); + + RPNBuilderTreeNode * constant_node = nullptr; + + if (lhs_argument_is_column && rhs_argument_is_constant) + constant_node = &rhs_argument; + else if (lhs_argument_is_constant && rhs_argument_is_column) + constant_node = &lhs_argument; + else + return {}; + + Field output_value; + DataTypePtr output_type; + if (!constant_node->tryGetConstant(output_value, output_type)) + return {}; + + const auto type = output_value.getType(); + Float64 value; + if (type == Field::Types::Int64) + value = output_value.get(); + else if (type == Field::Types::UInt64) + value = output_value.get(); + else if (type == Field::Types::Float64) + value = output_value.get(); + else + return {}; + return std::make_pair(function_name, value); +} + +Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode & node) const +{ + auto result = tryToExtractSingleColumn(node); + if (result.second != 1) + { + return default_unknown_cond_factor; + } + String col = result.first; + auto it = column_estimators.find(col); + + /// If the estimator for the column is not found, or there is no data at all, + /// we use a dummy estimation. 
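As that comment says, when statistics are missing the estimator falls back to fixed "dummy" factors, and when only some parts carry statistics the per-part sums are scaled up to the full row count (see `ColumnSelectivityEstimator::estimateLess` above). A self-contained sketch of both behaviours; `PartEstimate` merely stands in for the real per-part statistics objects, while the constants match the diff (0.5 for a normal condition, 0.1 for a good one, threshold 2):

```cpp
#include <cmath>
#include <map>
#include <string>

/// A stand-in for per-part column statistics: the real objects answer these
/// queries from a t-digest / uniq sketch, here they just return stored numbers.
struct PartEstimate
{
    double less_than_val = 0;  /// rows with column < val
    double rows = 0;           /// total rows in the part
};

constexpr double default_normal_cond_factor = 0.5;
constexpr double default_good_cond_factor = 0.1;
constexpr double threshold = 2;

/// Mirrors ColumnSelectivityEstimator::estimateLess: sum the per-part answers,
/// then scale from the rows covered by statistics up to the full row count.
double estimateLess(const std::map<std::string, PartEstimate> & parts, double total_rows)
{
    if (parts.empty())
        return default_normal_cond_factor * total_rows;  /// dummy estimation

    double result = 0;
    double part_rows = 0;
    for (const auto & [name, part] : parts)
    {
        result += part.less_than_val;
        part_rows += part.rows;
    }
    return result * total_rows / part_rows;
}

/// The dummy branch for equality: a comparison against a "large" constant is
/// assumed to be selective, matching the threshold heuristic in the diff.
double estimateEqualDummy(double val, double total_rows)
{
    return (std::abs(val) > threshold ? default_good_cond_factor
                                      : default_normal_cond_factor) * total_rows;
}
```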
+ bool dummy = total_rows == 0; + ColumnSelectivityEstimator estimator; + if (it != column_estimators.end()) + { + estimator = it->second; + } + else + { + dummy = true; + } + auto [op, val] = extractBinaryOp(node, col); + if (op == "equals") + { + if (dummy) + { + if (val < - threshold || val > threshold) + return default_normal_cond_factor * total_rows; + else + return default_good_cond_factor * total_rows; + } + return estimator.estimateEqual(val, total_rows); + } + else if (op == "less" || op == "lessOrEquals") + { + if (dummy) + return default_normal_cond_factor * total_rows; + return estimator.estimateLess(val, total_rows); + } + else if (op == "greater" || op == "greaterOrEquals") + { + if (dummy) + return default_normal_cond_factor * total_rows; + return estimator.estimateGreater(val, total_rows); + } + else + return default_unknown_cond_factor * total_rows; +} + +void ConditionSelectivityEstimator::merge(String part_name, UInt64 part_rows, ColumnStatisticsPtr column_stat) +{ + if (!part_names.contains(part_name)) + { + total_rows += part_rows; + part_names.insert(part_name); + } + if (column_stat != nullptr) + column_estimators[column_stat->columnName()].merge(part_name, column_stat); +} + +} diff --git a/src/Storages/Statistics/ConditionSelectivityEstimator.h b/src/Storages/Statistics/ConditionSelectivityEstimator.h new file mode 100644 index 00000000000..f0599742276 --- /dev/null +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.h @@ -0,0 +1,50 @@ +#pragma once + +#include + +namespace DB +{ + +class RPNBuilderTreeNode; + +/// It estimates the selectivity of a condition. +class ConditionSelectivityEstimator +{ +private: + friend class ColumnStatistics; + struct ColumnSelectivityEstimator + { + /// We store the part_name and part_statistics. + /// then simply get selectivity for every part_statistics and combine them. + std::map part_statistics; + + void merge(String part_name, ColumnStatisticsPtr stats); + + Float64 estimateLess(Float64 val, Float64 rows) const; + + Float64 estimateGreater(Float64 val, Float64 rows) const; + + Float64 estimateEqual(Float64 val, Float64 rows) const; + }; + + static constexpr auto default_good_cond_factor = 0.1; + static constexpr auto default_normal_cond_factor = 0.5; + static constexpr auto default_unknown_cond_factor = 1.0; + /// Conditions like "x = N" are considered good if abs(N) > threshold. + /// This is used to assume that condition is likely to have good selectivity. + static constexpr auto threshold = 2; + + UInt64 total_rows = 0; + std::set part_names; + std::map column_estimators; + std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const; + +public: + /// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ... + /// Right now we only support simple condition like col = val / col < val + Float64 estimateRowCount(const RPNBuilderTreeNode & node) const; + + void merge(String part_name, UInt64 part_rows, ColumnStatisticsPtr column_stat); +}; + +} diff --git a/src/Storages/Statistics/Estimator.cpp b/src/Storages/Statistics/Estimator.cpp deleted file mode 100644 index e272014c1c2..00000000000 --- a/src/Storages/Statistics/Estimator.cpp +++ /dev/null @@ -1,137 +0,0 @@ -#include -#include - -namespace DB -{ - -/// second return value represents how many columns in the node. 
-static std::pair tryToExtractSingleColumn(const RPNBuilderTreeNode & node) -{ - if (node.isConstant()) - { - return {}; - } - - if (!node.isFunction()) - { - auto column_name = node.getColumnName(); - return {column_name, 1}; - } - - auto function_node = node.toFunctionNode(); - size_t arguments_size = function_node.getArgumentsSize(); - std::pair result; - for (size_t i = 0; i < arguments_size; ++i) - { - auto function_argument = function_node.getArgumentAt(i); - auto subresult = tryToExtractSingleColumn(function_argument); - if (subresult.second == 0) /// the subnode contains 0 column - continue; - else if (subresult.second > 1) /// the subnode contains more than 1 column - return subresult; - else if (result.second == 0 || result.first == subresult.first) /// subnodes contain same column. - result = subresult; - else - return {"", 2}; - } - return result; -} - -std::pair ConditionEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const std::string & column_name) const -{ - if (!node.isFunction()) - return {}; - - auto function_node = node.toFunctionNode(); - if (function_node.getArgumentsSize() != 2) - return {}; - - std::string function_name = function_node.getFunctionName(); - - auto lhs_argument = function_node.getArgumentAt(0); - auto rhs_argument = function_node.getArgumentAt(1); - - auto lhs_argument_column_name = lhs_argument.getColumnName(); - auto rhs_argument_column_name = rhs_argument.getColumnName(); - - bool lhs_argument_is_column = column_name == (lhs_argument_column_name); - bool rhs_argument_is_column = column_name == (rhs_argument_column_name); - - bool lhs_argument_is_constant = lhs_argument.isConstant(); - bool rhs_argument_is_constant = rhs_argument.isConstant(); - - RPNBuilderTreeNode * constant_node = nullptr; - - if (lhs_argument_is_column && rhs_argument_is_constant) - constant_node = &rhs_argument; - else if (lhs_argument_is_constant && rhs_argument_is_column) - constant_node = &lhs_argument; - else - return {}; - - Field output_value; - DataTypePtr output_type; - if (!constant_node->tryGetConstant(output_value, output_type)) - return {}; - - const auto type = output_value.getType(); - Float64 value; - if (type == Field::Types::Int64) - value = output_value.get(); - else if (type == Field::Types::UInt64) - value = output_value.get(); - else if (type == Field::Types::Float64) - value = output_value.get(); - else - return {}; - return std::make_pair(function_name, value); -} - -Float64 ConditionEstimator::estimateSelectivity(const RPNBuilderTreeNode & node) const -{ - auto result = tryToExtractSingleColumn(node); - if (result.second != 1) - { - return default_unknown_cond_factor; - } - String col = result.first; - auto it = column_estimators.find(col); - - /// If there the estimator of the column is not found or there are no data at all, - /// we use dummy estimation. 
- bool dummy = total_count == 0; - ColumnEstimator estimator; - if (it != column_estimators.end()) - { - estimator = it->second; - } - else - { - dummy = true; - } - auto [op, val] = extractBinaryOp(node, col); - if (op == "equals") - { - if (val < -threshold || val > threshold) - return default_normal_cond_factor; - else - return default_good_cond_factor; - } - else if (op == "less" || op == "lessThan") - { - if (dummy) - return default_normal_cond_factor; - return estimator.estimateLess(val) / total_count; - } - else if (op == "greater" || op == "greaterThan") - { - if (dummy) - return default_normal_cond_factor; - return estimator.estimateGreater(val) / total_count; - } - else - return default_unknown_cond_factor; -} - - -} diff --git a/src/Storages/Statistics/Estimator.h b/src/Storages/Statistics/Estimator.h deleted file mode 100644 index 903bb57eb80..00000000000 --- a/src/Storages/Statistics/Estimator.h +++ /dev/null @@ -1,111 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -class RPNBuilderTreeNode; - -/// It estimates the selectivity of a condition. -class ConditionEstimator -{ -private: - - static constexpr auto default_good_cond_factor = 0.1; - static constexpr auto default_normal_cond_factor = 0.5; - static constexpr auto default_unknown_cond_factor = 1.0; - /// Conditions like "x = N" are considered good if abs(N) > threshold. - /// This is used to assume that condition is likely to have good selectivity. - static constexpr auto threshold = 2; - - UInt64 total_count = 0; - - /// Minimum estimator for values in a part. It can contains multiple types of statistics. - /// But right now we only have tdigest; - struct PartColumnEstimator - { - UInt64 part_count = 0; - - std::shared_ptr tdigest; - - void merge(StatisticPtr statistic) - { - UInt64 cur_part_count = statistic->count(); - if (part_count == 0) - part_count = cur_part_count; - - if (typeid_cast(statistic.get())) - { - tdigest = std::static_pointer_cast(statistic); - } - } - - Float64 estimateLess(Float64 val) const - { - if (tdigest != nullptr) - return tdigest -> estimateLess(val); - return part_count * default_normal_cond_factor; - } - - Float64 estimateGreator(Float64 val) const - { - if (tdigest != nullptr) - return part_count - tdigest -> estimateLess(val); - return part_count * default_normal_cond_factor; - } - }; - - /// An estimator for a column consists of several PartColumnEstimator. - /// We simply get selectivity for every part estimator and combine the result. - struct ColumnEstimator - { - std::map estimators; - - void merge(std::string part_name, StatisticPtr statistic) - { - estimators[part_name].merge(statistic); - } - - Float64 estimateLess(Float64 val) const - { - if (estimators.empty()) - return default_normal_cond_factor; - Float64 result = 0; - for (const auto & [key, estimator] : estimators) - result += estimator.estimateLess(val); - return result; - } - - Float64 estimateGreater(Float64 val) const - { - if (estimators.empty()) - return default_normal_cond_factor; - Float64 result = 0; - for (const auto & [key, estimator] : estimators) - result += estimator.estimateGreator(val); - return result; - } - }; - - std::map column_estimators; - /// std::optional extractSingleColumn(const RPNBuilderTreeNode & node) const; - std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const std::string & column_name) const; - -public: - ConditionEstimator() = default; - - /// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ... 
- /// Right now we only support simple condition like col = val / col < val - Float64 estimateSelectivity(const RPNBuilderTreeNode & node) const; - - void merge(std::string part_name, UInt64 part_count, StatisticPtr statistic) - { - total_count += part_count; - if (statistic != nullptr) - column_estimators[statistic->columnName()].merge(part_name, statistic); - } -}; - - -} diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index 6619eac19dc..fed0bd61c03 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -1,11 +1,14 @@ #include #include -#include #include -#include +#include +#include +#include #include #include +#include +#include #include namespace DB @@ -15,39 +18,133 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INCORRECT_QUERY; - extern const int ILLEGAL_STATISTIC; } -void MergeTreeStatisticsFactory::registerCreator(StatisticType stat_type, Creator creator) +/// Version / bitmask of statistics / data of statistics / +enum StatisticsFileVersion : UInt16 { - if (!creators.emplace(stat_type, std::move(creator)).second) - throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistic creator type {} is not unique", stat_type); -} + V0 = 0, +}; -void MergeTreeStatisticsFactory::registerValidator(StatisticType stat_type, Validator validator) +IStatistics::IStatistics(const SingleStatisticsDescription & stat_) : stat(stat_) {} + +ColumnStatistics::ColumnStatistics(const ColumnStatisticsDescription & stats_desc_) + : stats_desc(stats_desc_), rows(0) { - if (!validators.emplace(stat_type, std::move(validator)).second) - throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistic validator type {} is not unique", stat_type); - } -StatisticPtr TDigestCreator(const StatisticDescription & stat) +void ColumnStatistics::update(const ColumnPtr & column) { - return StatisticPtr(new TDigestStatistic(stat)); + rows += column->size(); + for (const auto & iter : stats) + { + iter.second->update(column); + } } -void TDigestValidator(const StatisticDescription &, DataTypePtr data_type) +Float64 ColumnStatistics::estimateLess(Float64 val) const { - data_type = removeNullable(data_type); - if (!data_type->isValueRepresentedByNumber()) - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "TDigest does not support type {}", data_type->getName()); + if (stats.contains(StatisticsType::TDigest)) + return std::static_pointer_cast(stats.at(StatisticsType::TDigest))->estimateLess(val); + return rows * ConditionSelectivityEstimator::default_normal_cond_factor; } +Float64 ColumnStatistics::estimateGreater(Float64 val) const +{ + return rows - estimateLess(val); +} + +Float64 ColumnStatistics::estimateEqual(Float64 val) const +{ + if (stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest)) + { + auto uniq_static = std::static_pointer_cast(stats.at(StatisticsType::Uniq)); + /// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) + /// for every bucket. 
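In other words, once the distinct-value count fits within the t-digest's default bucket count, every bucket holds exactly one value and the equality estimate can be read off the digest directly; otherwise the code falls back to the magnitude heuristic. A hedged sketch of that decision; `UniqSketch` and `TDigestSketch` are stand-ins, not the real `UniqStatistics`/`TDigestStatistics` classes:

```cpp
#include <cmath>
#include <optional>

/// Stand-ins for the two statistics objects: `uniq` answers "how many distinct
/// values exist", `tdigest` answers "how many rows are equal to v". 2048
/// matches the default t-digest bucket count noted above.
struct UniqSketch { unsigned long long cardinality = 0; };
struct TDigestSketch
{
    double count_equal(double /*v*/) const { return stored_count; }  /// toy lookup
    double stored_count = 0;
};

constexpr unsigned long long tdigest_buckets = 2048;
constexpr double normal_factor = 0.5;
constexpr double good_factor = 0.1;
constexpr double threshold = 2;

double estimateEqual(const std::optional<UniqSketch> & uniq,
                     const std::optional<TDigestSketch> & tdigest,
                     double val, double rows)
{
    /// With fewer distinct values than buckets, each value gets its own
    /// bucket, so the t-digest equality count is trustworthy.
    if (uniq && tdigest && uniq->cardinality < tdigest_buckets)
        return tdigest->count_equal(val);

    /// Otherwise fall back to the magnitude heuristic from the diff:
    /// comparisons against "large" constants are assumed selective.
    return (std::abs(val) > threshold ? good_factor : normal_factor) * rows;
}
```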
+ if (uniq_static->getCardinality() < 2048) + { + auto tdigest_static = std::static_pointer_cast(stats.at(StatisticsType::TDigest)); + return tdigest_static->estimateEqual(val); + } + } + if (val < - ConditionSelectivityEstimator::threshold || val > ConditionSelectivityEstimator::threshold) + return rows * ConditionSelectivityEstimator::default_normal_cond_factor; + else + return rows * ConditionSelectivityEstimator::default_good_cond_factor; +} + +void ColumnStatistics::serialize(WriteBuffer & buf) +{ + writeIntBinary(V0, buf); + UInt64 stat_types_mask = 0; + for (const auto & [type, _]: stats) + stat_types_mask |= 1 << UInt8(type); + writeIntBinary(stat_types_mask, buf); + /// We write some basic statistics + writeIntBinary(rows, buf); + /// We write complex statistics + for (const auto & [type, stat_ptr]: stats) + stat_ptr->serialize(buf); +} + +void ColumnStatistics::deserialize(ReadBuffer &buf) +{ + UInt16 version; + readIntBinary(version, buf); + if (version != V0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown file format version: {}", version); + + UInt64 stat_types_mask = 0; + readIntBinary(stat_types_mask, buf); + readIntBinary(rows, buf); + for (auto it = stats.begin(); it != stats.end();) + { + if (!(stat_types_mask & 1 << UInt8(it->first))) + { + stats.erase(it++); + } + else + { + it->second->deserialize(buf); + ++it; + } + } +} + +String ColumnStatistics::getFileName() const +{ + return STATS_FILE_PREFIX + columnName(); +} + +const String & ColumnStatistics::columnName() const +{ + return stats_desc.column_name; +} + +UInt64 ColumnStatistics::rowCount() const +{ + return rows; +} + +void MergeTreeStatisticsFactory::registerCreator(StatisticsType stats_type, Creator creator) +{ + if (!creators.emplace(stats_type, std::move(creator)).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistics creator type {} is not unique", stats_type); +} + +void MergeTreeStatisticsFactory::registerValidator(StatisticsType stats_type, Validator validator) +{ + if (!validators.emplace(stats_type, std::move(validator)).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistics validator type {} is not unique", stats_type); + +} MergeTreeStatisticsFactory::MergeTreeStatisticsFactory() { - registerCreator(TDigest, TDigestCreator); - registerValidator(TDigest, TDigestValidator); + registerCreator(StatisticsType::TDigest, TDigestCreator); + registerCreator(StatisticsType::Uniq, UniqCreator); + registerValidator(StatisticsType::TDigest, TDigestValidator); + registerValidator(StatisticsType::Uniq, UniqValidator); } MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() @@ -56,33 +153,42 @@ MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() return instance; } -void MergeTreeStatisticsFactory::validate(const StatisticDescription & stat, DataTypePtr data_type) const +void MergeTreeStatisticsFactory::validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const { - auto it = validators.find(stat.type); - if (it == validators.end()) + for (const auto & [type, desc] : stats.types_to_desc) { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown Statistic type '{}'", stat.type); + auto it = validators.find(type); + if (it == validators.end()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown Statistic type '{}'", type); + } + it->second(desc, data_type); } - it->second(stat, data_type); } -StatisticPtr MergeTreeStatisticsFactory::get(const StatisticDescription & stat) 
const +ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const ColumnStatisticsDescription & stats) const { - auto it = creators.find(stat.type); - if (it == creators.end()) + ColumnStatisticsPtr column_stat = std::make_shared(stats); + for (const auto & [type, desc] : stats.types_to_desc) { - throw Exception(ErrorCodes::INCORRECT_QUERY, - "Unknown Statistic type '{}'. Available types: tdigest", stat.type); + auto it = creators.find(type); + if (it == creators.end()) + { + throw Exception(ErrorCodes::INCORRECT_QUERY, + "Unknown Statistic type '{}'. Available types: tdigest, uniq", type); + } + auto stat_ptr = (it->second)(desc, stats.data_type); + column_stat->stats[type] = stat_ptr; } - return std::make_shared(stat); + return column_stat; } -Statistics MergeTreeStatisticsFactory::getMany(const ColumnsDescription & columns) const +ColumnsStatistics MergeTreeStatisticsFactory::getMany(const ColumnsDescription & columns) const { - Statistics result; + ColumnsStatistics result; for (const auto & col : columns) - if (col.stat) - result.push_back(get(*col.stat)); + if (!col.statistics.empty()) + result.push_back(get(col.statistics)); return result; } diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index e6d9666ce1c..2ab1337af02 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -1,12 +1,8 @@ #pragma once -#include #include -#include - #include -#include #include #include #include @@ -14,38 +10,23 @@ #include -/// this is for user-defined statistic. -constexpr auto STAT_FILE_PREFIX = "statistic_"; -constexpr auto STAT_FILE_SUFFIX = ".stat"; - namespace DB { -class IStatistic; -using StatisticPtr = std::shared_ptr; -using Statistics = std::vector; +/// this is for user-defined statistic. +constexpr auto STATS_FILE_PREFIX = "statistics_"; +constexpr auto STATS_FILE_SUFFIX = ".stats"; -/// Statistic contains the distribution of values in a column. -/// right now we support -/// - tdigest -class IStatistic +/// Statistics describe properties of the values in the column, +/// e.g. how many unique values exist, +/// what are the N most frequent values, +/// how frequent is a value V, etc. 
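The format fixed by `serialize` above is: a `UInt16` version, a `UInt64` bitmask of the statistics types present, the `UInt64` row count, then each statistics blob in order; `deserialize` validates the version and drops in-memory entries whose bit is absent from the mask. A minimal sketch of that layout over iostreams, with blob payloads reduced to opaque strings purely for illustration:

```cpp
#include <cstdint>
#include <istream>
#include <map>
#include <ostream>
#include <stdexcept>
#include <string>

/// Sketch of the on-disk layout written above: version, bitmask of present
/// statistics types, row count, then each statistics blob in order. The real
/// code streams each statistic's own serialization instead of raw strings.
enum class StatsType : uint8_t { TDigest = 0, Uniq = 1 };

template <typename T>
void writePod(std::ostream & out, T value)
{
    out.write(reinterpret_cast<const char *>(&value), sizeof(value));
}

template <typename T>
T readPod(std::istream & in)
{
    T value{};
    in.read(reinterpret_cast<char *>(&value), sizeof(value));
    return value;
}

void serializeStats(std::ostream & out, uint64_t rows, const std::map<StatsType, std::string> & blobs)
{
    writePod<uint16_t>(out, 0);  /// format version V0
    uint64_t mask = 0;
    for (const auto & [type, blob] : blobs)
        mask |= 1ull << static_cast<uint8_t>(type);
    writePod(out, mask);         /// which statistics are present
    writePod(out, rows);         /// the "basic" statistics: the row count
    for (const auto & [type, blob] : blobs)
        out.write(blob.data(), static_cast<std::streamsize>(blob.size()));
}

/// Reads the header; the caller would then deserialize (or discard) each
/// statistics object depending on the returned mask, as the diff does.
uint64_t deserializeStatsHeader(std::istream & in, uint64_t & mask)
{
    if (readPod<uint16_t>(in) != 0)
        throw std::runtime_error("Unknown statistics file format version");
    mask = readPod<uint64_t>(in);
    return readPod<uint64_t>(in);  /// row count
}
```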
+class IStatistics { public: - explicit IStatistic(const StatisticDescription & stat_) - : stat(stat_) - { - } - virtual ~IStatistic() = default; + explicit IStatistics(const SingleStatisticsDescription & stat_); - String getFileName() const - { - return STAT_FILE_PREFIX + columnName(); - } - - const String & columnName() const - { - return stat.column_name; - } + virtual ~IStatistics() = default; virtual void serialize(WriteBuffer & buf) = 0; @@ -53,40 +34,68 @@ public: virtual void update(const ColumnPtr & column) = 0; - virtual UInt64 count() = 0; - protected: + SingleStatisticsDescription stat; +}; - StatisticDescription stat; +using StatisticsPtr = std::shared_ptr; +class ColumnStatistics +{ +public: + explicit ColumnStatistics(const ColumnStatisticsDescription & stats_); + void serialize(WriteBuffer & buf); + void deserialize(ReadBuffer & buf); + String getFileName() const; + + const String & columnName() const; + + UInt64 rowCount() const; + + void update(const ColumnPtr & column); + + Float64 estimateLess(Float64 val) const; + + Float64 estimateGreater(Float64 val) const; + + Float64 estimateEqual(Float64 val) const; + +private: + + friend class MergeTreeStatisticsFactory; + ColumnStatisticsDescription stats_desc; + std::map stats; + UInt64 rows; /// the number of rows of the column }; class ColumnsDescription; +using ColumnStatisticsPtr = std::shared_ptr; +using ColumnsStatistics = std::vector; class MergeTreeStatisticsFactory : private boost::noncopyable { public: static MergeTreeStatisticsFactory & instance(); - void validate(const StatisticDescription & stat, DataTypePtr data_type) const; + void validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const; - using Creator = std::function; + using Creator = std::function; - using Validator = std::function; + using Validator = std::function; - StatisticPtr get(const StatisticDescription & stat) const; + ColumnStatisticsPtr get(const ColumnStatisticsDescription & stats) const; - Statistics getMany(const ColumnsDescription & columns) const; + ColumnsStatistics getMany(const ColumnsDescription & columns) const; - void registerCreator(StatisticType type, Creator creator); - void registerValidator(StatisticType type, Validator validator); + void registerCreator(StatisticsType type, Creator creator); + void registerValidator(StatisticsType type, Validator validator); protected: MergeTreeStatisticsFactory(); private: - using Creators = std::unordered_map; - using Validators = std::unordered_map; + using Creators = std::unordered_map; + using Validators = std::unordered_map; Creators creators; Validators validators; }; diff --git a/src/Storages/Statistics/TDigestStatistic.cpp b/src/Storages/Statistics/TDigestStatistic.cpp deleted file mode 100644 index efb4282d203..00000000000 --- a/src/Storages/Statistics/TDigestStatistic.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include - -namespace DB -{ - -Float64 TDigestStatistic::estimateLess(Float64 val) const -{ - return data.getCountLessThan(val); -} - -void TDigestStatistic::serialize(WriteBuffer & buf) -{ - data.serialize(buf); -} - -void TDigestStatistic::deserialize(ReadBuffer & buf) -{ - data.deserialize(buf); -} - -void TDigestStatistic::update(const ColumnPtr & column) -{ - size_t size = column->size(); - - for (size_t i = 0; i < size; ++i) - { - /// TODO: support more types. 
- Float64 value = column->getFloat64(i); - data.add(value, 1); - } -} - -UInt64 TDigestStatistic::count() -{ - return static_cast<UInt64>(data.count); -} - -} diff --git a/src/Storages/Statistics/TDigestStatistic.h b/src/Storages/Statistics/TDigestStatistic.h deleted file mode 100644 index 295b5f69900..00000000000 --- a/src/Storages/Statistics/TDigestStatistic.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -/// TDigestStatistic is a kind of histogram. -class TDigestStatistic : public IStatistic -{ - QuantileTDigest<Float64> data; -public: - explicit TDigestStatistic(const StatisticDescription & stat_) : IStatistic(stat_) - { - } - - Float64 estimateLess(Float64 val) const; - - void serialize(WriteBuffer & buf) override; - - void deserialize(ReadBuffer & buf) override; - - void update(const ColumnPtr & column) override; - - UInt64 count() override; -}; - -} diff --git a/src/Storages/Statistics/TDigestStatistics.cpp b/src/Storages/Statistics/TDigestStatistics.cpp new file mode 100644 index 00000000000..aa5662c979d --- /dev/null +++ b/src/Storages/Statistics/TDigestStatistics.cpp @@ -0,0 +1,60 @@ +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_STATISTICS; +} + +TDigestStatistics::TDigestStatistics(const SingleStatisticsDescription & stat_): + IStatistics(stat_) +{ +} + +Float64 TDigestStatistics::estimateLess(Float64 val) const +{ + return data.getCountLessThan(val); +} + +Float64 TDigestStatistics::estimateEqual(Float64 val) const +{ + return data.getCountEqual(val); +} + +void TDigestStatistics::serialize(WriteBuffer & buf) +{ + data.serialize(buf); +} + +void TDigestStatistics::deserialize(ReadBuffer & buf) +{ + data.deserialize(buf); +} + +void TDigestStatistics::update(const ColumnPtr & column) +{ + size_t size = column->size(); + + for (size_t i = 0; i < size; ++i) + { + /// TODO: support more types. + Float64 value = column->getFloat64(i); + data.add(value, 1); + } +} + +StatisticsPtr TDigestCreator(const SingleStatisticsDescription & stat, DataTypePtr) +{ + return std::make_shared<TDigestStatistics>(stat); +} + +void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +{ + data_type = removeNullable(data_type); + if (!data_type->isValueRepresentedByNumber()) + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' does not support type {}", data_type->getName()); +} + +} diff --git a/src/Storages/Statistics/TDigestStatistics.h b/src/Storages/Statistics/TDigestStatistics.h new file mode 100644 index 00000000000..7c361b8751f --- /dev/null +++ b/src/Storages/Statistics/TDigestStatistics.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + +namespace DB +{ + + +/// TDigestStatistics is a kind of histogram.
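In query terms, the estimateLess/estimateEqual calls implemented here feed selectivity estimation for simple predicates; an illustrative sketch, assuming a tdigest statistic exists on column a:

    SELECT count() FROM tab WHERE a < 100; -- estimateLess(100) estimates what fraction of rows matches
    SELECT count() FROM tab WHERE a = 42;  -- estimateEqual(42) estimates the frequency of one value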
+class TDigestStatistics : public IStatistics +{ +public: + explicit TDigestStatistics(const SingleStatisticsDescription & stat_); + + Float64 estimateLess(Float64 val) const; + + Float64 estimateEqual(Float64 val) const; + + void serialize(WriteBuffer & buf) override; + + void deserialize(ReadBuffer & buf) override; + + void update(const ColumnPtr & column) override; +private: + QuantileTDigest<Float64> data; +}; + +StatisticsPtr TDigestCreator(const SingleStatisticsDescription & stat, DataTypePtr); +void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type); + +} diff --git a/src/Storages/Statistics/UniqStatistics.cpp b/src/Storages/Statistics/UniqStatistics.cpp new file mode 100644 index 00000000000..fc748e769ca --- /dev/null +++ b/src/Storages/Statistics/UniqStatistics.cpp @@ -0,0 +1,66 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_STATISTICS; +} + +UniqStatistics::UniqStatistics(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type) + : IStatistics(stat_) +{ + arena = std::make_unique<Arena>(); + AggregateFunctionProperties properties; + collector = AggregateFunctionFactory::instance().get("uniq", NullsAction::IGNORE_NULLS, {data_type}, Array(), properties); + data = arena->alignedAlloc(collector->sizeOfData(), collector->alignOfData()); + collector->create(data); +} + +UniqStatistics::~UniqStatistics() +{ + collector->destroy(data); +} + +UInt64 UniqStatistics::getCardinality() +{ + auto column = DataTypeUInt64().createColumn(); + collector->insertResultInto(data, *column, nullptr); + return column->getUInt(0); +} + +void UniqStatistics::serialize(WriteBuffer & buf) +{ + collector->serialize(data, buf); +} + +void UniqStatistics::deserialize(ReadBuffer & buf) +{ + collector->deserialize(data, buf); +} + +void UniqStatistics::update(const ColumnPtr & column) +{ + /// TODO(hanfei): For low cardinality, it's very slow to convert to a full column. We can read the dictionary directly. + /// Here we intend to avoid a crash in CI.
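The collector above is the state of the ordinary uniq aggregate function, so per column the statistic maintains roughly the value computed by this query, except that it is updated as data is written rather than at query time (an illustrative sketch):

    SELECT uniq(a) FROM tab; -- approximate number of distinct values of a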
+ auto col_ptr = column->convertToFullColumnIfLowCardinality(); + const IColumn * raw_ptr = col_ptr.get(); + collector->addBatchSinglePlace(0, column->size(), data, &(raw_ptr), nullptr); +} + +void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +{ + data_type = removeNullable(data_type); + if (!data_type->isValueRepresentedByNumber()) + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' does not support type {}", data_type->getName()); +} + +StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) +{ + return std::make_shared<UniqStatistics>(stat, data_type); +} + +} diff --git a/src/Storages/Statistics/UniqStatistics.h b/src/Storages/Statistics/UniqStatistics.h new file mode 100644 index 00000000000..0d86a6e458a --- /dev/null +++ b/src/Storages/Statistics/UniqStatistics.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class UniqStatistics : public IStatistics +{ +public: + UniqStatistics(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type); + + ~UniqStatistics() override; + + UInt64 getCardinality(); + + void serialize(WriteBuffer & buf) override; + + void deserialize(ReadBuffer & buf) override; + + void update(const ColumnPtr & column) override; + +private: + + std::unique_ptr<Arena> arena; + AggregateFunctionPtr collector; + AggregateDataPtr data; + +}; + +StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type); +void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type); + +} diff --git a/src/Storages/Statistics/tests/gtest_stats.cpp b/src/Storages/Statistics/tests/gtest_stats.cpp index 45f8271be97..f94f310be56 100644 --- a/src/Storages/Statistics/tests/gtest_stats.cpp +++ b/src/Storages/Statistics/tests/gtest_stats.cpp @@ -1,6 +1,6 @@ #include -#include +#include TEST(Statistics, TDigestLessThan) { diff --git a/src/Storages/StatisticsDescription.cpp b/src/Storages/StatisticsDescription.cpp index 7d4226f2fbe..dff1b7d3602 100644 --- a/src/Storages/StatisticsDescription.cpp +++ b/src/Storages/StatisticsDescription.cpp @@ -1,14 +1,16 @@ +#include + #include #include #include #include -#include +#include #include #include +#include #include #include #include -#include #include #include @@ -19,93 +21,187 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_QUERY; + extern const int ILLEGAL_STATISTICS; extern const int LOGICAL_ERROR; }; -StatisticDescription & StatisticDescription::operator=(const StatisticDescription & other) +SingleStatisticsDescription & SingleStatisticsDescription::operator=(const SingleStatisticsDescription & other) { if (this == &other) return *this; type = other.type; - column_name = other.column_name; ast = other.ast ? other.ast->clone() : nullptr; return *this; } -StatisticDescription & StatisticDescription::operator=(StatisticDescription && other) noexcept +SingleStatisticsDescription & SingleStatisticsDescription::operator=(SingleStatisticsDescription && other) noexcept { if (this == &other) return *this; - type = std::exchange(other.type, StatisticType{}); - column_name = std::move(other.column_name); + type = std::exchange(other.type, StatisticsType{}); ast = other.ast ? other.ast->clone() : nullptr; other.ast.reset(); return *this; } -StatisticType stringToType(String type) +static StatisticsType stringToStatisticsType(String type) { if (type == "tdigest") - return TDigest; - throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}.
We only support statistic type `tdigest` right now.", type); + return StatisticsType::TDigest; + if (type == "uniq") + return StatisticsType::Uniq; + throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistics type: {}. Supported statistics types are `tdigest` and `uniq`.", type); } -String StatisticDescription::getTypeName() const +String SingleStatisticsDescription::getTypeName() const { - if (type == TDigest) - return "tdigest"; - throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. We only support statistic type `tdigest` right now.", type); -} - -std::vector<StatisticDescription> StatisticDescription::getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns) -{ - const auto * stat_definition = definition_ast->as<ASTStatisticDeclaration>(); - if (!stat_definition) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create statistic from non ASTStatisticDeclaration AST"); - - std::vector<StatisticDescription> stats; - stats.reserve(stat_definition->columns->children.size()); - for (const auto & column_ast : stat_definition->columns->children) + switch (type) { - StatisticDescription stat; - stat.type = stringToType(Poco::toLower(stat_definition->type)); - String column_name = column_ast->as<ASTIdentifier &>().name(); + case StatisticsType::TDigest: + return "TDigest"; + case StatisticsType::Uniq: + return "Uniq"; + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistics type: {}. Supported statistics types are `tdigest` and `uniq`.", type); + } } - if (!columns.hasPhysical(column_name)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", column_name); +SingleStatisticsDescription::SingleStatisticsDescription(StatisticsType type_, ASTPtr ast_) + : type(type_), ast(ast_) +{} - const auto & column = columns.getPhysical(column_name); - stat.column_name = column.name; - stat.ast = makeASTFunction("STATISTIC", std::make_shared<ASTIdentifier>(stat_definition->type)); - stats.push_back(stat); +bool SingleStatisticsDescription::operator==(const SingleStatisticsDescription & other) const +{ + return type == other.type; +} + +bool ColumnStatisticsDescription::operator==(const ColumnStatisticsDescription & other) const +{ + return types_to_desc == other.types_to_desc; +} + +bool ColumnStatisticsDescription::empty() const +{ + return types_to_desc.empty(); +} + +bool ColumnStatisticsDescription::contains(const String & stat_type) const +{ + return types_to_desc.contains(stringToStatisticsType(stat_type)); +} + +void ColumnStatisticsDescription::merge(const ColumnStatisticsDescription & other, const String & merging_column_name, DataTypePtr merging_column_type, bool if_not_exists) +{ + chassert(merging_column_type); + + if (column_name.empty()) + { + column_name = merging_column_name; + data_type = merging_column_type; } - if (stats.empty()) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Empty statistic column list"); + for (const auto & [stats_type, stats_desc]: other.types_to_desc) + { + if (!if_not_exists && types_to_desc.contains(stats_type)) + { + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics type {} already exists in column {}", stats_type, column_name); + } + else if (!types_to_desc.contains(stats_type)) + types_to_desc.emplace(stats_type, stats_desc); + } +} + +void ColumnStatisticsDescription::assign(const ColumnStatisticsDescription & other) +{ + if (other.column_name != column_name) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot assign statistics from column {} to column {}", other.column_name, column_name); + + types_to_desc = other.types_to_desc; +} + +void
ColumnStatisticsDescription::clear() +{ + types_to_desc.clear(); +} + +std::vector<ColumnStatisticsDescription> ColumnStatisticsDescription::fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns) +{ + const auto * stat_definition_ast = definition_ast->as<ASTStatisticsDeclaration>(); + if (!stat_definition_ast) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot cast AST to ASTStatisticsDeclaration"); + + StatisticsTypeDescMap statistics_types; + for (const auto & stat_ast : stat_definition_ast->types->children) + { + String stat_type_name = stat_ast->as<ASTFunction &>().name; + auto stat_type = stringToStatisticsType(Poco::toLower(stat_type_name)); + if (statistics_types.contains(stat_type)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistics type {} was specified more than once", stat_type_name); + SingleStatisticsDescription stat(stat_type, stat_ast->clone()); + + statistics_types.emplace(stat.type, stat); + } + + std::vector<ColumnStatisticsDescription> result; + result.reserve(stat_definition_ast->columns->children.size()); + + for (const auto & column_ast : stat_definition_ast->columns->children) + { + ColumnStatisticsDescription stats; + String physical_column_name = column_ast->as<ASTIdentifier &>().name(); + + if (!columns.hasPhysical(physical_column_name)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", physical_column_name); + + const auto & column = columns.getPhysical(physical_column_name); + stats.column_name = column.name; + stats.types_to_desc = statistics_types; + result.push_back(stats); + } + + if (result.empty()) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Empty statistics column list is not allowed."); + + return result; +} + +ColumnStatisticsDescription ColumnStatisticsDescription::fromColumnDeclaration(const ASTColumnDeclaration & column) +{ + const auto & stat_type_list_ast = column.stat_type->as<ASTFunction &>().arguments; + if (stat_type_list_ast->children.empty()) + throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect at least one statistics type for column {}", queryToString(column)); + ColumnStatisticsDescription stats; + stats.column_name = column.name; + for (const auto & ast : stat_type_list_ast->children) + { + const auto & stat_type = ast->as<ASTFunction &>().name; + + SingleStatisticsDescription stat(stringToStatisticsType(Poco::toLower(stat_type)), ast->clone()); + if (stats.types_to_desc.contains(stat.type)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Column {} already contains statistics type {}", stats.column_name, stat_type); + stats.types_to_desc.emplace(stat.type, std::move(stat)); + } return stats; } -String queryToString(const IAST & query); - -StatisticDescription StatisticDescription::getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column) +ASTPtr ColumnStatisticsDescription::getAST() const { - const auto & stat_type_list_ast = column.stat_type->as<ASTFunction &>().arguments; - if (stat_type_list_ast->children.size() != 1) - throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect only one statistic type for column {}", queryToString(column)); - - const auto & stat_type = stat_type_list_ast->children[0]->as<ASTFunction &>().name; - - StatisticDescription stat; - stat.type = stringToType(Poco::toLower(stat_type)); - stat.column_name = column.name; - stat.ast = column.stat_type; - - return stat; + auto function_node = std::make_shared<ASTFunction>(); + function_node->name = "STATISTICS"; + function_node->arguments = std::make_shared<ASTExpressionList>(); + for (const auto & [type, desc] : types_to_desc) + { + if (desc.ast == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown ast"); + function_node->arguments->children.push_back(desc.ast); + } + 
function_node->children.push_back(function_node->arguments); + return function_node; } } diff --git a/src/Storages/StatisticsDescription.h b/src/Storages/StatisticsDescription.h index b571fa31e9d..59ad8944850 100644 --- a/src/Storages/StatisticsDescription.h +++ b/src/Storages/StatisticsDescription.h @@ -1,45 +1,66 @@ #pragma once +#include #include #include + #include namespace DB { -enum StatisticType +enum class StatisticsType : UInt8 { TDigest = 0, + Uniq = 1, + + Max = 63, }; -class ColumnsDescription; - -struct StatisticDescription +struct SingleStatisticsDescription { - /// the type of statistic, right now it's only tdigest. - StatisticType type; - - /// Names of statistic columns - String column_name; + StatisticsType type; ASTPtr ast; String getTypeName() const; - StatisticDescription() = default; - StatisticDescription(const StatisticDescription & other) { *this = other; } - StatisticDescription & operator=(const StatisticDescription & other); - StatisticDescription(StatisticDescription && other) noexcept { *this = std::move(other); } - StatisticDescription & operator=(StatisticDescription && other) noexcept; + SingleStatisticsDescription() = delete; + SingleStatisticsDescription(StatisticsType type_, ASTPtr ast_); - bool operator==(const StatisticDescription & other) const - { - return type == other.type && column_name == other.column_name; - } + SingleStatisticsDescription(const SingleStatisticsDescription & other) { *this = other; } + SingleStatisticsDescription & operator=(const SingleStatisticsDescription & other); + SingleStatisticsDescription(SingleStatisticsDescription && other) noexcept { *this = std::move(other); } + SingleStatisticsDescription & operator=(SingleStatisticsDescription && other) noexcept; - static StatisticDescription getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column); + bool operator==(const SingleStatisticsDescription & other) const; +}; - static std::vector<StatisticDescription> getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); +class ColumnsDescription; + +struct ColumnStatisticsDescription +{ + bool operator==(const ColumnStatisticsDescription & other) const; + + bool empty() const; + + bool contains(const String & stat_type) const; + + void merge(const ColumnStatisticsDescription & other, const String & column_name, DataTypePtr column_type, bool if_not_exists); + + void assign(const ColumnStatisticsDescription & other); + + void clear(); + + ASTPtr getAST() const; + + static std::vector<ColumnStatisticsDescription> fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); + static ColumnStatisticsDescription fromColumnDeclaration(const ASTColumnDeclaration & column); + + using StatisticsTypeDescMap = std::map<StatisticsType, SingleStatisticsDescription>; + StatisticsTypeDescMap types_to_desc; + String column_name; + DataTypePtr data_type; }; } diff --git a/src/Storages/System/StorageSystemDashboards.cpp b/src/Storages/System/StorageSystemDashboards.cpp index 9682fbc74a1..5faa37d951e 100644 --- a/src/Storages/System/StorageSystemDashboards.cpp +++ b/src/Storages/System/StorageSystemDashboards.cpp @@ -212,6 +212,20 @@ FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'MaxPartCountForPartition' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} +)EOQ") } + }, + { + { "dashboard", "Overview" }, + { "title", "Concurrent network connections" }, + { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, 
sum(CurrentMetric_TCPConnection) AS TCP_Connections, + sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, + sum(CurrentMetric_HTTPConnection) AS HTTP_Connections +FROM merge('system', '^metric_log') +WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} +GROUP BY t +ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, /// Default dashboard for ClickHouse Cloud @@ -349,6 +363,11 @@ ORDER BY t WITH FILL STEP {rounding:UInt32} { "dashboard", "Cloud overview" }, { "title", "Network send bytes/sec" }, { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric LIKE 'NetworkSendBytes%'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Concurrent network connections" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) FROM (SELECT event_time, sum(CurrentMetric_TCPConnection) AS TCP_Connections, sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, sum(CurrentMetric_HTTPConnection) AS HTTP_Connections FROM clusterAllReplicas(default, merge('system', '^metric_log')) WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY event_time) GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } } }; diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index 0f6c8e5aa8a..ce5b46a664e 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -10,9 +10,21 @@ from typing import Any, Callable, List, Optional, Union import requests -import get_robot_token as grt # we need an updated ROBOT_TOKEN from ci_config import CI_CONFIG +try: + # A workaround for scripts that use this download module without the required deps + import get_robot_token as grt # we need an updated ROBOT_TOKEN +except ImportError: + + class grt: # type: ignore + ROBOT_TOKEN = None + + @staticmethod + def get_best_robot_token() -> str: + return "" + + DOWNLOAD_RETRIES_COUNT = 5 @@ -63,15 +75,10 @@ def get_gh_api( """ def set_auth_header(): - if "headers" in kwargs: - if "Authorization" not in kwargs["headers"]: - kwargs["headers"][ - "Authorization" - ] = f"Bearer {grt.get_best_robot_token()}" - else: - kwargs["headers"] = { - "Authorization": f"Bearer {grt.get_best_robot_token()}" - } + headers = kwargs.get("headers", {}) + if "Authorization" not in headers: + headers["Authorization"] = f"Bearer {grt.get_best_robot_token()}" + kwargs["headers"] = headers if grt.ROBOT_TOKEN is not None: set_auth_header() diff --git a/tests/ci/download_release_packages.py b/tests/ci/download_release_packages.py index 1ba4ff8ff2e..8f3a2190ae8 100755 --- a/tests/ci/download_release_packages.py +++ b/tests/ci/download_release_packages.py @@ -1,79 +1,38 @@ #!/usr/bin/env python3 import logging -import os +from pathlib import Path -import requests -from requests.adapters import HTTPAdapter # type: ignore -from urllib3.util.retry import Retry # type: ignore - -from get_previous_release_tag import ReleaseInfo, get_previous_release - 
-CLICKHOUSE_TAGS_URL = "https://api.github.com/repos/ClickHouse/ClickHouse/tags" - -DOWNLOAD_PREFIX = ( - "https://github.com/ClickHouse/ClickHouse/releases/download/v{version}-{type}/" +from build_download_helper import DownloadException, download_build_with_progress +from get_previous_release_tag import ( + ReleaseInfo, + get_previous_release, + get_release_by_tag, ) -CLICKHOUSE_COMMON_STATIC_PACKAGE_NAME = "clickhouse-common-static_{version}_amd64.deb" -CLICKHOUSE_COMMON_STATIC_DBG_PACKAGE_NAME = ( - "clickhouse-common-static-dbg_{version}_amd64.deb" -) -CLICKHOUSE_CLIENT_PACKAGE_NAME = "clickhouse-client_{version}_amd64.deb" -CLICKHOUSE_LIBRARY_BRIDGE_PACKAGE_NAME = "clickhouse-library-bridge_{version}_amd64.deb" -CLICKHOUSE_ODBC_BRIDGE_PACKAGE_NAME = "clickhouse-odbc-bridge_{version}_amd64.deb" -CLICKHOUSE_SERVER_PACKAGE_NAME = "clickhouse-server_{version}_amd64.deb" -PACKAGES_DIR = "previous_release_package_folder/" -VERSION_PATTERN = r"((?:\d+\.)?(?:\d+\.)?(?:\d+\.)?\d+-[a-zA-Z]*)" +PACKAGES_DIR = Path("previous_release_package_folder") -def download_package(url, out_path, retries=10, backoff_factor=0.3): - session = requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=[500, 502, 503, 504], - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - response = session.get(url) - response.raise_for_status() - print(f"Download {url} to {out_path}") - with open(out_path, "wb") as fd: - fd.write(response.content) - - -def download_packages(release, dest_path=PACKAGES_DIR): - if not os.path.exists(dest_path): - os.makedirs(dest_path) +def download_packages(release: ReleaseInfo, dest_path: Path = PACKAGES_DIR) -> None: + dest_path.mkdir(parents=True, exist_ok=True) logging.info("Will download %s", release) - def get_dest_path(pkg_name): - return os.path.join(dest_path, pkg_name) - - for pkg in ( - CLICKHOUSE_COMMON_STATIC_PACKAGE_NAME, - CLICKHOUSE_COMMON_STATIC_DBG_PACKAGE_NAME, - CLICKHOUSE_CLIENT_PACKAGE_NAME, - CLICKHOUSE_LIBRARY_BRIDGE_PACKAGE_NAME, - CLICKHOUSE_ODBC_BRIDGE_PACKAGE_NAME, - CLICKHOUSE_SERVER_PACKAGE_NAME, - ): - url = (DOWNLOAD_PREFIX + pkg).format(version=release.version, type=release.type) - pkg_name = get_dest_path(pkg.format(version=release.version)) - download_package(url, pkg_name) + for pkg, url in release.assets.items(): + if not pkg.endswith("_amd64.deb") or "-dbg_" in pkg: + continue + pkg_name = dest_path / pkg + download_build_with_progress(url, pkg_name) -def download_last_release(dest_path): +def download_last_release(dest_path: Path) -> None: current_release = get_previous_release(None) + if current_release is None: + raise DownloadException("The current release is not found") download_packages(current_release, dest_path=dest_path) if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - release = ReleaseInfo(input()) + release = get_release_by_tag(input()) download_packages(release) diff --git a/tests/ci/get_previous_release_tag.py b/tests/ci/get_previous_release_tag.py index bc0cb975ef5..2b4d09aa326 100755 --- a/tests/ci/get_previous_release_tag.py +++ b/tests/ci/get_previous_release_tag.py @@ -2,47 +2,37 @@ import logging import re -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple -import requests - -CLICKHOUSE_TAGS_URL = "https://api.github.com/repos/ClickHouse/ClickHouse/tags" -CLICKHOUSE_PACKAGE_URL = ( - 
"https://github.com/ClickHouse/ClickHouse/releases/download/" - "v{version}-{type}/clickhouse-common-static_{version}_amd64.deb" +from build_download_helper import get_gh_api +from git_helper import TAG_REGEXP +from version_helper import ( + ClickHouseVersion, + get_version_from_string, + get_version_from_tag, ) -VERSION_PATTERN = r"(v(?:\d+\.)?(?:\d+\.)?(?:\d+\.)?\d+-[a-zA-Z]*)" + +CLICKHOUSE_TAGS_URL = "https://api.github.com/repos/ClickHouse/ClickHouse/releases" +PACKAGE_REGEXP = r"\Aclickhouse-common-static_.+[.]deb" logger = logging.getLogger(__name__) -class Version: - def __init__(self, version: str): - self.version = version - - def __lt__(self, other: "Version") -> bool: - return list(map(int, self.version.split("."))) < list( - map(int, other.version.split(".")) - ) - - def __str__(self): - return self.version - - class ReleaseInfo: - def __init__(self, release_tag: str): - self.version = Version(release_tag[1:].split("-")[0]) - self.type = release_tag[1:].split("-")[1] + def __init__(self, release_tag: str, assets: Dict[str, str]): + self.version = get_version_from_tag(release_tag) + self.type = self.version.description + self.assets = assets def __str__(self): - return f"v{self.version}-{self.type}" + return self.version.describe def __repr__(self): - return f"ReleaseInfo: {self.version}-{self.type}" + return f"ReleaseInfo: {self.version.describe}" def find_previous_release( - server_version: Optional[Version], releases: List[ReleaseInfo] + server_version: Optional[ClickHouseVersion], releases: List[ReleaseInfo] ) -> Tuple[bool, Optional[ReleaseInfo]]: releases.sort(key=lambda x: x.version, reverse=True) @@ -54,15 +44,7 @@ def find_previous_release( # Check if the artifact exists on GitHub. # It can be not true for a short period of time # after creating a tag for a new release before uploading the packages. 
- if ( - requests.head( - CLICKHOUSE_PACKAGE_URL.format( - version=release.version, type=release.type - ), - timeout=10, - ).status_code - != 404 - ): + if any(re.match(PACKAGE_REGEXP, name) for name in release.assets.keys()): return True, release logger.debug( @@ -74,12 +56,14 @@ def find_previous_release( return False, None -def get_previous_release(server_version: Optional[Version]) -> Optional[ReleaseInfo]: +def get_previous_release( + server_version: Optional[ClickHouseVersion], +) -> Optional[ReleaseInfo]: page = 1 found = False while not found: - response = requests.get( - CLICKHOUSE_TAGS_URL, {"page": page, "per_page": 100}, timeout=10 + response = get_gh_api( + CLICKHOUSE_TAGS_URL, params={"page": page, "per_page": 100}, timeout=10 ) if not response.ok: logger.error( @@ -87,24 +71,42 @@ def get_previous_release(server_version: Optional[Version]) -> Optional[ReleaseI ) response.raise_for_status() - releases_str = set(re.findall(VERSION_PATTERN, response.text)) - if len(releases_str) == 0: - raise ValueError( - "Cannot find previous release for " - + str(server_version) - + " server version" - ) + releases = response.json() - releases = [ReleaseInfo(release) for release in releases_str] - found, previous_release = find_previous_release(server_version, releases) + release_infos = [] # type: List[ReleaseInfo] + for r in releases: + if re.match(TAG_REGEXP, r["tag_name"]): + assets = { + a["name"]: a["browser_download_url"] + for a in r["assets"] + if a["state"] == "uploaded" + } + release_infos.append(ReleaseInfo(r["tag_name"], assets)) + found, previous_release = find_previous_release(server_version, release_infos) page += 1 return previous_release +def get_release_by_tag(tag: str) -> ReleaseInfo: + response = get_gh_api(f"{CLICKHOUSE_TAGS_URL}/tags/{tag}", timeout=10) + release = response.json() + assets = { + a["name"]: a["browser_download_url"] + for a in release["assets"] + if a["state"] == "uploaded" + } + return ReleaseInfo(release["tag_name"], assets) + + def main(): logging.basicConfig(level=logging.INFO) - server_version = Version(input()) + version_string = input() + version_string = version_string.split("+", maxsplit=1)[0] + try: + server_version = get_version_from_string(version_string) + except ValueError: + server_version = get_version_from_tag(version_string) print(get_previous_release(server_version)) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 9d9d1433073..131cbeef786 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -23,7 +23,7 @@ from lambda_shared_package.lambda_shared.pr import ( check_pr_description, ) from pr_info import PRInfo -from report import FAILURE, PENDING, SUCCESS +from report import FAILURE, PENDING, SUCCESS, StatusType TRUSTED_ORG_IDS = { 54801242, # clickhouse @@ -58,7 +58,7 @@ def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): # Returns can_run, description def should_run_ci_for_pr(pr_info: PRInfo) -> Tuple[bool, str]: # Consider the labels and whether the user is trusted. 
- print("Got labels", pr_info.labels) + logging.info("Got labels: %s", pr_info.labels) if OK_SKIP_LABELS.intersection(pr_info.labels): return True, "Don't try new checks for release/backports/cherry-picks" @@ -66,9 +66,10 @@ def should_run_ci_for_pr(pr_info: PRInfo) -> Tuple[bool, str]: if Labels.CAN_BE_TESTED not in pr_info.labels and not pr_is_by_trusted_user( pr_info.user_login, pr_info.user_orgs ): - print( - f"PRs by untrusted users need the '{Labels.CAN_BE_TESTED}' label - " - "please contact a member of the core team" + logging.info( + "PRs by untrusted users need the '%s' label - " + "please contact a member of the core team", + Labels.CAN_BE_TESTED, ) return False, "Needs 'can be tested' label" @@ -93,6 +94,7 @@ def main(): description = format_description(description) gh = Github(get_best_robot_token(), per_page=100) commit = get_commit(gh, pr_info.sha) + status = SUCCESS # type: StatusType description_error, category = check_pr_description(pr_info.body, GITHUB_REPOSITORY) pr_labels_to_add = [] @@ -125,13 +127,16 @@ def main(): f"::notice :: Add backport labels [{backport_labels}] for a given PR category" ) - print(f"Change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}") + logging.info( + "Change labels: add %s, remove %s", pr_labels_to_add, pr_labels_to_remove + ) if pr_labels_to_add: post_labels(gh, pr_info, pr_labels_to_add) if pr_labels_to_remove: remove_labels(gh, pr_info, pr_labels_to_remove) + # 1. Next three IFs are in a correct order. First - fatal error if description_error: print( "::error ::Cannot run, PR description does not match the template: " @@ -146,9 +151,10 @@ def main(): f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/" "blob/master/.github/PULL_REQUEST_TEMPLATE.md?plain=1" ) + status = FAILURE post_commit_status( commit, - FAILURE, + status, url, format_description(description_error), PR_CHECK, @@ -156,41 +162,38 @@ def main(): ) sys.exit(1) + # 2. Then we check if the documentation is not created to fail the Mergeable check if ( Labels.PR_FEATURE in pr_info.labels and not pr_info.has_changes_in_documentation() ): print( - f"The '{Labels.PR_FEATURE}' in the labels, " + f"::error ::The '{Labels.PR_FEATURE}' in the labels, " "but there's no changed documentation" ) - post_commit_status( - commit, - FAILURE, - "", - f"expect adding docs for {Labels.PR_FEATURE}", - PR_CHECK, - pr_info, - ) - # allow the workflow to continue + status = FAILURE + description = f"expect adding docs for {Labels.PR_FEATURE}" + # 3. But we allow the workflow to continue + # 4. And post only a single commit status on a failure if not can_run: post_commit_status( commit, - FAILURE, + status, "", description, PR_CHECK, pr_info, ) - print("::notice ::Cannot run") + print("::error ::Cannot run") sys.exit(1) + # The status for continue can be posted only one time, not more. 
post_commit_status( commit, - SUCCESS, + status, "", - "ok", + description, PR_CHECK, pr_info, ) diff --git a/tests/integration/test_access_control_on_cluster/test.py b/tests/integration/test_access_control_on_cluster/test.py index 8dbb87c67d8..b12add7ad3f 100644 --- a/tests/integration/test_access_control_on_cluster/test.py +++ b/tests/integration/test_access_control_on_cluster/test.py @@ -74,3 +74,18 @@ def test_grant_all_on_cluster(): assert ch2.query("SHOW GRANTS FOR Alex") == "GRANT ALL ON *.* TO Alex\n" ch1.query("DROP USER Alex ON CLUSTER 'cluster'") + + +def test_grant_current_database_on_cluster(): + ch1.query("CREATE DATABASE user_db ON CLUSTER 'cluster'") + ch1.query( + "CREATE USER IF NOT EXISTS test_user ON CLUSTER 'cluster' DEFAULT DATABASE user_db" + ) + ch1.query( + "GRANT SELECT ON user_db.* TO test_user ON CLUSTER 'cluster' WITH GRANT OPTION" + ) + ch1.query("GRANT CLUSTER ON *.* TO test_user ON CLUSTER 'cluster'") + + assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n" + ch1.query("GRANT SELECT ON * TO test_user ON CLUSTER 'cluster'", user="test_user") + assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n" diff --git a/tests/integration/test_manipulate_statistic/__init__.py b/tests/integration/test_manipulate_statistics/__init__.py similarity index 100% rename from tests/integration/test_manipulate_statistic/__init__.py rename to tests/integration/test_manipulate_statistics/__init__.py diff --git a/tests/integration/test_manipulate_statistic/config/config.xml b/tests/integration/test_manipulate_statistics/config/config.xml similarity index 55% rename from tests/integration/test_manipulate_statistic/config/config.xml rename to tests/integration/test_manipulate_statistics/config/config.xml index b47f8123499..24225173eeb 100644 --- a/tests/integration/test_manipulate_statistic/config/config.xml +++ b/tests/integration/test_manipulate_statistics/config/config.xml @@ -1,7 +1,7 @@ - 1 + 1 diff --git a/tests/integration/test_manipulate_statistic/test.py b/tests/integration/test_manipulate_statistics/test.py similarity index 82% rename from tests/integration/test_manipulate_statistic/test.py rename to tests/integration/test_manipulate_statistics/test.py index 7b96b392da8..2b26af940d1 100644 --- a/tests/integration/test_manipulate_statistic/test.py +++ b/tests/integration/test_manipulate_statistics/test.py @@ -34,14 +34,14 @@ def check_stat_file_on_disk(node, table, part_name, column_name, exist): [ "bash", "-c", - "find {p} -type f -name statistic_{col}.stat".format( + "find {p} -type f -name statistics_{col}.stats".format( p=part_path, col=column_name ), ], privileged=True, ) logging.debug( - f"Checking stat file in {part_path} for column {column_name}, got {output}" + f"Checking stats file in {part_path} for column {column_name}, got {output}" ) if exist: assert len(output) != 0 @@ -56,26 +56,26 @@ def run_test_single_node(started_cluster): check_stat_file_on_disk(node1, "test_stat", "all_1_1_0", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0", "c", True) - node1.query("ALTER TABLE test_stat DROP STATISTIC a type tdigest") + node1.query("ALTER TABLE test_stat DROP STATISTICS a") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "c", True) - node1.query("ALTER TABLE test_stat CLEAR STATISTIC b, c type tdigest") + node1.query("ALTER TABLE test_stat CLEAR STATISTICS b, c") 
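For reference, the statements this test drives map onto the renamed DDL as follows (a sketch mirroring the queries in this test):

    ALTER TABLE test_stat ADD STATISTICS a type tdigest; -- register the statistics definition
    ALTER TABLE test_stat MATERIALIZE STATISTICS a;      -- build statistics_*.stats files for existing parts
    ALTER TABLE test_stat CLEAR STATISTICS b, c;         -- remove the files but keep the definitions
    ALTER TABLE test_stat DROP STATISTICS a;             -- remove both the definition and the files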
check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "b", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "c", False) - node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC b, c type tdigest") + node1.query("ALTER TABLE test_stat MATERIALIZE STATISTICS b, c") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "c", True) - node1.query("ALTER TABLE test_stat ADD STATISTIC a type tdigest") - node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC a type tdigest") + node1.query("ALTER TABLE test_stat ADD STATISTICS a type tdigest") + node1.query("ALTER TABLE test_stat MATERIALIZE STATISTICS a") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_5", "a", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_5", "b", True) @@ -104,7 +104,7 @@ def test_single_node_wide(started_cluster): node1.query( """ - CREATE TABLE test_stat(a Int64 STATISTIC(tdigest), b Int64 STATISTIC(tdigest), c Int64 STATISTIC(tdigest)) + CREATE TABLE test_stat(a Int64 STATISTICS(tdigest), b Int64 STATISTICS(tdigest), c Int64 STATISTICS(tdigest)) ENGINE = MergeTree() ORDER BY a SETTINGS min_bytes_for_wide_part = 0; """ @@ -117,7 +117,7 @@ def test_single_node_normal(started_cluster): node1.query( """ - CREATE TABLE test_stat(a Int64 STATISTIC(tdigest), b Int64 STATISTIC(tdigest), c Int64 STATISTIC(tdigest)) + CREATE TABLE test_stat(a Int64 STATISTICS(tdigest), b Int64 STATISTICS(tdigest), c Int64 STATISTICS(tdigest)) ENGINE = MergeTree() ORDER BY a; """ ) diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index 251da7e4e09..38cbf8c1aed 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -195,10 +195,10 @@ def test_create_table(): f"DeltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", "DNS_ERROR", ), - f"S3Queue('http://minio1:9001/root/data/', 'CSV')", - f"S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip')", - f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV')", - f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV', 'gzip')", + f"S3Queue('http://minio1:9001/root/data/', 'CSV') settings mode = 'ordered'", + f"S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip') settings mode = 'ordered'", + f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV') settings mode = 'ordered'", + f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV', 'gzip') settings mode = 'ordered'", ] def make_test_case(i): @@ -258,10 +258,11 @@ def test_create_table(): "CREATE TABLE table14 (x int) ENGINE = S3('http://minio1:9001/root/data/test9.csv.gz', 'NOSIGN', 'CSV', 'gzip')", "CREATE TABLE table15 (`x` int) ENGINE = S3('http://minio1:9001/root/data/test10.csv.gz', 'minio', '[HIDDEN]')", "CREATE TABLE table16 (`x` int) ENGINE = DeltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", - "CREATE TABLE table17 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV')", - "CREATE TABLE table18 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip')", - "CREATE TABLE table19 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV')", - "CREATE TABLE table20 (`x` int) ENGINE = 
S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip')", + "CREATE TABLE table17 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV') settings mode = 'ordered'", + "CREATE TABLE table18 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip') settings mode = 'ordered'", + # due to sensitive data substitution the query is normalized, so not "settings" but "SETTINGS" + "CREATE TABLE table19 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV') SETTINGS mode = 'ordered'", + "CREATE TABLE table20 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip') SETTINGS mode = 'ordered'", ], must_not_contain=[password], ) diff --git a/tests/integration/test_modify_engine_on_restart/test_zk_path_exists.py b/tests/integration/test_modify_engine_on_restart/test_zk_path_exists.py new file mode 100644 index 00000000000..3bf492cf69d --- /dev/null +++ b/tests/integration/test_modify_engine_on_restart/test_zk_path_exists.py @@ -0,0 +1,69 @@ +import pytest +from test_modify_engine_on_restart.common import ( + get_table_path, + set_convert_flags, +) +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +ch1 = cluster.add_instance( + "ch1", + main_configs=[ + "configs/config.d/clusters.xml", + "configs/config.d/distributed_ddl.xml", + ], + with_zookeeper=True, + macros={"replica": "node1"}, + stay_alive=True, +) + +database_name = "modify_engine_zk_path" + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def q(node, query): + return node.query(database=database_name, sql=query) + + +def test_modify_engine_fails_if_zk_path_exists(started_cluster): + ch1.query("CREATE DATABASE " + database_name) + + q( + ch1, + "CREATE TABLE already_exists_1 ( A Int64, D Date, S String ) ENGINE MergeTree() PARTITION BY toYYYYMM(D) ORDER BY A;", + ) + uuid = q( + ch1, + f"SELECT uuid FROM system.tables WHERE table = 'already_exists_1' and database = '{database_name}'", + ).strip("'[]\n") + + q( + ch1, + f"CREATE TABLE already_exists_2 ( A Int64, D Date, S String ) ENGINE ReplicatedMergeTree('/clickhouse/tables/{uuid}/{{shard}}', 'node2') PARTITION BY toYYYYMM(D) ORDER BY A;", + ) + + set_convert_flags(ch1, database_name, ["already_exists_1"]) + + table_data_path = get_table_path(ch1, "already_exists_1", database_name) + + ch1.stop_clickhouse() + ch1.start_clickhouse(start_wait_sec=120, expected_to_fail=True) + + # Check that we can cancel the conversion + ch1.exec_in_container( + [ + "bash", + "-c", + f"rm {table_data_path}convert_to_replicated", + ] + ) + ch1.start_clickhouse() diff --git a/tests/queries/0_stateless/02864_statistic_exception.reference b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/__init__.py similarity index 100% rename from tests/queries/0_stateless/02864_statistic_exception.reference rename to tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/__init__.py diff --git a/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/configs/settings.xml b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/configs/settings.xml new file mode 100644 index 00000000000..0a390937413 --- /dev/null +++ b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/configs/settings.xml @@ -0,0 +1,6 @@ + + + 300 + 1 + 128 + diff --git
a/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/test.py b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/test.py new file mode 100644 index 00000000000..515d9530424 --- /dev/null +++ b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/test.py @@ -0,0 +1,68 @@ +import concurrent.futures + +import pytest +from helpers.cluster import ClickHouseCluster + + +MAX_THREADS = 60 + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance( + "node1", + macros={"cluster": "test-cluster", "replica": "node1"}, + main_configs=["configs/settings.xml"], + with_zookeeper=True, +) + + +def prepare_cluster(): + node1.query("DROP TABLE IF EXISTS test_threads_busy SYNC") + node1.query( + """ + CREATE TABLE test_threads_busy(d Date, i Int64, s String) ENGINE=MergeTree PARTITION BY toYYYYMMDD(d) ORDER BY d + """ + ) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def do_slow_select(): + # Do a bunch of slow queries that use a large number of threads to saturate max_thread_pool_size + # explicitly set max_threads as otherwise it's relative to the number of CPU cores + query = ( + "SELECT d, i, s, sleepEachRow(3) from test_threads_busy SETTINGS max_threads=40" + ) + node1.query(query) + + +def test_query_exception_on_thread_pool_full(started_cluster): + prepare_cluster() + # Generate some sample data so sleepEachRow in do_slow_select works + node1.query( + f"INSERT INTO test_threads_busy VALUES ('2024-01-01', 1, 'thread-test')" + ) + + futures = [] + errors = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor: + for _ in range(MAX_THREADS): + futures.append(executor.submit(do_slow_select)) + + for f in futures: + try: + f.result() + except Exception as err: + errors.append(str(err)) + assert len(errors) > 0, "Should be 'Cannot schedule a task' exceptions" + assert all( + "Cannot schedule a task" in err for err in errors + ), "Query threads are stuck, or returned an unexpected error" diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 2a7ceab57ba..17554f5c8a5 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -24,10 +24,11 @@ ALTER DROP INDEX ['DROP INDEX'] TABLE ALTER INDEX ALTER MATERIALIZE INDEX ['MATERIALIZE INDEX'] TABLE ALTER INDEX ALTER CLEAR INDEX ['CLEAR INDEX'] TABLE ALTER INDEX ALTER INDEX ['INDEX'] \N ALTER TABLE -ALTER ADD STATISTIC ['ALTER ADD STATISTIC'] TABLE ALTER STATISTIC -ALTER DROP STATISTIC ['ALTER DROP STATISTIC'] TABLE ALTER STATISTIC -ALTER MATERIALIZE STATISTIC ['ALTER MATERIALIZE STATISTIC'] TABLE ALTER STATISTIC -ALTER STATISTIC ['STATISTIC'] \N ALTER TABLE +ALTER ADD STATISTICS ['ALTER ADD STATISTIC'] TABLE ALTER STATISTICS +ALTER DROP STATISTICS ['ALTER DROP STATISTIC'] TABLE ALTER STATISTICS +ALTER MODIFY STATISTICS ['ALTER MODIFY STATISTIC'] TABLE ALTER STATISTICS +ALTER MATERIALIZE STATISTICS ['ALTER MATERIALIZE STATISTIC'] TABLE ALTER STATISTICS +ALTER STATISTICS ['STATISTIC'] \N ALTER TABLE ALTER ADD PROJECTION ['ADD PROJECTION'] TABLE ALTER PROJECTION ALTER DROP PROJECTION ['DROP PROJECTION'] TABLE ALTER PROJECTION ALTER MATERIALIZE PROJECTION ['MATERIALIZE PROJECTION'] TABLE ALTER PROJECTION diff --git a/tests/queries/0_stateless/01601_accurate_cast.reference 
b/tests/queries/0_stateless/01601_accurate_cast.reference index 82138e6354a..6a438c49f13 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.reference +++ b/tests/queries/0_stateless/01601_accurate_cast.reference @@ -4,6 +4,11 @@ 5 5 5 +5 +5 +5 +5 +5 1 12 2023-05-30 14:38:20 diff --git a/tests/queries/0_stateless/01601_accurate_cast.sql b/tests/queries/0_stateless/01601_accurate_cast.sql index 471e4e34a4a..3d418b5a36f 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.sql +++ b/tests/queries/0_stateless/01601_accurate_cast.sql @@ -16,6 +16,21 @@ SELECT accurateCast(-129, 'Int8'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'Int8'); SELECT accurateCast(128, 'Int8'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast('-1', 'UInt8'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt8'); +SELECT accurateCast('257', 'UInt8'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('-1', 'UInt16'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt16'); +SELECT accurateCast('65536', 'UInt16'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('-1', 'UInt32'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt32'); +SELECT accurateCast('4294967296', 'UInt32'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('-1', 'UInt64'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt64'); +SELECT accurateCast('-129', 'Int8'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'Int8'); +SELECT accurateCast('128', 'Int8'); -- { serverError CANNOT_PARSE_TEXT } + SELECT accurateCast(10, 'Decimal32(9)'); -- { serverError DECIMAL_OVERFLOW } SELECT accurateCast(1, 'Decimal32(9)'); SELECT accurateCast(-10, 'Decimal32(9)'); -- { serverError DECIMAL_OVERFLOW } diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql index b2a04788bbb..f8faa3e653b 100644 --- a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql +++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql @@ -68,4 +68,10 @@ with last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) select d.repo_name, columns('count') from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name; +set allow_suspicious_low_cardinality_types=1; + +CREATE TABLE github_events__fuzz_0 (`file_time` Int64, `event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22), `actor_login` LowCardinality(String), `repo_name` LowCardinality(Nullable(String)), `created_at` DateTime, `updated_at` DateTime, `action` Array(Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 
'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20)), `comment_id` UInt64, `body` String, `path` LowCardinality(String), `position` Int32, `line` Int32, `ref` String, `ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4), `creator_user_login` Int16, `number` UInt32, `title` String, `labels` Array(Array(LowCardinality(String))), `state` Enum8('none' = 0, 'open' = 1, 'closed' = 2), `locked` UInt8, `assignee` Array(LowCardinality(String)), `assignees` Array(LowCardinality(String)), `comments` UInt32, `author_association` Array(Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5)), `closed_at` UUID, `merged_at` DateTime, `merge_commit_sha` Nullable(String), `requested_reviewers` Array(LowCardinality(Int64)), `requested_teams` Array(String), `head_ref` String, `head_sha` String, `base_ref` String, `base_sha` String, `merged` Nullable(UInt8), `mergeable` Nullable(UInt8), `rebaseable` LowCardinality(UInt8), `mergeable_state` Array(Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4)), `merged_by` LowCardinality(String), `review_comments` UInt32, `maintainer_can_modify` Nullable(UInt8), `commits` UInt32, `additions` Nullable(UInt32), `deletions` UInt32, `changed_files` UInt32, `diff_hunk` Nullable(String), `original_position` UInt32, `commit_id` String, `original_commit_id` String, `push_size` UInt32, `push_distinct_size` UInt32, `member_login` LowCardinality(String), `release_tag_name` LowCardinality(String), `release_name` String, `review_state` Int16) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at) settings allow_nullable_key=1; + +EXPLAIN PIPELINE header = true, compact = true WITH top_repos AS (SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toMonday(created_at) = toMonday(today() - toIntervalWeek(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 PREWHERE (event_type = 'WatchEvent') AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events WHERE (event_type = 'WatchEvent') AND (toYear(created_at) = (toYear(today()) - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100), last_day AS (SELECT repo_name, count() AS count_last_day, rowNumberInAllBlocks() + 1 AS position_last_day FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count_last_day DESC), last_week AS (SELECT repo_name, count() AS count_last_week, rowNumberInAllBlocks() + 1 AS position_last_week FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toMonday(created_at) = (toMonday(today()) - toIntervalWeek(2))) GROUP BY repo_name ORDER BY count_last_week DESC), last_month AS (SELECT repo_name, count() AS count_last_month, rowNumberInAllBlocks() + 1 AS position_last_month FROM github_events__fuzz_0 WHERE ('deleted' = 4) AND in(repo_name) AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY 
repo_name ORDER BY count_last_month DESC) SELECT d.repo_name, COLUMNS(count) FROM last_day AS d INNER JOIN last_week AS w ON d.repo_name = w.repo_name INNER JOIN last_month AS m ON d.repo_name = m.repo_name format Null; -- { serverError INVALID_SETTING_VALUE } + DROP TABLE github_events; diff --git a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference index 84fc422379c..e890eac1794 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference +++ b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference @@ -1,19 +1,19 @@ -1 Hello ClickHouse -2 Hello World +1 Well, Hello ClickHouse ! +2 Well, Hello World ! Granules: 6/6 Granules: 2/6 Granules: 6/6 Granules: 2/6 --- -1 Hello ClickHouse -2 Hello World -6 World Champion +1 Well, Hello ClickHouse ! +2 Well, Hello World ! +6 True World Champion Granules: 6/6 Granules: 3/6 Granules: 6/6 Granules: 3/6 --- -5 OLAP Database +5 Its An OLAP Database Granules: 6/6 Granules: 1/6 Granules: 6/6 diff --git a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql index 927e605c20a..7f36c423a41 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql +++ b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql @@ -14,19 +14,19 @@ ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 1; -INSERT INTO tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); +INSERT INTO tab VALUES (1, 'Well, Hello ClickHouse !'), (2, 'Well, Hello World !'), (3, 'Good Weather !'), (4, 'Say Hello !'), (5, 'Its An OLAP Database'), (6, 'True World Champion'); -SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; +SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id; -- Read 2/6 granules --- Required string: 'Hello ' --- Alternatives: 'Hello ClickHouse', 'Hello World' +-- Required string: ' Hello ' +-- Alternatives: ' Hello ClickHouse ', ' Hello World ' SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -37,7 +37,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -46,17 +46,17 @@ SETTINGS SELECT '---'; -SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; +SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id; -- Read 3/6 granules -- Required string: - --- Alternatives: 'ClickHouse', 'World' +-- Alternatives: ' ClickHouse ', ' World ' SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -67,7 +67,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -76,17 +76,17 @@ SETTINGS SELECT '---'; -SELECT * FROM tab WHERE match(str, 'OLAP.*') ORDER BY id; 
+SELECT * FROM tab WHERE match(str, ' OLAP .*') ORDER BY id; -- Read 1/6 granules --- Required string: 'OLAP' +-- Required string: ' OLAP ' -- Alternatives: - SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -97,7 +97,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' diff --git a/tests/queries/0_stateless/02346_fulltext_index_old_name.sql b/tests/queries/0_stateless/02346_fulltext_index_old_name.sql index bc641caf237..4e52e689211 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_old_name.sql +++ b/tests/queries/0_stateless/02346_fulltext_index_old_name.sql @@ -1,22 +1,16 @@ +-- Index type 'inverted' was renamed to 'full_text' in April 2024. +-- Such indexes are experimental. Test what happens when ClickHouse encounters tables with the old index type. + DROP TABLE IF EXISTS tab; --- Index type 'inverted' was renamed to 'full_text' in April 2024. --- Such indexes are experimental. Nevertheless test what happens when ClickHouse encounters tables with the old index type. +-- It must be possible to load old tables with 'inverted'-type indexes +-- In stateless tests, we cannot use old persistences. Emulate "loading an old index" by creating it (internally, similar code executes). --- Create a full text index with the old type --- This was how it was done in the old days. These days this throws an exception. -SET allow_experimental_inverted_index = 1; -CREATE TABLE tab(k UInt64, s String, INDEX idx(s) TYPE inverted(2)) ENGINE = MergeTree() ORDER BY k; -- { serverError ILLEGAL_INDEX }; - --- There are unfortunately side effects of this behavior. In particular, if ClickHouse's automatic table load during --- startup finds a table with 'inverted'-type indexes created by an older version, it immediately halts as it thinks --- the persistence is corrupt. Similarly (but less severely), tables with 'inverted' index cannot be attached. --- A backdoor avoids this. Just set allow_experimental_inverted_index = 0 (which is the default). --- --- Note that the backdoor will exist only temporarily during a transition period. It will be removed in future. Its only purpose is --- to simplify the migrationn of experimental inverted indexes to experimental full-text indexes instead of simply breaking existing --- tables. +-- Creation only works with the (old) setting enabled. SET allow_experimental_inverted_index = 0; +CREATE TABLE tab(k UInt64, s String, INDEX idx(s) TYPE inverted(2)) ENGINE = MergeTree() ORDER BY k; -- { serverError ILLEGAL_INDEX } + +SET allow_experimental_inverted_index = 1; CREATE TABLE tab(k UInt64, s String, INDEX idx(s) TYPE inverted(2)) ENGINE = MergeTree() ORDER BY k; INSERT INTO tab VALUES (1, 'ab') (2, 'bc'); @@ -24,14 +18,12 @@ INSERT INTO tab VALUES (1, 'ab') (2, 'bc'); DETACH TABLE tab; ATTACH TABLE tab; --- No, the backdoor does not make 'inverted' indexes non-experimental. --- On the one hand, the backdoor is undocumented, on the other hand, SELECTs that use such indexes now throw an exception, --- making 'inverted' indexes useless. +-- To encourage users to migrate to the new index type, we now throw an exception when the index is used by queries. 
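+-- (Loading, DETACH and ATTACH still succeed, as exercised above; only queries that would read through the old index now fail.)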
SELECT * from tab WHERE s = 'bc'; -- { serverError ILLEGAL_INDEX } -- The exception recommends to drop the index and create a 'full_text' index instead. Let's try. ALTER TABLE tab DROP INDEX idx; -SET allow_experimental_full_text_index = 1; -- note that this is a different setting +SET allow_experimental_full_text_index = 1; -- the new setting ALTER TABLE tab ADD INDEX idx(s) TYPE full_text(2); SELECT * from tab WHERE s = 'bc'; diff --git a/tests/queries/0_stateless/02346_fulltext_index_search.reference b/tests/queries/0_stateless/02346_fulltext_index_search.reference index d742bbc77ec..d7c89d434e7 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_search.reference +++ b/tests/queries/0_stateless/02346_fulltext_index_search.reference @@ -13,19 +13,19 @@ af full_text 1 Test full_text() af full_text -101 Alick a01 -106 Alick a06 -111 Alick b01 -116 Alick b06 -101 Alick a01 -106 Alick a06 +101 x Alick a01 y +106 x Alick a06 y +111 x Alick b01 y +116 x Alick b06 y +101 x Alick a01 y +106 x Alick a06 y 1 -101 Alick a01 -111 Alick b01 +101 x Alick a01 y +111 x Alick b01 y 1 Test on array columns af full_text -3 ['Click a03','Click b03'] +3 ['x Click a03 y','x Click b03 y'] 1 Test on map columns af full_text diff --git a/tests/queries/0_stateless/02346_fulltext_index_search.sql b/tests/queries/0_stateless/02346_fulltext_index_search.sql index 6b06bde6598..80f49790201 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_search.sql +++ b/tests/queries/0_stateless/02346_fulltext_index_search.sql @@ -67,7 +67,7 @@ CREATE TABLE tab_x(k UInt64, s String, INDEX af(s) TYPE full_text()) ENGINE = MergeTree() ORDER BY k SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; -INSERT INTO tab_x VALUES (101, 'Alick a01'), (102, 'Blick a02'), (103, 'Click a03'), (104, 'Dlick a04'), (105, 'Elick a05'), (106, 'Alick a06'), (107, 'Blick a07'), (108, 'Click a08'), (109, 'Dlick a09'), (110, 'Elick a10'), (111, 'Alick b01'), (112, 'Blick b02'), (113, 'Click b03'), (114, 'Dlick b04'), (115, 'Elick b05'), (116, 'Alick b06'), (117, 'Blick b07'), (118, 'Click b08'), (119, 'Dlick b09'), (120, 'Elick b10'); +INSERT INTO tab_x VALUES (101, 'x Alick a01 y'), (102, 'x Blick a02 y'), (103, 'x Click a03 y'), (104, 'x Dlick a04 y'), (105, 'x Elick a05 y'), (106, 'x Alick a06 y'), (107, 'x Blick a07 y'), (108, 'x Click a08 y'), (109, 'x Dlick a09 y'), (110, 'x Elick a10 y'), (111, 'x Alick b01 y'), (112, 'x Blick b02 y'), (113, 'x Click b03 y'), (114, 'x Dlick b04 y'), (115, 'x Elick b05 y'), (116, 'x Alick b06 y'), (117, 'x Blick b07 y'), (118, 'x Click b08 y'), (119, 'x Dlick b09 y'), (120, 'x Elick b10 y'); -- check full_text index was created SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab_x' AND database = currentDatabase() LIMIT 1; @@ -86,27 +86,27 @@ SELECT read_rows==8 from system.query_log LIMIT 1; -- search full_text index with IN operator -SELECT * FROM tab_x WHERE s IN ('Alick a01', 'Alick a06') ORDER BY k; +SELECT * FROM tab_x WHERE s IN ('x Alick a01 y', 'x Alick a06 y') ORDER BY k; -- check the query only read 2 granules (4 rows total; each granule has 2 rows) SYSTEM FLUSH LOGS; SELECT read_rows==4 from system.query_log WHERE query_kind ='Select' AND current_database = currentDatabase() - AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE s IN (\'Alick a01\', \'Alick a06\') ORDER BY k;') + AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE s IN (\'x Alick a01 y\', \'x Alick a06 y\') ORDER BY k;') AND type='QueryFinish' AND result_rows==2 LIMIT 1; -- 
search full_text index with multiSearch -SELECT * FROM tab_x WHERE multiSearchAny(s, ['a01', 'b01']) ORDER BY k; +SELECT * FROM tab_x WHERE multiSearchAny(s, [' a01 ', ' b01 ']) ORDER BY k; -- check the query only read 2 granules (4 rows total; each granule has 2 rows) SYSTEM FLUSH LOGS; SELECT read_rows==4 from system.query_log WHERE query_kind ='Select' AND current_database = currentDatabase() - AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE multiSearchAny(s, [\'a01\', \'b01\']) ORDER BY k;') + AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE multiSearchAny(s, [\' a01 \', \' b01 \']) ORDER BY k;') AND type='QueryFinish' AND result_rows==2 LIMIT 1; @@ -126,14 +126,14 @@ INSERT INTO tab SELECT rowNumberInBlock(), groupArray(s) FROM tab_x GROUP BY k%1 SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1; -- search full_text index with has -SELECT * FROM tab WHERE has(s, 'Click a03') ORDER BY k; +SELECT * FROM tab WHERE has(s, 'x Click a03 y') ORDER BY k; -- check the query only read 1 granule (2 rows total; each granule has 2 rows) SYSTEM FLUSH LOGS; SELECT read_rows==2 from system.query_log WHERE query_kind ='Select' AND current_database = currentDatabase() - AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE has(s, \'Click a03\') ORDER BY k;') + AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE has(s, \'x Click a03 y\') ORDER BY k;') AND type='QueryFinish' AND result_rows==1 LIMIT 1; diff --git a/tests/queries/0_stateless/02864_statistic_exception.sql b/tests/queries/0_stateless/02864_statistic_exception.sql deleted file mode 100644 index 092fa9bda85..00000000000 --- a/tests/queries/0_stateless/02864_statistic_exception.sql +++ /dev/null @@ -1,53 +0,0 @@ -DROP TABLE IF EXISTS t1; - -CREATE TABLE t1 -( - a Float64 STATISTIC(tdigest), - b Int64 STATISTIC(tdigest), - pk String, -) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } - -SET allow_experimental_statistic = 1; - -CREATE TABLE t1 -( - a Float64 STATISTIC(tdigest), - b Int64, - pk String STATISTIC(tdigest), -) Engine = MergeTree() ORDER BY pk; -- { serverError ILLEGAL_STATISTIC } - -CREATE TABLE t1 -( - a Float64 STATISTIC(tdigest, tdigest(10)), - b Int64, -) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } - -CREATE TABLE t1 -( - a Float64 STATISTIC(xyz), - b Int64, -) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } - -CREATE TABLE t1 -( - a Float64, - b Int64, - pk String, -) Engine = MergeTree() ORDER BY pk; - -ALTER TABLE t1 ADD STATISTIC a TYPE xyz; -- { serverError INCORRECT_QUERY } -ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; -ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 ADD STATISTIC pk TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 DROP STATISTIC b TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 DROP STATISTIC a TYPE tdigest; -ALTER TABLE t1 DROP STATISTIC a TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 CLEAR STATISTIC a TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 MATERIALIZE STATISTIC b TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } - -ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; -ALTER TABLE t1 ADD STATISTIC b TYPE tdigest; -ALTER TABLE t1 MODIFY COLUMN a Float64 TTL toDateTime(b) + INTERVAL 1 MONTH; -ALTER TABLE t1 MODIFY COLUMN a Int64; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } - -DROP TABLE t1; diff --git
a/tests/queries/0_stateless/02864_statistics_exception.reference b/tests/queries/0_stateless/02864_statistics_exception.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02864_statistics_exception.sql b/tests/queries/0_stateless/02864_statistics_exception.sql new file mode 100644 index 00000000000..c531d39cd69 --- /dev/null +++ b/tests/queries/0_stateless/02864_statistics_exception.sql @@ -0,0 +1,57 @@ +DROP TABLE IF EXISTS t1; + +CREATE TABLE t1 +( + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), + pk String, +) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } + +SET allow_experimental_statistics = 1; + +CREATE TABLE t1 +( + a Float64 STATISTICS(tdigest), + b Int64, + pk String STATISTICS(tdigest), +) Engine = MergeTree() ORDER BY pk; -- { serverError ILLEGAL_STATISTICS } + +CREATE TABLE t1 +( + a Float64 STATISTICS(tdigest, tdigest(10)), + b Int64, +) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } + +CREATE TABLE t1 +( + a Float64 STATISTICS(xyz), + b Int64, +) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } + +CREATE TABLE t1 +( + a Float64, + b Int64, + pk String, +) Engine = MergeTree() ORDER BY pk; + +ALTER TABLE t1 ADD STATISTICS a TYPE xyz; -- { serverError INCORRECT_QUERY } +ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS IF NOT EXISTS a TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; -- { serverError ILLEGAL_STATISTICS } +ALTER TABLE t1 MODIFY STATISTICS a TYPE tdigest; +-- Statistics can be created only on numeric columns +ALTER TABLE t1 ADD STATISTICS pk TYPE tdigest; -- { serverError ILLEGAL_STATISTICS } +ALTER TABLE t1 DROP STATISTICS b; -- { serverError ILLEGAL_STATISTICS } +ALTER TABLE t1 DROP STATISTICS a; +ALTER TABLE t1 DROP STATISTICS IF EXISTS a; +ALTER TABLE t1 CLEAR STATISTICS a; -- { serverError ILLEGAL_STATISTICS } +ALTER TABLE t1 CLEAR STATISTICS IF EXISTS a; +ALTER TABLE t1 MATERIALIZE STATISTICS b; -- { serverError ILLEGAL_STATISTICS } + +ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS b TYPE tdigest; +ALTER TABLE t1 MODIFY COLUMN a Float64 TTL toDateTime(b) + INTERVAL 1 MONTH; +ALTER TABLE t1 MODIFY COLUMN a Int64; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } + +DROP TABLE t1; diff --git a/tests/queries/0_stateless/02864_statistic_operate.reference b/tests/queries/0_stateless/02864_statistics_operate.reference similarity index 58% rename from tests/queries/0_stateless/02864_statistic_operate.reference rename to tests/queries/0_stateless/02864_statistics_operate.reference index 3e291485031..6398a9bd000 100644 --- a/tests/queries/0_stateless/02864_statistic_operate.reference +++ b/tests/queries/0_stateless/02864_statistics_operate.reference @@ -1,4 +1,4 @@ -CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `b` Int64 STATISTICS(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After insert Prewhere info Prewhere filter @@ -12,7 +12,7 @@ After drop statistic 10 CREATE TABLE default.t1\n(\n `a` Float64,\n `b` Int64,\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After add statistic -CREATE TABLE default.t1\n(\n
`a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `b` Int64 STATISTICS(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After materialize statistic Prewhere info Prewhere filter @@ -23,7 +23,7 @@ After merge Prewhere filter Prewhere filter column: and(less(a, 10), less(b, 10)) (removed) 20 -CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `c` Int64 STATISTIC(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `c` Int64 STATISTICS(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After rename Prewhere info Prewhere filter diff --git a/tests/queries/0_stateless/02864_statistic_operate.sql b/tests/queries/0_stateless/02864_statistics_operate.sql similarity index 87% rename from tests/queries/0_stateless/02864_statistic_operate.sql rename to tests/queries/0_stateless/02864_statistics_operate.sql index 5f1c30f8eec..bf69c11bc91 100644 --- a/tests/queries/0_stateless/02864_statistic_operate.sql +++ b/tests/queries/0_stateless/02864_statistics_operate.sql @@ -1,12 +1,12 @@ DROP TABLE IF EXISTS t1; -SET allow_experimental_statistic = 1; -SET allow_statistic_optimize = 1; +SET allow_experimental_statistics = 1; +SET allow_statistics_optimize = 1; CREATE TABLE t1 ( - a Float64 STATISTIC(tdigest), - b Int64 STATISTIC(tdigest), + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), pk String, ) Engine = MergeTree() ORDER BY pk SETTINGS min_bytes_for_wide_part = 0; @@ -20,7 +20,7 @@ SELECT replaceRegexpAll(explain, '__table1\.|_UInt8', '') FROM (EXPLAIN actions= SELECT count(*) FROM t1 WHERE b < 10 and a < 10; SELECT count(*) FROM t1 WHERE b < NULL and a < '10'; -ALTER TABLE t1 DROP STATISTIC a, b TYPE tdigest; +ALTER TABLE t1 DROP STATISTICS a, b; SELECT 'After drop statistic'; SELECT replaceRegexpAll(explain, '__table1\.|_UInt8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; @@ -28,13 +28,13 @@ SELECT count(*) FROM t1 WHERE b < 10 and a < 10; SHOW CREATE TABLE t1; -ALTER TABLE t1 ADD STATISTIC a, b TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS a, b TYPE tdigest; SELECT 'After add statistic'; SHOW CREATE TABLE t1; -ALTER TABLE t1 MATERIALIZE STATISTIC a, b TYPE tdigest; +ALTER TABLE t1 MATERIALIZE STATISTICS a, b; INSERT INTO t1 select number, -number, generateUUIDv4() FROM system.numbers LIMIT 10000; SELECT 'After materialize statistic'; diff --git a/tests/queries/0_stateless/02864_statistics_uniq.reference b/tests/queries/0_stateless/02864_statistics_uniq.reference new file mode 100644 index 00000000000..77786dbdd8c --- /dev/null +++ b/tests/queries/0_stateless/02864_statistics_uniq.reference @@ -0,0 +1,35 @@ +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `b` Int64 STATISTICS(tdigest),\n `c` Int64 STATISTICS(tdigest, uniq),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +After insert + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed) + Prewhere info + Prewhere filter 
+ Prewhere filter column: and(equals(c, 11), less(a, 10), less(b, 10)) (removed) +After merge + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(c, 11), less(a, 10), less(b, 10)) (removed) +After modify TDigest + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 11), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(c, -1), less(a, 10), less(b, 10)) (removed) +After drop + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 11), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), less(c, -1), less(b, 10)) (removed) diff --git a/tests/queries/0_stateless/02864_statistics_uniq.sql b/tests/queries/0_stateless/02864_statistics_uniq.sql new file mode 100644 index 00000000000..c6b51d2a377 --- /dev/null +++ b/tests/queries/0_stateless/02864_statistics_uniq.sql @@ -0,0 +1,71 @@ +DROP TABLE IF EXISTS t1; + +SET allow_experimental_statistics = 1; +SET allow_statistics_optimize = 1; + +CREATE TABLE t1 +( + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), + c Int64 STATISTICS(tdigest, uniq), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; + +SHOW CREATE TABLE t1; + +INSERT INTO t1 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000; +INSERT INTO t1 select 0, 0, 11, generateUUIDv4(); + +SELECT 'After insert'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +OPTIMIZE TABLE t1 FINAL; + +SELECT 'After merge'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + +SELECT 'After modify TDigest'; +ALTER TABLE t1 MODIFY STATISTICS c TYPE TDigest; +ALTER TABLE t1 MATERIALIZE STATISTICS c; + +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + + +ALTER TABLE t1 DROP STATISTICS c; + +SELECT 
'After drop'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; +SET allow_suspicious_low_cardinality_types=1; +CREATE TABLE t2 +( + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), + c LowCardinality(Int64) STATISTICS(tdigest, uniq), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; +INSERT INTO t2 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000; + +DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS t3; + +CREATE TABLE t3 +( + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), + c Nullable(Int64) STATISTICS(tdigest, uniq), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; +INSERT INTO t3 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000; + +DROP TABLE IF EXISTS t3; diff --git a/tests/queries/0_stateless/02915_analyzer_fuzz_1.reference b/tests/queries/0_stateless/02915_analyzer_fuzz_1.reference new file mode 100644 index 00000000000..ac3f57c1a2e --- /dev/null +++ b/tests/queries/0_stateless/02915_analyzer_fuzz_1.reference @@ -0,0 +1 @@ +With ba\0 diff --git a/tests/queries/0_stateless/02915_analyzer_fuzz_1.sql b/tests/queries/0_stateless/02915_analyzer_fuzz_1.sql new file mode 100644 index 00000000000..94849453063 --- /dev/null +++ b/tests/queries/0_stateless/02915_analyzer_fuzz_1.sql @@ -0,0 +1,2 @@ +set allow_experimental_analyzer=1; +SELECT concat('With ', materialize(_CAST('ba\0', 'LowCardinality(FixedString(3))'))) AS `concat('With ', materialize(CAST('ba\\0', 'LowCardinality(FixedString(3))')))` FROM system.one GROUP BY 'With '; diff --git a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference index 0e1954cde62..5b7ad7ddce0 100644 --- a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference +++ b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference @@ -1,5 +1,5 @@ -1 Hello ClickHouse -2 Hello World +1 Well, Hello ClickHouse ! +2 Well, Hello World ! 1 Hello ClickHouse 2 Hello World Granules: 6/6 @@ -11,9 +11,9 @@ Granules: 6/6 Granules: 2/6 --- -1 Hello ClickHouse -2 Hello World -6 World Champion +1 Well, Hello ClickHouse ! +2 Well, Hello World ! 
+6 True World Champion 1 Hello ClickHouse 2 Hello World 6 World Champion @@ -26,7 +26,7 @@ Granules: 6/6 Granules: 3/6 --- -5 OLAP Database +5 Its An OLAP Database 5 OLAP Database Granules: 6/6 Granules: 1/6 diff --git a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql index 49d39c601ef..42175cbb2c6 100644 --- a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql +++ b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql @@ -21,21 +21,22 @@ ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 1; -INSERT INTO tokenbf_tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); +INSERT INTO tokenbf_tab VALUES (1, 'Well, Hello ClickHouse !'), (2, 'Well, Hello World !'), (3, 'Good Weather !'), (4, 'Say Hello !'), (5, 'Its An OLAP Database'), (6, 'True World Champion'); INSERT INTO ngrambf_tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); -SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; +SELECT * FROM tokenbf_tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id; SELECT * FROM ngrambf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; -- Read 2/6 granules -- Required string: 'Hello ' -- Alternatives: 'Hello ClickHouse', 'Hello World' +-- Surrounded by spaces for tokenbf SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -46,7 +47,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -78,18 +79,19 @@ SETTINGS SELECT '---'; -SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; +SELECT * FROM tokenbf_tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id; SELECT * FROM ngrambf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; -- Read 3/6 granules -- Required string: - -- Alternatives: 'ClickHouse', 'World' +-- Surrounded by spaces for tokenbf SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -100,7 +102,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -131,18 +133,19 @@ SETTINGS SELECT '---'; -SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP.*') ORDER BY id; +SELECT * FROM tokenbf_tab WHERE match(str, ' OLAP .*') ORDER BY id; SELECT * FROM ngrambf_tab WHERE match(str, 'OLAP.*') ORDER BY id; -- Read 1/6 granules -- Required string: 'OLAP' -- Alternatives: - +-- Surrounded by spaces for tokenbf SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' OLAP 
(.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -152,7 +155,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' diff --git a/tests/queries/0_stateless/02995_baseline_23_12_1.tsv b/tests/queries/0_stateless/02995_baseline_23_12_1.tsv index 4c0c9125b46..a391473e7c9 100644 --- a/tests/queries/0_stateless/02995_baseline_23_12_1.tsv +++ b/tests/queries/0_stateless/02995_baseline_23_12_1.tsv @@ -41,7 +41,7 @@ allow_experimental_query_deduplication 0 allow_experimental_refreshable_materialized_view 0 allow_experimental_s3queue 1 allow_experimental_shared_merge_tree 0 -allow_experimental_statistic 0 +allow_experimental_statistics 0 allow_experimental_undrop_table_query 1 allow_experimental_usearch_index 0 allow_experimental_window_functions 1 @@ -58,7 +58,7 @@ allow_prefetched_read_pool_for_remote_filesystem 1 allow_push_predicate_when_subquery_contains_with 1 allow_settings_after_format_in_insert 0 allow_simdjson 1 -allow_statistic_optimize 0 +allow_statistics_optimize 0 allow_suspicious_codecs 0 allow_suspicious_fixed_string_types 0 allow_suspicious_indices 0 diff --git a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference index 806596f8a63..46f24f73356 100644 --- a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference +++ b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference @@ -38,6 +38,17 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 LEFT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 key1 a 1 1 2 key1 B 2 1 2 @@ -67,6 +78,16 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 INNER JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 INNER JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -102,6 +123,17 @@ key1 c 3 2 1 key1 D 
4 1 6 SELECT t1.*, t2.* from t1 RIGHT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 RIGHT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -146,6 +178,18 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 FULL JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 FULL JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SET join_algorithm='grace_hash'; SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 @@ -185,6 +229,17 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 LEFT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 key1 a 1 1 2 key1 B 2 1 2 @@ -214,6 +269,16 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 INNER JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 INNER JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 
+key1 e 5 5 5 key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -249,6 +314,17 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 RIGHT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 RIGHT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -293,6 +369,18 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 FULL JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 FULL JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SET join_algorithm='hash'; SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND t1.a < t2.a OR t1.a = t2.a ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 diff --git a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 index d3aa74f5c38..a363101ca69 100644 --- a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 +++ b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 @@ -18,6 +18,7 @@ SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and (t1.b + SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and (t1.a < t2.a) ORDER BY (t1.key, t1.attr, t2.key, t2.attr); SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 {{ join_type }} JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); +SELECT t1.*, t2.* FROM t1 {{ join_type }} JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; {% endfor -%} {% endfor -%} diff --git a/tests/queries/0_stateless/03164_materialize_statistics.reference b/tests/queries/0_stateless/03164_materialize_statistics.reference index c209d2e8b63..5e969cf41cb 100644 --- a/tests/queries/0_stateless/03164_materialize_statistics.reference +++ 
b/tests/queries/0_stateless/03164_materialize_statistics.reference @@ -1,10 +1,10 @@ 10 10 10 -statistic not used Condition less(b, 10_UInt8) moved to PREWHERE -statistic not used Condition less(a, 10_UInt8) moved to PREWHERE -statistic used after merge Condition less(a, 10_UInt8) moved to PREWHERE -statistic used after merge Condition less(b, 10_UInt8) moved to PREWHERE -statistic used after materialize Condition less(a, 10_UInt8) moved to PREWHERE -statistic used after materialize Condition less(b, 10_UInt8) moved to PREWHERE +statistics not used Condition less(b, 10_UInt8) moved to PREWHERE +statistics not used Condition less(a, 10_UInt8) moved to PREWHERE +statistics used after merge Condition less(a, 10_UInt8) moved to PREWHERE +statistics used after merge Condition less(b, 10_UInt8) moved to PREWHERE +statistics used after materialize Condition less(a, 10_UInt8) moved to PREWHERE +statistics used after materialize Condition less(b, 10_UInt8) moved to PREWHERE 2 0 diff --git a/tests/queries/0_stateless/03164_materialize_statistics.sql b/tests/queries/0_stateless/03164_materialize_statistics.sql index 763644d16ab..43c5724dd59 100644 --- a/tests/queries/0_stateless/03164_materialize_statistics.sql +++ b/tests/queries/0_stateless/03164_materialize_statistics.sql @@ -1,34 +1,34 @@ -DROP TABLE IF EXISTS t_statistic_materialize; +DROP TABLE IF EXISTS t_statistics_materialize; SET allow_experimental_analyzer = 1; -SET allow_experimental_statistic = 1; -SET allow_statistic_optimize = 1; +SET allow_experimental_statistics = 1; +SET allow_statistics_optimize = 1; SET materialize_statistics_on_insert = 0; -CREATE TABLE t_statistic_materialize +CREATE TABLE t_statistics_materialize ( - a Int64 STATISTIC(tdigest), - b Int16 STATISTIC(tdigest), + a Int64 STATISTICS(tdigest), + b Int16 STATISTICS(tdigest), ) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0, enable_vertical_merge_algorithm = 0; -- TODO: there is a bug in vertical merge with statistics. 
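+-- With materialize_statistics_on_insert = 0, the INSERTs below do not build statistics; queries can use them only after OPTIMIZE ... FINAL or an explicit MATERIALIZE STATISTICS.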
-INSERT INTO t_statistic_materialize SELECT number, -number FROM system.numbers LIMIT 10000; +INSERT INTO t_statistics_materialize SELECT number, -number FROM system.numbers LIMIT 10000; -SELECT count(*) FROM t_statistic_materialize WHERE b < 10 and a < 10 SETTINGS log_comment = 'statistic not used'; +SELECT count(*) FROM t_statistics_materialize WHERE b < 10 and a < 10 SETTINGS log_comment = 'statistics not used'; -OPTIMIZE TABLE t_statistic_materialize FINAL; +OPTIMIZE TABLE t_statistics_materialize FINAL; -SELECT count(*) FROM t_statistic_materialize WHERE b < 10 and a < 10 SETTINGS log_comment = 'statistic used after merge'; +SELECT count(*) FROM t_statistics_materialize WHERE b < 10 and a < 10 SETTINGS log_comment = 'statistics used after merge'; -TRUNCATE TABLE t_statistic_materialize; +TRUNCATE TABLE t_statistics_materialize; SET mutations_sync = 2; -INSERT INTO t_statistic_materialize SELECT number, -number FROM system.numbers LIMIT 10000; -ALTER TABLE t_statistic_materialize MATERIALIZE STATISTIC a, b TYPE tdigest; +INSERT INTO t_statistics_materialize SELECT number, -number FROM system.numbers LIMIT 10000; +ALTER TABLE t_statistics_materialize MATERIALIZE STATISTICS a, b; -SELECT count(*) FROM t_statistic_materialize WHERE b < 10 and a < 10 SETTINGS log_comment = 'statistic used after materialize'; +SELECT count(*) FROM t_statistics_materialize WHERE b < 10 and a < 10 SETTINGS log_comment = 'statistics used after materialize'; -DROP TABLE t_statistic_materialize; +DROP TABLE t_statistics_materialize; SYSTEM FLUSH LOGS; @@ -36,7 +36,7 @@ SELECT log_comment, message FROM system.text_log JOIN ( SELECT Settings['log_comment'] AS log_comment, query_id FROM system.query_log WHERE current_database = currentDatabase() - AND query LIKE 'SELECT count(*) FROM t_statistic_materialize%' + AND query LIKE 'SELECT count(*) FROM t_statistics_materialize%' AND type = 'QueryFinish' ) AS query_log USING (query_id) WHERE message LIKE '%moved to PREWHERE%' @@ -45,5 +45,5 @@ ORDER BY event_time_microseconds; SELECT count(), sum(ProfileEvents['MergeTreeDataWriterStatisticsCalculationMicroseconds']) FROM system.query_log WHERE current_database = currentDatabase() - AND query LIKE 'INSERT INTO t_statistic_materialize SELECT%' + AND query LIKE 'INSERT INTO t_statistics_materialize SELECT%' AND type = 'QueryFinish'; diff --git a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.reference b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.reference new file mode 100644 index 00000000000..4fb6812cb4f --- /dev/null +++ b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.reference @@ -0,0 +1,83 @@ +-------- Bloom filter -------- + +-- No skip for prefix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for prefix with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for suffix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for suffix with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for substring +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for substring with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple substrings +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for multiple substrings with complete tokens +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple non-existing substrings, only one with complete token +Parts: 1/1 +Parts: 1/1 + +-------- GIN filter -------- + +-- No skip for prefix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for prefix with
complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for suffix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for suffix with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for substring +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for substring with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple substrings +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for multiple substrings with complete tokens +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple non-existing substrings, only one with complete token +Parts: 1/1 +Parts: 1/1 diff --git a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql new file mode 100644 index 00000000000..fee30af0245 --- /dev/null +++ b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql @@ -0,0 +1,229 @@ +SELECT '-------- Bloom filter --------'; +SELECT ''; +DROP TABLE IF EXISTS 03165_token_bf; + +SET allow_experimental_full_text_index=1; + +CREATE TABLE 03165_token_bf +( + id Int64, + message String, + INDEX idx_message message TYPE tokenbf_v1(32768, 3, 2) GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY id; + +INSERT INTO 03165_token_bf VALUES(1, 'Service is not ready'); + +SELECT '-- No skip for prefix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv'); + +SELECT ''; +SELECT '-- Skip for prefix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv i') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv i'); + +SELECT ''; +SELECT '-- No skip for suffix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE endsWith(message, 'eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE endsWith(message, 'eady'); + +SELECT ''; +SELECT '-- Skip for suffix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE endsWith(message, ' eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE endsWith(message, ' eady'); + +SELECT ''; +SELECT '-- No skip for substring'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE match(message, 'no') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE match(message, 'no'); + +SELECT ''; +SELECT '-- Skip for substring with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE match(message, ' xyz ') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE match(message, ' xyz '); + +SELECT ''; +SELECT '-- No skip for multiple substrings'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, ['ce', 'no']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, ['ce', 'no']); + +SELECT ''; +SELECT '-- Skip for multiple substrings with complete tokens'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', ' yz ']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', ' yz ']); + 
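+-- (tokenbf_v1 splits strings into tokens at non-alphanumeric characters, so a needle wrapped in spaces such as ' wx ' must match a complete token and the filter can prove it absent; a bare substring like 'yz' may be part of a longer token and cannot be ruled out.)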
+SELECT ''; +SELECT '-- No skip for multiple non-existing substrings, only one with complete token'; SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', 'yz']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', 'yz']); + +DROP TABLE IF EXISTS 03165_token_bf; + +SELECT ''; +SELECT '-------- GIN filter --------'; +SELECT ''; + +SET allow_experimental_inverted_index=1; +DROP TABLE IF EXISTS 03165_token_ft; +CREATE TABLE 03165_token_ft +( + id Int64, + message String, + INDEX idx_message message TYPE full_text() GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY id; + +INSERT INTO 03165_token_ft VALUES(1, 'Service is not ready'); + +SELECT '-- No skip for prefix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv'); + +SELECT ''; +SELECT '-- Skip for prefix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv i') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv i'); + +SELECT ''; +SELECT '-- No skip for suffix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE endsWith(message, 'eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE endsWith(message, 'eady'); + +SELECT ''; +SELECT '-- Skip for suffix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE endsWith(message, ' eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE endsWith(message, ' eady'); + +SELECT ''; +SELECT '-- No skip for substring'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE match(message, 'no') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE match(message, 'no'); + +SELECT ''; +SELECT '-- Skip for substring with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE match(message, ' xyz ') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE match(message, ' xyz '); + +SELECT ''; +SELECT '-- No skip for multiple substrings'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, ['ce', 'no']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, ['ce', 'no']); + +SELECT ''; +SELECT '-- Skip for multiple substrings with complete tokens'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', ' yz ']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', ' yz ']); + +SELECT ''; +SELECT '-- No skip for multiple non-existing substrings, only one with complete token'; +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', 'yz']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', 'yz']); diff --git a/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.reference b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.reference new file mode 100644 index 00000000000..9daeafb9864 --- /dev/null
+++ b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.reference @@ -0,0 +1 @@ +test diff --git a/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql new file mode 100644 index 00000000000..6f563d8f2a1 --- /dev/null +++ b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql @@ -0,0 +1 @@ +SELECT ‘test’ AS “column” \ No newline at end of file diff --git a/tests/sqllogic/test_parser.py b/tests/sqllogic/test_parser.py index c0abcaecd25..bd30674b23a 100755 --- a/tests/sqllogic/test_parser.py +++ b/tests/sqllogic/test_parser.py @@ -526,7 +526,7 @@ class QueryResult: for row in rows: res_row = [] for c, t in zip(row, types): - logger.debug("Builging row. c:%s t:%s", c, t) + logger.debug("Building row. c:%s t:%s", c, t) if c is None: res_row.append("NULL") continue diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index c35e860a5d7..49f43615c7e 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2000,6 +2000,8 @@ minmax mins misconfiguration mispredictions +mlock +mlockall mmap mmapped modularization
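Taken together, the statistics hunks above rename the column-level feature from STATISTIC to STATISTICS, pluralize the settings (allow_experimental_statistics, allow_statistics_optimize), and drop the TYPE clause from DROP, CLEAR and MATERIALIZE. A minimal sketch of the new spelling, mirroring only statements exercised in the tests above (the table name t_example is illustrative):

SET allow_experimental_statistics = 1;
SET allow_statistics_optimize = 1;

-- Declare per-column statistics at CREATE time (tdigest, as used throughout the tests).
CREATE TABLE t_example
(
    a Float64 STATISTICS(tdigest),
    b Int64,
    pk String
) Engine = MergeTree() ORDER BY pk;

ALTER TABLE t_example ADD STATISTICS b TYPE tdigest;
ALTER TABLE t_example MATERIALIZE STATISTICS a, b; -- no TYPE clause in the new syntax
ALTER TABLE t_example DROP STATISTICS a;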