diff --git a/contrib/orc b/contrib/orc index bcc025c0982..223e1957ff3 160000 --- a/contrib/orc +++ b/contrib/orc @@ -1 +1 @@ -Subproject commit bcc025c09828c556f54cfbdf83a66b9acae7d17f +Subproject commit 223e1957ff308f2aec7793884ffb0d7692d48484 diff --git a/docs/en/development/build-osx.md b/docs/en/development/build-osx.md index 03abcb3f820..cee2693e345 100644 --- a/docs/en/development/build-osx.md +++ b/docs/en/development/build-osx.md @@ -42,15 +42,14 @@ brew install ccache cmake ninja libtool gettext llvm gcc binutils grep findutils ## Checkout ClickHouse Sources {#checkout-clickhouse-sources} -:::note -The ClickHouse build assumes a case-sensitive file system. Case-insensitive file systems may cause errors during the build process. If necessary, please follow [these instructions](https://brianboyko.medium.com/a-case-sensitive-src-folder-for-mac-programmers-176cc82a3830) to create a new disk image and checkout the code into it. -::: - ``` bash git clone --recursive git@github.com:ClickHouse/ClickHouse.git # ...alternatively, you can use https://github.com/ClickHouse/ClickHouse.git as the repo URL. ``` +Apple uses a case-insensitive file system by default. While this usually does not affect compilation (especially scratch makes will work), it can confuse file operations like `git mv`. +For serious development on macOS, make sure that the source code is stored on a case-sensitive disk volume, e.g. see [these instructions](https://brianboyko.medium.com/a-case-sensitive-src-folder-for-mac-programmers-176cc82a3830). + ## Build ClickHouse {#build-clickhouse} To build using Homebrew's vanilla Clang compiler (the only **recommended** way): diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 614f0a72c13..dc12a60e8ef 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -56,7 +56,7 @@ Parameters: distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance)- the angle between two non-zero vectors). - `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing the vector with reduced precision (optional, default: `bf16`) - `hnsw_max_connections_per_layer`: the number of neighbors per HNSW graph node, also known as `M` in the [HNSW - paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 16) + paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 32) - `hnsw_candidate_list_size_for_construction`: the size of the dynamic candidate list when constructing the HNSW graph, also known as `ef_construction` in the original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 128) @@ -143,7 +143,7 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista ``` ::: -To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 64), also known as `ef_search` in the +To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 256), also known as `ef_search` in the original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), run the `SELECT` query with `SETTINGS hnsw_candidate_list_size_for_search = `. 
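For illustration, a minimal sketch of such a query (the table name `tab`, the vector column `vec`, the reference vector, and the value 512 are assumptions for this example, not taken from the patch):

```sql
-- Hypothetical table and column names; 512 is an example ef_search value.
SELECT *
FROM tab
ORDER BY L2Distance(vec, [0.0, 2.0])
LIMIT 10
SETTINGS hnsw_candidate_list_size_for_search = 512;
```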
diff --git a/docs/en/getting-started/example-datasets/tpch.md b/docs/en/getting-started/example-datasets/tpch.md index c675ac6f594..5fa0d779ecd 100644 --- a/docs/en/getting-started/example-datasets/tpch.md +++ b/docs/en/getting-started/example-datasets/tpch.md @@ -10,12 +10,15 @@ A popular benchmark which models the internal data warehouse of a wholesale supp The data is stored into a 3rd normal form representation, requiring lots of joins at query runtime. Despite its age and its unrealistic assumption that the data is uniformly and independently distributed, TPC-H remains the most popular OLAP benchmark to date. -References +**References** + - [TPC-H](https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) - [New TPC Benchmarks for Decision Support and Web Commerce](https://doi.org/10.1145/369275.369291) (Poess et. al., 2000) - [TPC-H Analyzed: Hidden Messages and Lessons Learned from an Influential Benchmark](https://doi.org/10.1007/978-3-319-04936-6_5) (Boncz et. al.), 2013 - [Quantifying TPC-H Choke Points and Their Optimizations](https://doi.org/10.14778/3389133.3389138) (Dresseler et. al.), 2020 +## Data Generation and Import + First, checkout the TPC-H repository and compile the data generator: ``` bash @@ -30,7 +33,17 @@ Then, generate the data. Parameter `-s` specifies the scale factor. For example, ./dbgen -s 100 ``` -Now create tables in ClickHouse: +Now create tables in ClickHouse. + +We stick as closely as possible to the rules of the TPC-H specification: +- Primary keys are created only for the columns mentioned in section 1.4.2.2 of the specification. +- Substitution parameters were replaced by the values for query validation in sections 2.1.x.4 of the specification. +- As per section 1.4.2.1, the table definitions do not use the optional `NOT NULL` constraints, even if `dbgen` generates them by default. + The performance of `SELECT` queries in ClickHouse is not affected by the presence or absence of `NOT NULL` constraints. +- As per section 1.3.1, we use ClickHouse's native datatypes (e.g. `Int32`, `String`) to implement the abstract datatypes mentioned in the + specification (e.g. `Identifier`, `Variable text, size N`). The only effect of this is better readability, the SQL-92 datatypes generated + by `dbgen` (e.g. `INTEGER`, `VARCHAR(40)`) would also work in ClickHouse. 
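To illustrate the last point, a hedged sketch of the `nation` table spelled with the SQL-92 type names instead (the table name and column lengths are illustrative and not part of the benchmark schema; the `ENGINE` clause is written out only to keep the sketch self-contained):

```sql
-- INTEGER and VARCHAR(N) are accepted as aliases of Int32 and String;
-- ClickHouse parses the length argument for compatibility but does not enforce it.
CREATE TABLE nation_sql92 (
    n_nationkey INTEGER,
    n_name      VARCHAR(25),
    n_regionkey INTEGER,
    n_comment   VARCHAR(152))
ENGINE = MergeTree
ORDER BY (n_nationkey);
```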
+ ```sql CREATE TABLE nation ( @@ -38,13 +51,13 @@ CREATE TABLE nation ( n_name String, n_regionkey Int32, n_comment String) -ORDER BY (n_regionkey, n_name); +ORDER BY (n_nationkey); CREATE TABLE region ( r_regionkey Int32, r_name String, r_comment String) -ORDER BY (r_name); +ORDER BY (r_regionkey); CREATE TABLE part ( p_partkey Int32, @@ -56,7 +69,7 @@ CREATE TABLE part ( p_container String, p_retailprice Decimal(15,2), p_comment String) -ORDER BY (p_mfgr, p_brand, p_type, p_name); +ORDER BY (p_partkey); CREATE TABLE supplier ( s_suppkey Int32, @@ -66,7 +79,7 @@ CREATE TABLE supplier ( s_phone String, s_acctbal Decimal(15,2), s_comment String) -ORDER BY (s_nationkey, s_address, s_name); +ORDER BY (s_suppkey); CREATE TABLE partsupp ( ps_partkey Int32, @@ -74,7 +87,7 @@ CREATE TABLE partsupp ( ps_availqty Int32, ps_supplycost Decimal(15,2), ps_comment String) -ORDER BY (ps_suppkey, ps_availqty, ps_supplycost, ps_partkey); +ORDER BY (ps_partkey, ps_suppkey); CREATE TABLE customer ( c_custkey Int32, @@ -85,7 +98,7 @@ CREATE TABLE customer ( c_acctbal Decimal(15,2), c_mktsegment String, c_comment String) -ORDER BY (c_nationkey, c_mktsegment, c_address, c_name, c_custkey); +ORDER BY (c_custkey); CREATE TABLE orders ( o_orderkey Int32, @@ -97,7 +110,10 @@ CREATE TABLE orders ( o_clerk String, o_shippriority Int32, o_comment String) -ORDER BY (o_orderdate, o_orderstatus, o_custkey); +ORDER BY (o_orderkey); +-- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 4.5 in +-- "Quantifying TPC-H Choke Points and Their Optimizations": +-- ORDER BY (o_orderdate, o_orderkey); CREATE TABLE lineitem ( l_orderkey Int32, @@ -116,7 +132,10 @@ CREATE TABLE lineitem ( l_shipinstruct String, l_shipmode String, l_comment String) -ORDER BY (l_suppkey, l_partkey, l_shipdate, l_commitdate, l_receiptdate); +ORDER BY (l_orderkey, l_linenumber); +-- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 4.5 in +-- "Quantifying TPC-H Choke Points and Their Optimizations": +-- ORDER BY (l_shipdate, l_orderkey, l_linenumber); ``` The data can be imported as follows: @@ -127,19 +146,21 @@ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO region FORMAT clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO part FORMAT CSV" < part.tbl clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO partsupp FORMAT CSV" < partsupp.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customers FORMAT CSV" < customers.tbl +clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer FORMAT CSV" < customer.tbl clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO orders FORMAT CSV" < orders.tbl clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO lineitem FORMAT CSV" < lineitem.tbl ``` The queries are generated by `./qgen -s `. Example queries for `s = 100`: -::::warning -TPC-H makes heavy use of correlated subqueries which are at the time of writing (September 2024) not supported by ClickHouse ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)). -As a result, many of below benchmark queries will fail with errors. -:::: +## Queries -Q1 +**Correctness** + +The result of the queries agrees with the official results unless mentioned otherwise. 
To verify, generate a TPC-H database with scale +factor = 1 (`dbgen`, see above) and compare with the [expected results in tpch-kit](https://github.com/gregrahn/tpch-kit/tree/master/dbgen/answers). + +**Q1** ```sql SELECT @@ -156,7 +177,7 @@ SELECT FROM lineitem WHERE - l_shipdate <= date '1998-12-01' - interval '100' day + l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY GROUP BY l_returnflag, l_linestatus @@ -165,7 +186,7 @@ ORDER BY l_linestatus; ``` -Q2 +**Q2** ```sql SELECT @@ -186,11 +207,11 @@ FROM WHERE p_partkey = ps_partkey AND s_suppkey = ps_suppkey - AND p_size = 21 - AND p_type LIKE '%COPPER' + AND p_size = 15 + AND p_type LIKE '%BRASS' AND s_nationkey = n_nationkey AND n_regionkey = r_regionkey - AND r_name = 'AMERICA' + AND r_name = 'EUROPE' AND ps_supplycost = ( SELECT min(ps_supplycost) @@ -204,17 +225,72 @@ WHERE AND s_suppkey = ps_suppkey AND s_nationkey = n_nationkey AND n_regionkey = r_regionkey - AND r_name = 'AMERICA' + AND r_name = 'EUROPE' ) ORDER BY - s_acctbal desc, + s_acctbal DESC, n_name, s_name, - p_partkey -LIMIT 100; + p_partkey; ``` -Q3 +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. + +```sql +WITH MinSupplyCost AS ( + SELECT + ps_partkey, + MIN(ps_supplycost) AS min_supplycost + FROM + partsupp ps + JOIN + supplier s ON ps.ps_suppkey = s.s_suppkey + JOIN + nation n ON s.s_nationkey = n.n_nationkey + JOIN + region r ON n.n_regionkey = r.r_regionkey + WHERE + r.r_name = 'EUROPE' + GROUP BY + ps_partkey +) +SELECT + s.s_acctbal, + s.s_name, + n.n_name, + p.p_partkey, + p.p_mfgr, + s.s_address, + s.s_phone, + s.s_comment +FROM + part p +JOIN + partsupp ps ON p.p_partkey = ps.ps_partkey +JOIN + supplier s ON s.s_suppkey = ps.ps_suppkey +JOIN + nation n ON s.s_nationkey = n.n_nationkey +JOIN + region r ON n.n_regionkey = r.r_regionkey +JOIN + MinSupplyCost msc ON ps.ps_partkey = msc.ps_partkey AND ps.ps_supplycost = msc.min_supplycost +WHERE + p.p_size = 15 + AND p.p_type LIKE '%BRASS' + AND r.r_name = 'EUROPE' +ORDER BY + s.s_acctbal DESC, + n.n_name, + s.s_name, + p.p_partkey; +``` +:::: + +**Q3** ```sql SELECT @@ -230,19 +306,18 @@ WHERE c_mktsegment = 'BUILDING' AND c_custkey = o_custkey AND l_orderkey = o_orderkey - AND o_orderdate < date '1995-03-10' - AND l_shipdate > date '1995-03-10' + AND o_orderdate < DATE '1995-03-15' + AND l_shipdate > DATE '1995-03-15' GROUP BY l_orderkey, o_orderdate, o_shippriority ORDER BY - revenue desc, - o_orderdate -LIMIT 10; + revenue DESC, + o_orderdate; ``` -Q4 +**Q4** ```sql SELECT @@ -251,8 +326,8 @@ SELECT FROM orders WHERE - o_orderdate >= date '1994-07-01' - AND o_orderdate < date '1994-07-01' + interval '3' month + o_orderdate >= DATE '1993-07-01' + AND o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH AND EXISTS ( SELECT * @@ -268,7 +343,40 @@ ORDER BY o_orderpriority; ``` -Q5 +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. 
+ +```sql +WITH ValidLineItems AS ( + SELECT + l_orderkey + FROM + lineitem + WHERE + l_commitdate < l_receiptdate + GROUP BY + l_orderkey +) +SELECT + o.o_orderpriority, + COUNT(*) AS order_count +FROM + orders o +JOIN + ValidLineItems vli ON o.o_orderkey = vli.l_orderkey +WHERE + o.o_orderdate >= DATE '1993-07-01' + AND o.o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH +GROUP BY + o.o_orderpriority +ORDER BY + o.o_orderpriority; +``` +:::: + +**Q5** ```sql SELECT @@ -288,16 +396,16 @@ WHERE AND c_nationkey = s_nationkey AND s_nationkey = n_nationkey AND n_regionkey = r_regionkey - AND r_name = 'MIDDLE EAST' - AND o_orderdate >= date '1994-01-01' - AND o_orderdate < date '1994-01-01' + interval '1' year + AND r_name = 'ASIA' + AND o_orderdate >= DATE '1994-01-01' + AND o_orderdate < DATE '1994-01-01' + INTERVAL '1' year GROUP BY n_name ORDER BY - revenue desc; + revenue DESC; ``` -Q6 +**Q6** ```sql SELECT @@ -305,13 +413,31 @@ SELECT FROM lineitem WHERE - l_shipdate >= date '1994-01-01' - AND l_shipdate < date '1994-01-01' + interval '1' year - AND l_discount between 0.09 - 0.01 AND 0.09 + 0.01 + l_shipdate >= DATE '1994-01-01' + AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year + AND l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01 AND l_quantity < 24; ``` -Q7 +::::note +As of October 2024, the query does not work out-of-the box due to a bug with Decimal addition. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70136 + +This alternative formulation works and was verified to return the reference results. + +```sql +SELECT + sum(l_extendedprice * l_discount) AS revenue +FROM + lineitem +WHERE + l_shipdate >= DATE '1994-01-01' + AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year + AND l_discount BETWEEN 0.05 AND 0.07 + AND l_quantity < 24; +``` +:::: + +**Q7** ```sql SELECT @@ -319,31 +445,30 @@ SELECT cust_nation, l_year, sum(volume) AS revenue -FROM - ( - SELECT - n1.n_name AS supp_nation, - n2.n_name AS cust_nation, - extract(year FROM l_shipdate) AS l_year, - l_extendedprice * (1 - l_discount) AS volume - FROM - supplier, - lineitem, - orders, - customer, - nation n1, - nation n2 - WHERE - s_suppkey = l_suppkey - AND o_orderkey = l_orderkey - AND c_custkey = o_custkey - AND s_nationkey = n1.n_nationkey - AND c_nationkey = n2.n_nationkey - AND ( - (n1.n_name = 'UNITED KINGDOM' AND n2.n_name = 'ETHIOPIA') - OR (n1.n_name = 'ETHIOPIA' AND n2.n_name = 'UNITED KINGDOM') - ) - AND l_shipdate between date '1995-01-01' AND date '1996-12-31' +FROM ( + SELECT + n1.n_name AS supp_nation, + n2.n_name AS cust_nation, + extract(year FROM l_shipdate) AS l_year, + l_extendedprice * (1 - l_discount) AS volume + FROM + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + WHERE + s_suppkey = l_suppkey + AND o_orderkey = l_orderkey + AND c_custkey = o_custkey + AND s_nationkey = n1.n_nationkey + AND c_nationkey = n2.n_nationkey + AND ( + (n1.n_name = 'FRANCE' AND n2.n_name = 'GERMANY') + OR (n1.n_name = 'GERMANY' AND n2.n_name = 'FRANCE') + ) + AND l_shipdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31' ) AS shipping GROUP BY supp_nation, @@ -355,41 +480,41 @@ ORDER BY l_year; ``` -Q8 +**Q8** ```sql SELECT o_year, sum(CASE - WHEN nation = 'ETHIOPIA' THEN volume - ELSE 0 - END) / sum(volume) AS mkt_share -FROM - ( - SELECT - extract(year FROM o_orderdate) AS o_year, - l_extendedprice * (1 - l_discount) AS volume, - n2.n_name AS nation - FROM - part, - supplier, - lineitem, - orders, - customer, - nation n1, - nation n2, - region - WHERE - p_partkey = 
l_partkey - AND s_suppkey = l_suppkey - AND l_orderkey = o_orderkey - AND o_custkey = c_custkey - AND c_nationkey = n1.n_nationkey - AND n1.n_regionkey = r_regionkey - AND r_name = 'AFRICA' - AND s_nationkey = n2.n_nationkey - AND o_orderdate between date '1995-01-01' AND date '1996-12-31' - AND p_type = 'SMALL POLISHED TIN' + WHEN nation = 'BRAZIL' + THEN volume + ELSE 0 + END) / sum(volume) AS mkt_share +FROM ( + SELECT + extract(year FROM o_orderdate) AS o_year, + l_extendedprice * (1 - l_discount) AS volume, + n2.n_name AS nation + FROM + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + WHERE + p_partkey = l_partkey + AND s_suppkey = l_suppkey + AND l_orderkey = o_orderkey + AND o_custkey = c_custkey + AND c_nationkey = n1.n_nationkey + AND n1.n_regionkey = r_regionkey + AND r_name = 'AMERICA' + AND s_nationkey = n2.n_nationkey + AND o_orderdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31' + AND p_type = 'ECONOMY ANODIZED STEEL' ) AS all_nations GROUP BY o_year @@ -397,44 +522,43 @@ ORDER BY o_year; ``` -Q9 +**Q9** ```sql SELECT nation, o_year, sum(amount) AS sum_profit -FROM - ( - SELECT - n_name AS nation, - extract(year FROM o_orderdate) AS o_year, - l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity AS amount - FROM - part, - supplier, - lineitem, - partsupp, - orders, - nation - WHERE - s_suppkey = l_suppkey - AND ps_suppkey = l_suppkey - AND ps_partkey = l_partkey - AND p_partkey = l_partkey - AND o_orderkey = l_orderkey - AND s_nationkey = n_nationkey - AND p_name LIKE '%drab%' +FROM ( + SELECT + n_name AS nation, + extract(year FROM o_orderdate) AS o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity AS amount + FROM + part, + supplier, + lineitem, + partsupp, + orders, + nation + WHERE + s_suppkey = l_suppkey + AND ps_suppkey = l_suppkey + AND ps_partkey = l_partkey + AND p_partkey = l_partkey + AND o_orderkey = l_orderkey + AND s_nationkey = n_nationkey + AND p_name LIKE '%green%' ) AS profit GROUP BY nation, o_year ORDER BY nation, - o_year desc; + o_year DESC; ``` -Q10 +**Q10** ```sql SELECT @@ -454,8 +578,8 @@ FROM WHERE c_custkey = o_custkey AND l_orderkey = o_orderkey - AND o_orderdate >= date '1993-06-01' - AND o_orderdate < date '1993-06-01' + interval '3' month + AND o_orderdate >= DATE '1993-10-01' + AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH AND l_returnflag = 'R' AND c_nationkey = n_nationkey GROUP BY @@ -467,11 +591,10 @@ GROUP BY c_address, c_comment ORDER BY - revenue desc -LIMIT 20; + revenue DESC; ``` -Q11 +**Q11** ```sql SELECT @@ -484,12 +607,12 @@ FROM WHERE ps_suppkey = s_suppkey AND s_nationkey = n_nationkey - AND n_name = 'MOZAMBIQUE' + AND n_name = 'GERMANY' GROUP BY - ps_partkey having + ps_partkey HAVING sum(ps_supplycost * ps_availqty) > ( SELECT - sum(ps_supplycost * ps_availqty) * 0.0000010000 + sum(ps_supplycost * ps_availqty) * 0.0001 FROM partsupp, supplier, @@ -497,89 +620,119 @@ GROUP BY WHERE ps_suppkey = s_suppkey AND s_nationkey = n_nationkey - AND n_name = 'MOZAMBIQUE' + AND n_name = 'GERMANY' ) ORDER BY - value desc; + value DESC; ``` -Q12 +**Q12** ```sql SELECT l_shipmode, sum(CASE - WHEN o_orderpriority = '1-URGENT' - OR o_orderpriority = '2-HIGH' + WHEN o_orderpriority = '1-URGENT' + OR o_orderpriority = '2-HIGH' THEN 1 - ELSE 0 - END) AS high_line_count, + ELSE 0 + END) AS high_line_count, sum(CASE WHEN o_orderpriority <> '1-URGENT' - AND o_orderpriority <> '2-HIGH' + AND o_orderpriority <> '2-HIGH' THEN 1 ELSE 0 - END) AS low_line_count + END) 
AS low_line_count FROM orders, lineitem WHERE o_orderkey = l_orderkey - AND l_shipmode in ('MAIL', 'AIR') + AND l_shipmode in ('MAIL', 'SHIP') AND l_commitdate < l_receiptdate AND l_shipdate < l_commitdate - AND l_receiptdate >= date '1996-01-01' - AND l_receiptdate < date '1996-01-01' + interval '1' year + AND l_receiptdate >= DATE '1994-01-01' + AND l_receiptdate < DATE '1994-01-01' + INTERVAL '1' year GROUP BY l_shipmode ORDER BY l_shipmode; ``` -Q13 +**Q13** ```sql SELECT c_count, count(*) AS custdist -FROM - ( - SELECT - c_custkey, - count(o_orderkey) - FROM - customer LEFT OUTER JOIN orders ON - c_custkey = o_custkey - AND o_comment NOT LIKE '%special%deposits%' - GROUP BY - c_custkey +FROM ( + SELECT + c_custkey, + count(o_orderkey) + FROM + customer LEFT OUTER JOIN orders ON + c_custkey = o_custkey + AND o_comment NOT LIKE '%special%requests%' + GROUP BY + c_custkey ) AS c_orders GROUP BY c_count ORDER BY - custdist desc, - c_count desc; + custdist DESC, + c_count DESC; ``` -Q14 +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. + +```sql +WITH CustomerOrderCounts AS ( + SELECT + c.c_custkey, + count(o.o_orderkey) AS order_count + FROM + customer c + LEFT OUTER JOIN + orders o ON c.c_custkey = o.o_custkey + AND o.o_comment NOT LIKE '%special%requests%' + GROUP BY + c.c_custkey +) +SELECT + order_count AS c_count, + count(*) AS custdist +FROM + CustomerOrderCounts +GROUP BY + order_count +ORDER BY + custdist DESC, + c_count DESC; +``` +:::: + +**Q14** ```sql SELECT 100.00 * sum(CASE - WHEN p_type LIKE 'PROMO%' - THEN l_extendedprice * (1 - l_discount) - ELSE 0 - END) / sum(l_extendedprice * (1 - l_discount)) AS promo_revenue + WHEN p_type LIKE 'PROMO%' + THEN l_extendedprice * (1 - l_discount) + ELSE 0 + END) / sum(l_extendedprice * (1 - l_discount)) AS promo_revenue FROM lineitem, part WHERE l_partkey = p_partkey - AND l_shipdate >= date '1996-10-01' - AND l_shipdate < date '1996-10-01' + interval '1' month; + AND l_shipdate >= DATE '1995-09-01' + AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH; ``` -Q15 +**Q15** ```sql CREATE VIEW revenue0 (supplier_no, total_revenue) AS @@ -589,8 +742,8 @@ CREATE VIEW revenue0 (supplier_no, total_revenue) AS FROM lineitem WHERE - l_shipdate >= date '1997-06-01' - AND l_shipdate < date '1997-06-01' + interval '3' month + l_shipdate >= DATE '1996-01-01' + AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH GROUP BY l_suppkey; @@ -617,7 +770,27 @@ ORDER BY DROP VIEW revenue0; ``` -Q16 +::::note +As of October 2024, the view definition does not work out-of-the box. 
Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70139 + +This alternative view definition does work: + +```sql +CREATE VIEW revenue0 AS + SELECT + l_suppkey AS supplier_no, + sum(l_extendedprice * (1 - l_discount)) AS total_revenue + FROM + lineitem + WHERE + l_shipdate >= DATE '1996-01-01' + AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH + GROUP BY + l_suppkey; +``` +:::: + +**Q16** ```sql SELECT @@ -630,9 +803,9 @@ FROM part WHERE p_partkey = ps_partkey - AND p_brand <> 'Brand#15' - AND p_type NOT LIKE 'SMALL POLISHED%' - AND p_size in (21, 9, 46, 34, 50, 33, 17, 36) + AND p_brand <> 'Brand#45' + AND p_type NOT LIKE 'MEDIUM POLISHED%' + AND p_size in (49, 14, 23, 45, 19, 3, 36, 9) AND ps_suppkey NOT in ( SELECT s_suppkey @@ -646,13 +819,13 @@ GROUP BY p_type, p_size ORDER BY - supplier_cnt desc, + supplier_cnt DESC, p_brand, p_type, p_size; ``` -Q17 +**Q17** ```sql SELECT @@ -662,8 +835,8 @@ FROM part WHERE p_partkey = l_partkey - AND p_brand = 'Brand#52' - AND p_container = 'MED CASE' + AND p_brand = 'Brand#23' + AND p_container = 'MED BOX' AND l_quantity < ( SELECT 0.2 * avg(l_quantity) @@ -674,7 +847,38 @@ WHERE ); ``` -Q18 +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. + +```sql +WITH AvgQuantity AS ( + SELECT + l_partkey, + AVG(l_quantity) * 0.2 AS avg_quantity + FROM + lineitem + GROUP BY + l_partkey +) +SELECT + SUM(l.l_extendedprice) / 7.0 AS avg_yearly +FROM + lineitem l +JOIN + part p ON p.p_partkey = l.l_partkey +JOIN + AvgQuantity aq ON l.l_partkey = aq.l_partkey +WHERE + p.p_brand = 'Brand#23' + AND p.p_container = 'MED BOX' + AND l.l_quantity < aq.avg_quantity; + +``` +:::: + +**Q18** ```sql SELECT @@ -695,8 +899,9 @@ WHERE FROM lineitem GROUP BY - l_orderkey having - sum(l_quantity) > 313 + l_orderkey + HAVING + sum(l_quantity) > 300 ) AND c_custkey = o_custkey AND o_orderkey = l_orderkey @@ -707,52 +912,91 @@ GROUP BY o_orderdate, o_totalprice ORDER BY - o_totalprice desc, - o_orderdate -LIMIT 100; + o_totalprice DESC, + o_orderdate; ``` -Q19 +**Q19** ```sql SELECT - sum(l_extendedprice* (1 - l_discount)) AS revenue + sum(l_extendedprice * (1 - l_discount)) AS revenue FROM lineitem, part WHERE ( p_partkey = l_partkey - AND p_brand = 'Brand#31' + AND p_brand = 'Brand#12' AND p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') - AND l_quantity >= 3 AND l_quantity <= 3 + 10 - AND p_size between 1 AND 5 + AND l_quantity >= 1 AND l_quantity <= 1 + 10 + AND p_size BETWEEN 1 AND 5 AND l_shipmode in ('AIR', 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON' ) OR ( p_partkey = l_partkey - AND p_brand = 'Brand#54' + AND p_brand = 'Brand#23' AND p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') - AND l_quantity >= 17 AND l_quantity <= 17 + 10 - AND p_size between 1 AND 10 + AND l_quantity >= 10 AND l_quantity <= 10 + 10 + AND p_size BETWEEN 1 AND 10 AND l_shipmode in ('AIR', 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON' ) OR ( p_partkey = l_partkey - AND p_brand = 'Brand#54' + AND p_brand = 'Brand#34' AND p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') - AND l_quantity >= 26 AND l_quantity <= 26 + 10 - AND p_size between 1 AND 15 + AND l_quantity >= 20 AND l_quantity <= 20 + 10 + AND p_size BETWEEN 1 AND 15 AND l_shipmode in ('AIR', 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON' ); ``` -Q20 +::::note +As 
of October 2024, the query is extremely slow due to missing join predicate pushdown. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70802 + +This alternative formulation works and was verified to return the reference results. + +```sql +SELECT + sum(l_extendedprice * (1 - l_discount)) AS revenue +FROM + lineitem, + part +WHERE + p_partkey = l_partkey + AND l_shipinstruct = 'DELIVER IN PERSON' + AND l_shipmode IN ('AIR', 'AIR REG') + AND ( + ( + p_brand = 'Brand#12' + AND p_container IN ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + AND l_quantity >= 1 AND l_quantity <= 1 + 10 + AND p_size BETWEEN 1 AND 5 + ) + OR + ( + p_brand = 'Brand#23' + AND p_container IN ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + AND l_quantity >= 10 AND l_quantity <= 10 + 10 + AND p_size BETWEEN 1 AND 10 + ) + OR + ( + p_brand = 'Brand#34' + AND p_container IN ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + AND l_quantity >= 20 AND l_quantity <= 20 + 10 + AND p_size BETWEEN 1 AND 15 + ) + ) +``` +:::: + +**Q20** ```sql SELECT @@ -774,7 +1018,7 @@ WHERE FROM part WHERE - p_name LIKE 'chiffon%' + p_name LIKE 'forest%' ) AND ps_availqty > ( SELECT @@ -784,17 +1028,21 @@ WHERE WHERE l_partkey = ps_partkey AND l_suppkey = ps_suppkey - AND l_shipdate >= date '1997-01-01' - AND l_shipdate < date '1997-01-01' + interval '1' year + AND l_shipdate >= DATE '1994-01-01' + AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year ) ) AND s_nationkey = n_nationkey - AND n_name = 'MOZAMBIQUE' + AND n_name = 'CANADA' ORDER BY s_name; ``` -Q21 +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 +:::: + +**Q21** ```sql SELECT @@ -830,50 +1078,51 @@ WHERE AND l3.l_receiptdate > l3.l_commitdate ) AND s_nationkey = n_nationkey - AND n_name = 'RUSSIA' + AND n_name = 'SAUDI ARABIA' GROUP BY s_name ORDER BY - numwait desc, - s_name -LIMIT 100; + numwait DESC, + s_name; ``` +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. 
Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 +:::: -Q22 +**Q22** ```sql SELECT cntrycode, count(*) AS numcust, sum(c_acctbal) AS totacctbal -FROM - ( - SELECT - substring(c_phone FROM 1 for 2) AS cntrycode, - c_acctbal - FROM - customer - WHERE - substring(c_phone FROM 1 for 2) in - ('26', '34', '10', '18', '27', '12', '11') - AND c_acctbal > ( - SELECT - avg(c_acctbal) - FROM - customer - WHERE - c_acctbal > 0.00 - AND substring(c_phone FROM 1 for 2) in - ('26', '34', '10', '18', '27', '12', '11') - ) - AND NOT EXISTS ( - SELECT - * - FROM - orders - WHERE - o_custkey = c_custkey - ) +FROM ( + SELECT + substring(c_phone FROM 1 for 2) AS cntrycode, + c_acctbal + FROM + customer + WHERE + substring(c_phone FROM 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + AND c_acctbal > ( + SELECT + avg(c_acctbal) + FROM + customer + WHERE + c_acctbal > 0.00 + AND substring(c_phone FROM 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + AND NOT EXISTS ( + SELECT + * + FROM + orders + WHERE + o_custkey = c_custkey + ) ) AS custsale GROUP BY cntrycode diff --git a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md index 2aebccfdc53..10b2f2324d1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -18,7 +18,7 @@ Parameters: Returned values: -Constants `(a, b)` of the resulting line `y = a*x + b`. +Constants `(k, b)` of the resulting line `y = k*x + b`. **Examples** diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 0a50960b453..8d7d065ea8b 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -4393,7 +4393,7 @@ Result: ## globalVariable -Takes constant string argument and returns the value of global variable with that name. It is intended for compatibility with MySQL. +Takes a constant string argument and returns the value of the global variable with that name. This function is intended for compatibility with MySQL and not needed or useful for normal operation of ClickHouse. Only few dummy global variables are defined. **Syntax** diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index e2f1b8c7f14..729bedda0b3 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1096,7 +1096,7 @@ convertCharset(s, from, to) ## base58Encode -Encodes a String using [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) in the "Bitcoin" alphabet. +Encodes a string using [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) in the "Bitcoin" alphabet. **Syntax** @@ -1110,7 +1110,7 @@ base58Encode(plaintext) **Returned value** -- A string containing the encoded value of the argument. [String](../data-types/string.md). +- A string containing the encoded value of the argument. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Example** @@ -1128,7 +1128,7 @@ Result: ## base58Decode -Accepts a String and decodes it using [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) encoding scheme using "Bitcoin" alphabet. 
+Accepts a string and decodes it using [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) encoding scheme using "Bitcoin" alphabet. **Syntax** @@ -1138,7 +1138,7 @@ base58Decode(encoded) **Arguments** -- `encoded` — [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, an exception is thrown. +- `encoded` — [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). If the string is not a valid Base58-encoded value, an exception is thrown. **Returned value** @@ -1170,7 +1170,7 @@ tryBase58Decode(encoded) **Parameters** -- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, returns an empty string in case of error. +- `encoded`: [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). If the string is not a valid Base58-encoded value, returns an empty string in case of error. **Returned value** diff --git a/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md index 61c9e571683..ce23b04bf27 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -18,7 +18,7 @@ simpleLinearRegression(x, y) Возвращаемые значения: -Константы `(a, b)` результирующей прямой `y = a*x + b`. +Константы `(k, b)` результирующей прямой `y = k*x + b`. **Примеры** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md index a7b36539b09..6875e841ad5 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -20,7 +20,7 @@ simpleLinearRegression(x, y) **返回值** -符合`y = a*x + b`的常量 `(a, b)` 。 +符合`y = k*x + b`的常量 `(k, b)` 。 **示例** diff --git a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp index 26a8b3f0d8f..e147ad68e40 100644 --- a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp +++ b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp @@ -184,6 +184,26 @@ public: , join_node(join_node_) {} + bool needChildVisit(const QueryTreeNodePtr & parent, const QueryTreeNodePtr &) + { + /** Optimization can change the value of some expression from NULL to FALSE. + * For example: + * when `a` is `NULL`, the expression `a = b AND a IS NOT NULL` returns `NULL` + * and it will be optimized to `a = b`, which returns `FALSE`. + * This is valid for JOIN ON condition and for the functions `AND`/`OR` inside it. + * (When we replace `AND`/`OR` operands from `NULL` to `FALSE`, the result value can also change only from `NULL` to `FALSE`) + * However, in the general case, the result can be wrong. + * For example, for NOT: `NOT NULL` is `NULL`, but `NOT FALSE` is `TRUE`. + * Therefore, optimize only top-level expression or expressions inside `AND`/`OR`. 
+ */ + if (const auto * function_node = parent->as()) + { + const auto & func_name = function_node->getFunctionName(); + return func_name == "or" || func_name == "and"; + } + return parent->getNodeType() == QueryTreeNodeType::LIST; + } + void enterImpl(QueryTreeNodePtr & node) { auto * function_node = node->as(); diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 506133f7171..4277639fbd0 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -117,12 +118,6 @@ namespace throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); return outcome.GetResult().GetContents(); } - - bool isNotFoundError(Aws::S3::S3Errors error) - { - return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND - || error == Aws::S3::S3Errors::NO_SUCH_KEY; - } } @@ -236,6 +231,7 @@ BackupWriterS3::BackupWriterS3( : BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} + , s3_capabilities(getCapabilitiesFromConfig(context_->getConfigRef(), "s3")) { s3_settings.loadFromConfig(context_->getConfigRef(), "s3", context_->getSettingsRef()); @@ -358,92 +354,22 @@ std::unique_ptr BackupWriterS3::writeFile(const String & file_name) void BackupWriterS3::removeFile(const String & file_name) { - S3::DeleteObjectRequest request; - request.SetBucket(s3_uri.bucket); - auto key = fs::path(s3_uri.key) / file_name; - request.SetKey(key); - - auto outcome = client->DeleteObject(request); - - if (blob_storage_log) - { - blob_storage_log->addEvent( - BlobStorageLogElement::EventType::Delete, - s3_uri.bucket, key, /* local_path */ "", /* data_size */ 0, - outcome.IsSuccess() ? nullptr : &outcome.GetError()); - } - - if (!outcome.IsSuccess() && !isNotFoundError(outcome.GetError().GetErrorType())) - throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); + deleteFileFromS3(client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, /* if_exists = */ false, + blob_storage_log); } void BackupWriterS3::removeFiles(const Strings & file_names) { - try - { - if (!supports_batch_delete.has_value() || supports_batch_delete.value() == true) - { - removeFilesBatch(file_names); - supports_batch_delete = true; - } - else - { - for (const auto & file_name : file_names) - removeFile(file_name); - } - } - catch (const Exception &) - { - if (!supports_batch_delete.has_value()) - { - supports_batch_delete = false; - LOG_TRACE(log, "DeleteObjects is not supported. Retrying with plain DeleteObject."); + Strings keys; + keys.reserve(file_names.size()); + for (const String & file_name : file_names) + keys.push_back(fs::path(s3_uri.key) / file_name); - for (const auto & file_name : file_names) - removeFile(file_name); - } - else - throw; - } - -} - -void BackupWriterS3::removeFilesBatch(const Strings & file_names) -{ /// One call of DeleteObjects() cannot remove more than 1000 keys. 
- size_t chunk_size_limit = 1000; + size_t batch_size = 1000; - size_t current_position = 0; - while (current_position < file_names.size()) - { - std::vector current_chunk; - for (; current_position < file_names.size() && current_chunk.size() < chunk_size_limit; ++current_position) - { - Aws::S3::Model::ObjectIdentifier obj; - obj.SetKey(fs::path(s3_uri.key) / file_names[current_position]); - current_chunk.push_back(obj); - } - - Aws::S3::Model::Delete delkeys; - delkeys.SetObjects(current_chunk); - S3::DeleteObjectsRequest request; - request.SetBucket(s3_uri.bucket); - request.SetDelete(delkeys); - - auto outcome = client->DeleteObjects(request); - - if (blob_storage_log) - { - const auto * outcome_error = outcome.IsSuccess() ? nullptr : &outcome.GetError(); - auto time_now = std::chrono::system_clock::now(); - for (const auto & obj : current_chunk) - blob_storage_log->addEvent(BlobStorageLogElement::EventType::Delete, s3_uri.bucket, obj.GetKey(), - /* local_path */ "", /* data_size */ 0, outcome_error, time_now); - } - - if (!outcome.IsSuccess() && !isNotFoundError(outcome.GetError().GetErrorType())) - throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); - } + deleteFilesFromS3(client, s3_uri.bucket, keys, /* if_exists = */ false, + s3_capabilities, batch_size, blob_storage_log); } } diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index 327f06363c5..a04f1c915b9 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB { @@ -76,14 +77,12 @@ public: private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; - void removeFilesBatch(const Strings & file_names); const S3::URI s3_uri; const DataSourceDescription data_source_description; S3Settings s3_settings; std::shared_ptr client; - std::optional supports_batch_delete; - + S3Capabilities s3_capabilities; BlobStorageLogWriterPtr blob_storage_log; }; diff --git a/src/Common/Config/ConfigProcessor.cpp b/src/Common/Config/ConfigProcessor.cpp index 4e60eadd835..f4dd1cfa3fc 100644 --- a/src/Common/Config/ConfigProcessor.cpp +++ b/src/Common/Config/ConfigProcessor.cpp @@ -671,9 +671,11 @@ XMLDocumentPtr ConfigProcessor::parseConfig(const std::string & config_path) XMLDocumentPtr ConfigProcessor::processConfig( bool * has_zk_includes, zkutil::ZooKeeperNodeCache * zk_node_cache, - const zkutil::EventPtr & zk_changed_event) + const zkutil::EventPtr & zk_changed_event, + bool is_config_changed) { - LOG_DEBUG(log, "Processing configuration file '{}'.", path); + if (is_config_changed) + LOG_DEBUG(log, "Processing configuration file '{}'.", path); XMLDocumentPtr config; @@ -686,7 +688,8 @@ XMLDocumentPtr ConfigProcessor::processConfig( /// When we can use a config embedded in the binary. 
if (auto it = embedded_configs.find(path); it != embedded_configs.end()) { - LOG_DEBUG(log, "There is no file '{}', will use embedded config.", path); + if (is_config_changed) + LOG_DEBUG(log, "There is no file '{}', will use embedded config.", path); config = dom_parser.parseMemory(it->second.data(), it->second.size()); } else @@ -700,7 +703,8 @@ XMLDocumentPtr ConfigProcessor::processConfig( { try { - LOG_DEBUG(log, "Merging configuration file '{}'.", merge_file); + if (is_config_changed) + LOG_DEBUG(log, "Merging configuration file '{}'.", merge_file); XMLDocumentPtr with; with = parseConfig(merge_file); @@ -790,10 +794,10 @@ XMLDocumentPtr ConfigProcessor::processConfig( return config; } -ConfigProcessor::LoadedConfig ConfigProcessor::loadConfig(bool allow_zk_includes) +ConfigProcessor::LoadedConfig ConfigProcessor::loadConfig(bool allow_zk_includes, bool is_config_changed) { bool has_zk_includes; - XMLDocumentPtr config_xml = processConfig(&has_zk_includes); + XMLDocumentPtr config_xml = processConfig(&has_zk_includes, nullptr, nullptr, is_config_changed); if (has_zk_includes && !allow_zk_includes) throw Poco::Exception("Error while loading config '" + path + "': from_zk includes are not allowed!"); @@ -806,14 +810,15 @@ ConfigProcessor::LoadedConfig ConfigProcessor::loadConfig(bool allow_zk_includes ConfigProcessor::LoadedConfig ConfigProcessor::loadConfigWithZooKeeperIncludes( zkutil::ZooKeeperNodeCache & zk_node_cache, const zkutil::EventPtr & zk_changed_event, - bool fallback_to_preprocessed) + bool fallback_to_preprocessed, + bool is_config_changed) { XMLDocumentPtr config_xml; bool has_zk_includes; bool processed_successfully = false; try { - config_xml = processConfig(&has_zk_includes, &zk_node_cache, zk_changed_event); + config_xml = processConfig(&has_zk_includes, &zk_node_cache, zk_changed_event, is_config_changed); processed_successfully = true; } catch (const Poco::Exception & ex) diff --git a/src/Common/Config/ConfigProcessor.h b/src/Common/Config/ConfigProcessor.h index a9d1325b722..373e0809c5d 100644 --- a/src/Common/Config/ConfigProcessor.h +++ b/src/Common/Config/ConfigProcessor.h @@ -63,7 +63,8 @@ public: XMLDocumentPtr processConfig( bool * has_zk_includes = nullptr, zkutil::ZooKeeperNodeCache * zk_node_cache = nullptr, - const zkutil::EventPtr & zk_changed_event = nullptr); + const zkutil::EventPtr & zk_changed_event = nullptr, + bool is_config_changed = true); XMLDocumentPtr parseConfig(const std::string & config_path); @@ -88,14 +89,15 @@ public: /// If allow_zk_includes is true, expect that the configuration XML can contain from_zk nodes. /// If it is the case, set has_zk_includes to true and don't write config-preprocessed.xml, /// expecting that config would be reloaded with zookeeper later. - LoadedConfig loadConfig(bool allow_zk_includes = false); + LoadedConfig loadConfig(bool allow_zk_includes = false, bool is_config_changed = true); /// If fallback_to_preprocessed is true, then if KeeperException is thrown during config /// processing, load the configuration from the preprocessed file. LoadedConfig loadConfigWithZooKeeperIncludes( zkutil::ZooKeeperNodeCache & zk_node_cache, const zkutil::EventPtr & zk_changed_event, - bool fallback_to_preprocessed = false); + bool fallback_to_preprocessed = false, + bool is_config_changed = true); /// Save preprocessed config to specified directory. 
/// If preprocessed_dir is empty - calculate from loaded_config.path + /preprocessed_configs/ diff --git a/src/Common/Config/ConfigReloader.cpp b/src/Common/Config/ConfigReloader.cpp index 769a63c036b..a9a02ba6784 100644 --- a/src/Common/Config/ConfigReloader.cpp +++ b/src/Common/Config/ConfigReloader.cpp @@ -111,7 +111,8 @@ std::optional ConfigReloader::reloadIfNewer(bool std::lock_guard lock(reload_mutex); FilesChangesTracker new_files = getNewFileList(); - if (force || need_reload_from_zk || new_files.isDifferOrNewerThan(files)) + const bool is_config_changed = new_files.isDifferOrNewerThan(files); + if (force || need_reload_from_zk || is_config_changed) { ConfigProcessor config_processor(config_path); ConfigProcessor::LoadedConfig loaded_config; @@ -120,10 +121,10 @@ std::optional ConfigReloader::reloadIfNewer(bool try { - loaded_config = config_processor.loadConfig(/* allow_zk_includes = */ true); + loaded_config = config_processor.loadConfig(/* allow_zk_includes = */ true, is_config_changed); if (loaded_config.has_zk_includes) loaded_config = config_processor.loadConfigWithZooKeeperIncludes( - zk_node_cache, zk_changed_event, fallback_to_preprocessed); + zk_node_cache, zk_changed_event, fallback_to_preprocessed, is_config_changed); } catch (const Coordination::Exception & e) { diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index e63d11d7ee3..201d0b47de0 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -64,14 +65,13 @@ namespace ErrorCodes M(UInt64, log_slow_connection_operation_threshold_ms, 1000, "Log message if a certain operation took too long inside a single connection", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) +IMPLEMENT_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) struct CoordinationSettingsImpl : public BaseSettings { void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config); }; -IMPLEMENT_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) - void CoordinationSettingsImpl::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config) { if (!config.has(config_elem)) @@ -99,7 +99,7 @@ void CoordinationSettingsImpl::loadFromConfig(const String & config_elem, const } #define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ - CoordinationSettings##TYPE NAME = &CoordinationSettings##Impl ::NAME; + CoordinationSettings##TYPE NAME = &CoordinationSettingsImpl ::NAME; namespace CoordinationSetting { @@ -119,18 +119,7 @@ CoordinationSettings::CoordinationSettings(const CoordinationSettings & settings CoordinationSettings::~CoordinationSettings() = default; -#define IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR(CLASS_NAME, TYPE) \ - const SettingField##TYPE & CoordinationSettings::operator[](CLASS_NAME##TYPE t) const \ - { \ - return impl.get()->*t; \ - } \ - SettingField##TYPE & CoordinationSettings::operator[](CLASS_NAME##TYPE t) \ - { \ - return impl.get()->*t; \ - } - COORDINATION_SETTINGS_SUPPORTED_TYPES(CoordinationSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) -#undef IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config) { diff --git a/src/Core/BaseSettings.h b/src/Core/BaseSettings.h index f281267cab9..1bebf019e43 100644 --- 
a/src/Core/BaseSettings.h +++ b/src/Core/BaseSettings.h @@ -25,19 +25,51 @@ class WriteBuffer; * Example of usage: * * mysettings.h: + * #include + * #include + * + * #define MY_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + * M(CLASS_NAME, Float) \ + * M(CLASS_NAME, String) \ + * M(CLASS_NAME, UInt64) + * + * MY_SETTINGS_SUPPORTED_TYPES(MySettings, DECLARE_SETTING_TRAIT) + * + * struct MySettings + * { + * MySettings(); + * ~MySettings(); + * + * MY_SETTINGS_SUPPORTED_TYPES(MySettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + * private: + * std::unique_ptr impl; + * }; + * + * mysettings.cpp: + * #include + * #include + * * #define APPLY_FOR_MYSETTINGS(M) \ * M(UInt64, a, 100, "Description of a", 0) \ * M(Float, f, 3.11, "Description of f", IMPORTANT) // IMPORTANT - means the setting can't be ignored by older versions) \ * M(String, s, "default", "Description of s", 0) * * DECLARE_SETTINGS_TRAITS(MySettingsTraits, APPLY_FOR_MYSETTINGS) - - * struct MySettings : public BaseSettings + * IMPLEMENT_SETTINGS_TRAITS(MySettingsTraits, APPLY_FOR_MYSETTINGS) + * + * struct MySettingsImpl : public BaseSettings * { * }; * - * mysettings.cpp: - * IMPLEMENT_SETTINGS_TRAITS(MySettingsTraits, APPLY_FOR_MYSETTINGS) + * #define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) MySettings##TYPE NAME = &MySettings##Impl ::NAME; + * + * namespace MySetting + * { + * APPLY_FOR_MYSETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) + * } + * #undef INITIALIZE_SETTING_EXTERN + * + * MY_SETTINGS_SUPPORTED_TYPES(MySettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) */ template class BaseSettings : public TTraits::Data diff --git a/src/Core/BaseSettingsFwdMacrosImpl.h b/src/Core/BaseSettingsFwdMacrosImpl.h new file mode 100644 index 00000000000..31c6b5dca1f --- /dev/null +++ b/src/Core/BaseSettingsFwdMacrosImpl.h @@ -0,0 +1,11 @@ +#pragma once + +#define IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR(CLASS_NAME, TYPE) \ + const SettingField##TYPE & CLASS_NAME::operator[](CLASS_NAME##TYPE t) const \ + { \ + return impl.get()->*t; \ + } \ + SettingField##TYPE & CLASS_NAME::operator[](CLASS_NAME##TYPE t) \ + { \ + return impl.get()->*t; \ + } diff --git a/src/Core/FormatFactorySettings.cpp b/src/Core/FormatFactorySettings.cpp index 902473a42d1..9735905c310 100644 --- a/src/Core/FormatFactorySettings.cpp +++ b/src/Core/FormatFactorySettings.cpp @@ -8,11 +8,9 @@ namespace DB * User-specified file format settings for File and URL engines. 
*/ DECLARE_SETTINGS_TRAITS(FormatFactorySettingsTraits, LIST_OF_ALL_FORMAT_SETTINGS) +IMPLEMENT_SETTINGS_TRAITS(FormatFactorySettingsTraits, LIST_OF_ALL_FORMAT_SETTINGS) struct FormatFactorySettingsImpl : public BaseSettings { }; - -IMPLEMENT_SETTINGS_TRAITS(FormatFactorySettingsTraits, LIST_OF_ALL_FORMAT_SETTINGS) - } diff --git a/src/Core/ServerSettings.cpp b/src/Core/ServerSettings.cpp index 8c0864e78b7..42991ebe0ba 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -193,14 +194,13 @@ namespace DB /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in dumpToSystemServerSettingsColumns below DECLARE_SETTINGS_TRAITS(ServerSettingsTraits, LIST_OF_SERVER_SETTINGS) +IMPLEMENT_SETTINGS_TRAITS(ServerSettingsTraits, LIST_OF_SERVER_SETTINGS) struct ServerSettingsImpl : public BaseSettings { void loadSettingsFromConfig(const Poco::Util::AbstractConfiguration & config); }; -IMPLEMENT_SETTINGS_TRAITS(ServerSettingsTraits, LIST_OF_SERVER_SETTINGS) - void ServerSettingsImpl::loadSettingsFromConfig(const Poco::Util::AbstractConfiguration & config) { // settings which can be loaded from the the default profile, see also MAKE_DEPRECATED_BY_SERVER_CONFIG in src/Core/Settings.h @@ -231,7 +231,7 @@ void ServerSettingsImpl::loadSettingsFromConfig(const Poco::Util::AbstractConfig } -#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) ServerSettings##TYPE NAME = &ServerSettings##Impl ::NAME; +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) ServerSettings##TYPE NAME = &ServerSettingsImpl ::NAME; namespace ServerSetting { @@ -250,18 +250,7 @@ ServerSettings::ServerSettings(const ServerSettings & settings) : impl(std::make ServerSettings::~ServerSettings() = default; -#define IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR(CLASS_NAME, TYPE) \ - const SettingField##TYPE & ServerSettings::operator[](CLASS_NAME##TYPE t) const \ - { \ - return impl.get()->*t; \ - } \ - SettingField##TYPE & ServerSettings::operator[](CLASS_NAME##TYPE t) \ - { \ - return impl.get()->*t; \ - } - SERVER_SETTINGS_SUPPORTED_TYPES(ServerSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) -#undef IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR void ServerSettings::set(std::string_view name, const Field & value) { diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index df25f395624..307cc5b9182 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -4453,8 +4454,9 @@ Optimize GROUP BY when all keys in block are constant M(Bool, legacy_column_name_of_tuple_literal, false, R"( List all names of element of large tuple literals in their column names instead of hash. This settings exists only for compatibility reasons. It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher. )", 0) \ - M(Bool, enable_named_columns_in_function_tuple, true, R"( + M(Bool, enable_named_columns_in_function_tuple, false, R"( Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers. +Beware that this setting might currently result in broken queries. It's not recommended to use in production )", 0) \ \ M(Bool, query_plan_enable_optimizations, true, R"( @@ -5558,8 +5560,8 @@ Only in ClickHouse Cloud. 
Allow to create ShareSet and SharedJoin M(UInt64, max_limit_for_ann_queries, 1'000'000, R"( SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes. )", 0) \ - M(UInt64, hnsw_candidate_list_size_for_search, 0, R"( -The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means USearch's default value (64). + M(UInt64, hnsw_candidate_list_size_for_search, 256, R"( +The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. )", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, R"( Throw exception if unsupported query is used inside transaction @@ -5898,7 +5900,7 @@ Allow writing simple SELECT queries without the leading SELECT keyword, which ma OBSOLETE_FORMAT_SETTINGS(M, ALIAS) \ DECLARE_SETTINGS_TRAITS_ALLOW_CUSTOM_SETTINGS(SettingsTraits, LIST_OF_SETTINGS) - +IMPLEMENT_SETTINGS_TRAITS(SettingsTraits, LIST_OF_SETTINGS) /** Settings of query execution. * These settings go to users.xml. @@ -5932,9 +5934,6 @@ private: std::unordered_set settings_changed_by_compatibility_setting; }; - -IMPLEMENT_SETTINGS_TRAITS(SettingsTraits, LIST_OF_SETTINGS) - /** Set the settings from the profile (in the server configuration, many settings can be listed in one profile). * The profile can also be set using the `set` functions, like the `profile` setting. */ @@ -6082,7 +6081,7 @@ void SettingsImpl::applyCompatibilitySetting(const String & compatibility_value) } #define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ - Settings ## TYPE NAME = & Settings ## Impl :: NAME; + Settings ## TYPE NAME = & SettingsImpl :: NAME; namespace Setting { @@ -6116,18 +6115,7 @@ bool Settings::operator==(const Settings & other) const return *impl == *other.impl; } -#define IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR(CLASS_NAME, TYPE) \ - const SettingField##TYPE & Settings::operator[](CLASS_NAME##TYPE t) const \ - { \ - return impl.get()->*t; \ - } \ - SettingField##TYPE & Settings::operator[](CLASS_NAME##TYPE t) \ - { \ - return impl.get()->*t; \ - } - COMMON_SETTINGS_SUPPORTED_TYPES(Settings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) -#undef IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR bool Settings::has(std::string_view name) const { diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 9481b0c1197..5092e00aece 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -83,6 +83,7 @@ static std::initializer_listgetConfigRef()); + if (!db_settings[DatabaseReplicatedSetting::collection_name].value.empty()) + fillClusterAuthInfo(db_settings[DatabaseReplicatedSetting::collection_name].value, context_->getConfigRef()); replica_group_name = context_->getConfigRef().getString("replica_group_name", ""); @@ -1220,7 +1226,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep String db_name = getDatabaseName(); String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX; String to_db_name_replicated = getDatabaseName() + BROKEN_REPLICATED_TABLES_SUFFIX; - if (total_tables * db_settings.max_broken_tables_ratio < tables_to_detach.size()) + if (total_tables * db_settings[DatabaseReplicatedSetting::max_broken_tables_ratio] < tables_to_detach.size()) throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to recreate: {} of {}", tables_to_detach.size(), total_tables); if (!tables_to_detach.empty()) { 
diff --git a/src/Databases/DatabaseReplicatedSettings.cpp b/src/Databases/DatabaseReplicatedSettings.cpp index 61febcf2810..5ee37b55706 100644 --- a/src/Databases/DatabaseReplicatedSettings.cpp +++ b/src/Databases/DatabaseReplicatedSettings.cpp @@ -1,17 +1,60 @@ +#include +#include #include -#include #include +#include namespace DB { +#define LIST_OF_DATABASE_REPLICATED_SETTINGS(M, ALIAS) \ + M(Float, max_broken_tables_ratio, 1, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ + M(UInt64, max_replication_lag_to_enqueue, 50, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \ + M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \ + M(String, collection_name, "", "A name of a collection defined in server's config where all info for cluster authentication is defined", 0) \ + M(Bool, check_consistency, true, "Check consistency of local metadata and metadata in Keeper, do replica recovery on inconsistency", 0) \ + M(UInt64, max_retries_before_automatic_recovery, 100, "Max number of attempts to execute a queue entry before marking replica as lost recovering it from snapshot (0 means infinite)", 0) \ + +DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) +struct DatabaseReplicatedSettingsImpl : public BaseSettings +{ +}; + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ + DatabaseReplicatedSettings##TYPE NAME = &DatabaseReplicatedSettingsImpl ::NAME; + +namespace DatabaseReplicatedSetting +{ +LIST_OF_DATABASE_REPLICATED_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN + +DatabaseReplicatedSettings::DatabaseReplicatedSettings() : impl(std::make_unique()) +{ +} + +DatabaseReplicatedSettings::DatabaseReplicatedSettings(const DatabaseReplicatedSettings & settings) + : impl(std::make_unique(*settings.impl)) +{ +} + +DatabaseReplicatedSettings::DatabaseReplicatedSettings(DatabaseReplicatedSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + +DatabaseReplicatedSettings::~DatabaseReplicatedSettings() = default; + +DATABASE_REPLICATED_SETTINGS_SUPPORTED_TYPES(DatabaseReplicatedSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) + void DatabaseReplicatedSettings::loadFromQuery(ASTStorage & storage_def) { if (storage_def.settings) { - applyChanges(storage_def.settings->changes); + impl->applyChanges(storage_def.settings->changes); return; } diff --git a/src/Databases/DatabaseReplicatedSettings.h b/src/Databases/DatabaseReplicatedSettings.h index eb6dca7d5c4..5d3d5b9748b 100644 --- a/src/Databases/DatabaseReplicatedSettings.h +++ b/src/Databases/DatabaseReplicatedSettings.h @@ -1,27 +1,35 @@ #pragma once -#include -#include + +#include +#include namespace DB { - class ASTStorage; +struct DatabaseReplicatedSettingsImpl; -#define LIST_OF_DATABASE_REPLICATED_SETTINGS(M, ALIAS) \ - M(Float, max_broken_tables_ratio, 1, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ - M(UInt64, max_replication_lag_to_enqueue, 50, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \ - M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host 
has not executed it yet", 0) \ - M(String, collection_name, "", "A name of a collection defined in server's config where all info for cluster authentication is defined", 0) \ - M(Bool, check_consistency, true, "Check consistency of local metadata and metadata in Keeper, do replica recovery on inconsistency", 0) \ - M(UInt64, max_retries_before_automatic_recovery, 100, "Max number of attempts to execute a queue entry before marking replica as lost recovering it from snapshot (0 means infinite)", 0) \ +/// List of available types supported in ReplicatedSettings object +#define DATABASE_REPLICATED_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, Float) \ + M(CLASS_NAME, String) \ + M(CLASS_NAME, UInt64) +DATABASE_REPLICATED_SETTINGS_SUPPORTED_TYPES(DatabaseReplicatedSettings, DECLARE_SETTING_TRAIT) -DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) - - -struct DatabaseReplicatedSettings : public BaseSettings +struct DatabaseReplicatedSettings { + DatabaseReplicatedSettings(); + DatabaseReplicatedSettings(const DatabaseReplicatedSettings & settings); + DatabaseReplicatedSettings(DatabaseReplicatedSettings && settings) noexcept; + ~DatabaseReplicatedSettings(); + + DATABASE_REPLICATED_SETTINGS_SUPPORTED_TYPES(DatabaseReplicatedSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + void loadFromQuery(ASTStorage & storage_def); + +private: + std::unique_ptr impl; }; } diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 368559c057f..3b70383c28b 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -18,6 +18,14 @@ namespace Setting extern const SettingsUInt64 database_replicated_initial_query_timeout_sec; } +namespace DatabaseReplicatedSetting +{ + extern const DatabaseReplicatedSettingsBool check_consistency; + extern const DatabaseReplicatedSettingsUInt64 max_replication_lag_to_enqueue; + extern const DatabaseReplicatedSettingsUInt64 max_retries_before_automatic_recovery; + extern const DatabaseReplicatedSettingsUInt64 wait_entry_commited_timeout_sec; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -63,8 +71,8 @@ bool DatabaseReplicatedDDLWorker::initializeMainThread() break; } - if (database->db_settings.max_retries_before_automatic_recovery && - database->db_settings.max_retries_before_automatic_recovery <= subsequent_errors_count) + if (database->db_settings[DatabaseReplicatedSetting::max_retries_before_automatic_recovery] + && database->db_settings[DatabaseReplicatedSetting::max_retries_before_automatic_recovery] <= subsequent_errors_count) { String current_task_name; { @@ -155,7 +163,7 @@ void DatabaseReplicatedDDLWorker::initializeReplication() bool is_new_replica = our_log_ptr == 0; bool lost_according_to_log_ptr = our_log_ptr + logs_to_keep < max_log_ptr; - bool lost_according_to_digest = database->db_settings.check_consistency && local_digest != digest; + bool lost_according_to_digest = database->db_settings[DatabaseReplicatedSetting::check_consistency] && local_digest != digest; if (is_new_replica || lost_according_to_log_ptr || lost_according_to_digest) { @@ -306,7 +314,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr UInt32 our_log_ptr = getLogPointer(); UInt32 max_log_ptr = parse(zookeeper->get(database->zookeeper_path + "/max_log_ptr")); - if (our_log_ptr + database->db_settings.max_replication_lag_to_enqueue < max_log_ptr) + if (our_log_ptr + 
database->db_settings[DatabaseReplicatedSetting::max_replication_lag_to_enqueue] < max_log_ptr) throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, " "because it has replication lag of {} queries. Try other replica.", max_log_ptr - our_log_ptr); @@ -399,7 +407,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); constexpr size_t wait_time_ms = 1000; - size_t max_iterations = database->db_settings.wait_entry_commited_timeout_sec; + size_t max_iterations = database->db_settings[DatabaseReplicatedSetting::wait_entry_commited_timeout_sec]; size_t iteration = 0; while (!wait_committed_or_failed->tryWait(wait_time_ms)) diff --git a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp index 8bab8a82017..fa9ec7288d8 100644 --- a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp +++ b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp @@ -1,4 +1,5 @@ #include +#include #if USE_LIBPQXX @@ -37,6 +38,11 @@ namespace Setting extern const SettingsUInt64 postgresql_connection_attempt_timeout; } +namespace MaterializedPostgreSQLSetting +{ + extern const MaterializedPostgreSQLSettingsString materialized_postgresql_tables_list; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -362,7 +368,7 @@ void DatabaseMaterializedPostgreSQL::attachTable(ContextPtr context_, const Stri try { - auto tables_to_replicate = settings->materialized_postgresql_tables_list.value; + auto tables_to_replicate = (*settings)[MaterializedPostgreSQLSetting::materialized_postgresql_tables_list].value; if (tables_to_replicate.empty()) tables_to_replicate = getFormattedTablesList(); diff --git a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h index cf1333d03c8..7506748cdf7 100644 --- a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h +++ b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h @@ -5,7 +5,6 @@ #if USE_LIBPQXX #include -#include #include #include @@ -18,6 +17,7 @@ namespace DB { +struct MaterializedPostgreSQLSettings; class PostgreSQLConnection; using PostgreSQLConnectionPtr = std::shared_ptr; diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 5cae4936dea..79c27839eb6 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -158,21 +158,6 @@ S3::URI getS3URI(const Poco::Util::AbstractConfiguration & config, const std::st return uri; } -void checkS3Capabilities( - S3ObjectStorage & storage, const S3Capabilities s3_capabilities, const String & name) -{ - /// If `support_batch_delete` is turned on (default), check and possibly switch it off. - if (s3_capabilities.support_batch_delete && !checkBatchRemove(storage)) - { - LOG_WARNING( - getLogger("S3ObjectStorage"), - "Storage for disk {} does not support batch delete operations, " - "so `s3_capabilities.support_batch_delete` was automatically turned off during the access check. 
" - "To remove this message set `s3_capabilities.support_batch_delete` for the disk to `false`.", - name); - storage.setCapabilitiesSupportBatchDelete(false); - } -} } static std::string getEndpoint( @@ -192,7 +177,7 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const ContextPtr & context, - bool skip_access_check) -> ObjectStoragePtr + bool /* skip_access_check */) -> ObjectStoragePtr { auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); @@ -204,10 +189,6 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) auto object_storage = createObjectStorage( ObjectStorageType::S3, config, config_prefix, std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); - /// NOTE: should we still perform this check for clickhouse-disks? - if (!skip_access_check) - checkS3Capabilities(*dynamic_cast(object_storage.get()), s3_capabilities, name); - return object_storage; }); } @@ -221,7 +202,7 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const ContextPtr & context, - bool skip_access_check) -> ObjectStoragePtr + bool /* skip_access_check */) -> ObjectStoragePtr { /// send_metadata changes the filenames (includes revision), while /// s3_plain do not care about this, and expect that the file name @@ -241,10 +222,6 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto object_storage = std::make_shared>( std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); - /// NOTE: should we still perform this check for clickhouse-disks? - if (!skip_access_check) - checkS3Capabilities(*dynamic_cast(object_storage.get()), s3_capabilities, name); - return object_storage; }); } @@ -259,7 +236,7 @@ void registerS3PlainRewritableObjectStorage(ObjectStorageFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const ContextPtr & context, - bool skip_access_check) -> ObjectStoragePtr + bool /* skip_access_check */) -> ObjectStoragePtr { /// send_metadata changes the filenames (includes revision), while /// s3_plain_rewritable does not support file renaming. @@ -277,10 +254,6 @@ void registerS3PlainRewritableObjectStorage(ObjectStorageFactory & factory) auto object_storage = std::make_shared>( std::move(metadata_storage_metrics), std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); - /// NOTE: should we still perform this check for clickhouse-disks? 
- if (!skip_access_check) - checkS3Capabilities(*dynamic_cast(object_storage.get()), s3_capabilities, name); - return object_storage; }); } diff --git a/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp b/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp index b20a2940e47..1bb33f56445 100644 --- a/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp +++ b/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp @@ -2,8 +2,6 @@ #if USE_AWS_S3 #include -#include -#include #include namespace DB @@ -11,7 +9,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; } ObjectStorageKeysGeneratorPtr getKeyGenerator( @@ -65,58 +62,6 @@ ObjectStorageKeysGeneratorPtr getKeyGenerator( return createObjectStorageKeysGeneratorByTemplate(object_key_template); } -static String getServerUUID() -{ - UUID server_uuid = ServerUUID::get(); - if (server_uuid == UUIDHelpers::Nil) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Server UUID is not initialized"); - return toString(server_uuid); -} - -bool checkBatchRemove(S3ObjectStorage & storage) -{ - /// NOTE: Here we are going to write and later drop some key. - /// We are using generateObjectKeyForPath() which returns random object key. - /// That generated key is placed in a right directory where we should have write access. - const String path = fmt::format("clickhouse_remove_objects_capability_{}", getServerUUID()); - const auto key = storage.generateObjectKeyForPath(path, {} /* key_prefix */); - StoredObject object(key.serialize(), path); - try - { - auto file = storage.writeObject(object, WriteMode::Rewrite); - file->write("test", 4); - file->finalize(); - } - catch (...) - { - try - { - storage.removeObject(object); - } - catch (...) // NOLINT(bugprone-empty-catch) - { - } - /// We don't have write access, therefore no information about batch remove. - return true; - } - try - { - /// Uses `DeleteObjects` request (batch delete). - storage.removeObjects({object}); - return true; - } - catch (const Exception &) - { - try - { - storage.removeObject(object); - } - catch (...) 
// NOLINT(bugprone-empty-catch) - { - } - return false; - } -} } #endif diff --git a/src/Disks/ObjectStorages/S3/DiskS3Utils.h b/src/Disks/ObjectStorages/S3/DiskS3Utils.h index e780f20d13c..42c169457f4 100644 --- a/src/Disks/ObjectStorages/S3/DiskS3Utils.h +++ b/src/Disks/ObjectStorages/S3/DiskS3Utils.h @@ -16,9 +16,6 @@ ObjectStorageKeysGeneratorPtr getKeyGenerator( const Poco::Util::AbstractConfiguration & config, const String & config_prefix); -class S3ObjectStorage; -bool checkBatchRemove(S3ObjectStorage & storage); - } #endif diff --git a/src/Disks/ObjectStorages/S3/S3Capabilities.cpp b/src/Disks/ObjectStorages/S3/S3Capabilities.cpp deleted file mode 100644 index 3e9a7d729b3..00000000000 --- a/src/Disks/ObjectStorages/S3/S3Capabilities.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include - -namespace DB -{ - -S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) -{ - return S3Capabilities - { - .support_batch_delete = config.getBool(config_prefix + ".support_batch_delete", true), - .support_proxy = config.getBool(config_prefix + ".support_proxy", config.has(config_prefix + ".proxy")), - }; -} - -} diff --git a/src/Disks/ObjectStorages/S3/S3Capabilities.h b/src/Disks/ObjectStorages/S3/S3Capabilities.h deleted file mode 100644 index 622558f5d23..00000000000 --- a/src/Disks/ObjectStorages/S3/S3Capabilities.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -/// Supported/unsupported features by different S3 implementations -/// Can be useful only for almost compatible with AWS S3 versions. -struct S3Capabilities -{ - /// Google S3 implementation doesn't support batch delete - /// TODO: possibly we have to use Google SDK https://github.com/googleapis/google-cloud-cpp/tree/main/google/cloud/storage - /// because looks like it misses some features: - /// 1) batch delete (DeleteObjects) - /// 2) upload part copy (UploadPartCopy) - bool support_batch_delete{true}; - - /// Y.Cloud S3 implementation support proxy for connection - bool support_proxy{false}; -}; - -S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); - -} diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 0a7f659ee7b..cd36429d0a2 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ namespace ProfileEvents { - extern const Event S3DeleteObjects; extern const Event S3ListObjects; extern const Event DiskS3DeleteObjects; extern const Event DiskS3ListObjects; @@ -73,20 +73,6 @@ void throwIfError(const Aws::Utils::Outcome & response) } } -template -void throwIfUnexpectedError(const Aws::Utils::Outcome & response, bool if_exists) -{ - /// In this case even if absence of key may be ok for us, - /// the log will be polluted with error messages from aws sdk. - /// Looks like there is no way to suppress them. 
- - if (!response.IsSuccess() && (!if_exists || !S3::isNotFoundError(response.GetError().GetErrorType()))) - { - const auto & err = response.GetError(); - throw S3Exception(err.GetErrorType(), "{} (Code: {})", err.GetMessage(), static_cast(err.GetErrorType())); - } -} - template void logIfError(const Aws::Utils::Outcome & response, std::function && msg) { @@ -300,21 +286,11 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet void S3ObjectStorage::removeObjectImpl(const StoredObject & object, bool if_exists) { - ProfileEvents::increment(ProfileEvents::S3DeleteObjects); - ProfileEvents::increment(ProfileEvents::DiskS3DeleteObjects); + auto blob_storage_log = BlobStorageLogWriter::create(disk_name); - S3::DeleteObjectRequest request; - request.SetBucket(uri.bucket); - request.SetKey(object.remote_path); - auto outcome = client.get()->DeleteObject(request); - if (auto blob_storage_log = BlobStorageLogWriter::create(disk_name)) - blob_storage_log->addEvent(BlobStorageLogElement::EventType::Delete, - uri.bucket, object.remote_path, object.local_path, object.bytes_size, - outcome.IsSuccess() ? nullptr : &outcome.GetError()); - - throwIfUnexpectedError(outcome, if_exists); - - LOG_DEBUG(log, "Object with path {} was removed from S3", object.remote_path); + deleteFileFromS3(client.get(), uri.bucket, object.remote_path, if_exists, + blob_storage_log, object.local_path, object.bytes_size, + ProfileEvents::DiskS3DeleteObjects); } void S3ObjectStorage::removeObjectsImpl(const StoredObjects & objects, bool if_exists) @@ -322,59 +298,31 @@ void S3ObjectStorage::removeObjectsImpl(const StoredObjects & objects, bool if_e if (objects.empty()) return; - if (!s3_capabilities.support_batch_delete) + Strings keys; + keys.reserve(objects.size()); + for (const auto & object : objects) + keys.push_back(object.remote_path); + + auto blob_storage_log = BlobStorageLogWriter::create(disk_name); + Strings local_paths_for_blob_storage_log; + std::vector file_sizes_for_blob_storage_log; + if (blob_storage_log) { + local_paths_for_blob_storage_log.reserve(objects.size()); + file_sizes_for_blob_storage_log.reserve(objects.size()); for (const auto & object : objects) - removeObjectImpl(object, if_exists); - } - else - { - auto settings_ptr = s3_settings.get(); - - size_t chunk_size_limit = settings_ptr->objects_chunk_size_to_delete; - size_t current_position = 0; - - auto blob_storage_log = BlobStorageLogWriter::create(disk_name); - while (current_position < objects.size()) { - std::vector current_chunk; - String keys; - size_t first_position = current_position; - for (; current_position < objects.size() && current_chunk.size() < chunk_size_limit; ++current_position) - { - Aws::S3::Model::ObjectIdentifier obj; - obj.SetKey(objects[current_position].remote_path); - current_chunk.push_back(obj); - - if (!keys.empty()) - keys += ", "; - keys += objects[current_position].remote_path; - } - - Aws::S3::Model::Delete delkeys; - delkeys.SetObjects(current_chunk); - - ProfileEvents::increment(ProfileEvents::S3DeleteObjects); - ProfileEvents::increment(ProfileEvents::DiskS3DeleteObjects); - S3::DeleteObjectsRequest request; - request.SetBucket(uri.bucket); - request.SetDelete(delkeys); - auto outcome = client.get()->DeleteObjects(request); - - if (blob_storage_log) - { - const auto * outcome_error = outcome.IsSuccess() ? 
nullptr : &outcome.GetError(); - auto time_now = std::chrono::system_clock::now(); - for (size_t i = first_position; i < current_position; ++i) - blob_storage_log->addEvent(BlobStorageLogElement::EventType::Delete, - uri.bucket, objects[i].remote_path, objects[i].local_path, objects[i].bytes_size, - outcome_error, time_now); - } - - LOG_DEBUG(log, "Objects with paths [{}] were removed from S3", keys); - throwIfUnexpectedError(outcome, if_exists); + local_paths_for_blob_storage_log.push_back(object.local_path); + file_sizes_for_blob_storage_log.push_back(object.bytes_size); } } + + auto settings_ptr = s3_settings.get(); + + deleteFilesFromS3(client.get(), uri.bucket, keys, if_exists, + s3_capabilities, settings_ptr->objects_chunk_size_to_delete, + blob_storage_log, local_paths_for_blob_storage_log, file_sizes_for_blob_storage_log, + ProfileEvents::DiskS3DeleteObjects); } void S3ObjectStorage::removeObject(const StoredObject & object) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index ef9da8a948e..66b5d7455ba 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -5,8 +5,8 @@ #if USE_AWS_S3 #include -#include #include +#include #include #include #include @@ -148,8 +148,6 @@ public: bool isRemote() const override { return true; } - void setCapabilitiesSupportBatchDelete(bool value) { s3_capabilities.support_batch_delete = value; } - std::unique_ptr cloneObjectStorage( const std::string & new_namespace, const Poco::Util::AbstractConfiguration & config, diff --git a/src/Functions/FunctionBase58Conversion.h b/src/Functions/FunctionBase58Conversion.h index 5221c226f23..5fe02fb918b 100644 --- a/src/Functions/FunctionBase58Conversion.h +++ b/src/Functions/FunctionBase58Conversion.h @@ -1,6 +1,8 @@ #pragma once + #include #include +#include #include #include #include @@ -16,23 +18,20 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int BAD_ARGUMENTS; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } struct Base58Encode { static constexpr auto name = "base58Encode"; - static void process(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count) + static void processString(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count) { auto & dst_data = dst_column->getChars(); auto & dst_offsets = dst_column->getOffsets(); /// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/], /// and we take double scale to avoid any reallocation. - size_t max_result_size = static_cast(ceil(2 * src_column.getChars().size() + 1)); dst_data.resize(max_result_size); @@ -61,6 +60,37 @@ struct Base58Encode dst_data.resize(current_dst_offset); } + + static void processFixedString(const ColumnFixedString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count) + { + auto & dst_data = dst_column->getChars(); + auto & dst_offsets = dst_column->getOffsets(); + + /// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/], + /// and we take double scale to avoid any reallocation. 
+ size_t max_result_size = static_cast(ceil(2 * src_column.getChars().size() + 1)); + + dst_data.resize(max_result_size); + dst_offsets.resize(input_rows_count); + + const auto * src = src_column.getChars().data(); + auto * dst = dst_data.data(); + + size_t N = src_column.getN(); + size_t current_dst_offset = 0; + + for (size_t row = 0; row < input_rows_count; ++row) + { + size_t encoded_size = encodeBase58(&src[row * N], N, &dst[current_dst_offset]); + current_dst_offset += encoded_size; + dst[current_dst_offset] = 0; + ++current_dst_offset; + + dst_offsets[row] = current_dst_offset; + } + + dst_data.resize(current_dst_offset); + } }; enum class Base58DecodeErrorHandling : uint8_t @@ -74,14 +104,13 @@ struct Base58Decode { static constexpr auto name = Name::name; - static void process(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count) + static void processString(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count) { auto & dst_data = dst_column->getChars(); auto & dst_offsets = dst_column->getOffsets(); /// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/], /// and decoded value will be no longer than source. - size_t max_result_size = src_column.getChars().size() + 1; dst_data.resize(max_result_size); @@ -118,6 +147,45 @@ struct Base58Decode dst_data.resize(current_dst_offset); } + + static void processFixedString(const ColumnFixedString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count) + { + auto & dst_data = dst_column->getChars(); + auto & dst_offsets = dst_column->getOffsets(); + + /// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/], + /// and decoded value will be no longer than source. + size_t max_result_size = src_column.getChars().size() + 1; + + dst_data.resize(max_result_size); + dst_offsets.resize(input_rows_count); + + const auto * src = src_column.getChars().data(); + auto * dst = dst_data.data(); + + size_t N = src_column.getN(); + size_t current_dst_offset = 0; + + for (size_t row = 0; row < input_rows_count; ++row) + { + std::optional decoded_size = decodeBase58(&src[row * N], N, &dst[current_dst_offset]); + if (!decoded_size) + { + if constexpr (ErrorHandling == Base58DecodeErrorHandling::ThrowException) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid Base58 value, cannot be decoded"); + else + decoded_size = 0; + } + + current_dst_offset += *decoded_size; + dst[current_dst_offset] = 0; + ++current_dst_offset; + + dst_offsets[row] = current_dst_offset; + } + + dst_data.resize(current_dst_offset); + } }; template @@ -135,14 +203,10 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (arguments.size() != 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Wrong number of arguments for function {}: 1 expected.", getName()); - - if (!isString(arguments[0].type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of first argument of function {}. 
Must be String.", - arguments[0].type->getName(), getName()); + FunctionArgumentDescriptors args{ + {"arg", static_cast(&isStringOrFixedString), nullptr, "String or FixedString"} + }; + validateFunctionArguments(*this, arguments, args); return std::make_shared(); } @@ -154,19 +218,25 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const ColumnPtr column_string = arguments[0].column; - const ColumnString * input = checkAndGetColumn(column_string.get()); - if (!input) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of first argument of function {}, must be String", - arguments[0].column->getName(), getName()); + const ColumnPtr col = arguments[0].column; - auto dst_column = ColumnString::create(); + if (const ColumnString * col_string = checkAndGetColumn(col.get())) + { + auto col_res = ColumnString::create(); + Func::processString(*col_string, col_res, input_rows_count); + return col_res; + } + else if (const ColumnFixedString * col_fixed_string = checkAndGetColumn(col.get())) + { + auto col_res = ColumnString::create(); + Func::processFixedString(*col_fixed_string, col_res, input_rows_count); + return col_res; + } - Func::process(*input, dst_column, input_rows_count); - - return dst_column; + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}, must be String or FixedString", + arguments[0].column->getName(), getName()); } }; } diff --git a/src/Functions/ReplaceRegexpImpl.h b/src/Functions/ReplaceRegexpImpl.h index 14f5a2d7932..b5572d2eaa4 100644 --- a/src/Functions/ReplaceRegexpImpl.h +++ b/src/Functions/ReplaceRegexpImpl.h @@ -13,7 +13,6 @@ namespace DB namespace ErrorCodes { - extern const int ARGUMENT_OUT_OF_BOUND; extern const int BAD_ARGUMENTS; } @@ -205,7 +204,11 @@ struct ReplaceRegexpImpl size_t input_rows_count) { if (needle.empty()) - throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + { + res_data.assign(haystack_data); + res_offsets.assign(haystack_offsets); + return; + } ColumnString::Offset res_offset = 0; res_data.reserve(haystack_data.size()); @@ -240,7 +243,7 @@ struct ReplaceRegexpImpl for (size_t i = 0; i < input_rows_count; ++i) { - size_t from = i > 0 ? haystack_offsets[i - 1] : 0; + size_t from = haystack_offsets[i - 1]; const char * hs_data = reinterpret_cast(haystack_data.data() + from); const size_t hs_length = static_cast(haystack_offsets[i] - from - 1); @@ -271,17 +274,24 @@ struct ReplaceRegexpImpl for (size_t i = 0; i < input_rows_count; ++i) { - size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0; + size_t hs_from = haystack_offsets[i - 1]; const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from); const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1); - size_t ndl_from = i > 0 ? 
needle_offsets[i - 1] : 0; + size_t ndl_from = needle_offsets[i - 1]; const char * ndl_data = reinterpret_cast(needle_data.data() + ndl_from); const size_t ndl_length = static_cast(needle_offsets[i] - ndl_from - 1); std::string_view needle(ndl_data, ndl_length); if (needle.empty()) - throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + { + res_data.insert(res_data.end(), hs_data, hs_data + hs_length); + res_data.push_back(0); + + res_offset += hs_length + 1; + res_offsets[i] = res_offset; + continue; + } re2::RE2 searcher(needle, regexp_options); if (!searcher.ok()) @@ -308,7 +318,11 @@ struct ReplaceRegexpImpl assert(haystack_offsets.size() == replacement_offsets.size()); if (needle.empty()) - throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + { + res_data.assign(haystack_data); + res_offsets.assign(haystack_offsets); + return; + } ColumnString::Offset res_offset = 0; res_data.reserve(haystack_data.size()); @@ -325,11 +339,11 @@ struct ReplaceRegexpImpl for (size_t i = 0; i < input_rows_count; ++i) { - size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0; + size_t hs_from = haystack_offsets[i - 1]; const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from); const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1); - size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0; + size_t repl_from = replacement_offsets[i - 1]; const char * repl_data = reinterpret_cast(replacement_data.data() + repl_from); const size_t repl_length = static_cast(replacement_offsets[i] - repl_from - 1); std::string_view replacement(repl_data, repl_length); @@ -364,19 +378,25 @@ struct ReplaceRegexpImpl for (size_t i = 0; i < input_rows_count; ++i) { - size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0; + size_t hs_from = haystack_offsets[i - 1]; const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from); const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1); - size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0; + size_t ndl_from = needle_offsets[i - 1]; const char * ndl_data = reinterpret_cast(needle_data.data() + ndl_from); const size_t ndl_length = static_cast(needle_offsets[i] - ndl_from - 1); std::string_view needle(ndl_data, ndl_length); if (needle.empty()) - throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + { + res_data.insert(res_data.end(), hs_data, hs_data + hs_length); + res_data.push_back(0); + res_offsets[i] = res_offsets[i - 1] + hs_length + 1; + res_offset = res_offsets[i]; + continue; + } - size_t repl_from = i > 0 ? 
replacement_offsets[i - 1] : 0; + size_t repl_from = replacement_offsets[i - 1]; const char * repl_data = reinterpret_cast(replacement_data.data() + repl_from); const size_t repl_length = static_cast(replacement_offsets[i] - repl_from - 1); std::string_view replacement(repl_data, repl_length); @@ -403,7 +423,21 @@ struct ReplaceRegexpImpl size_t input_rows_count) { if (needle.empty()) - throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + { + chassert(input_rows_count == haystack_data.size() / n); + /// Since ColumnFixedString does not have a zero byte at the end, while ColumnString does, + /// we need to split haystack_data into strings of length n, add 1 zero byte to the end of each string + /// and then copy to res_data, ref: ColumnString.h and ColumnFixedString.h + res_data.reserve(haystack_data.size() + input_rows_count); + res_offsets.resize(input_rows_count); + for (size_t i = 0; i < input_rows_count; ++i) + { + res_data.insert(res_data.end(), haystack_data.begin() + i * n, haystack_data.begin() + (i + 1) * n); + res_data.push_back(0); + res_offsets[i] = res_offsets[i - 1] + n + 1; + } + return; + } ColumnString::Offset res_offset = 0; res_data.reserve(haystack_data.size()); diff --git a/src/IO/S3/S3Capabilities.cpp b/src/IO/S3/S3Capabilities.cpp new file mode 100644 index 00000000000..6bb2408d37d --- /dev/null +++ b/src/IO/S3/S3Capabilities.cpp @@ -0,0 +1,47 @@ +#include + +#include + + +namespace DB +{ + +S3Capabilities::S3Capabilities(const S3Capabilities & src) + : S3Capabilities(src.isBatchDeleteSupported(), src.support_proxy) +{ +} + +std::optional S3Capabilities::isBatchDeleteSupported() const +{ + std::lock_guard lock{mutex}; + return support_batch_delete; +} + +void S3Capabilities::setIsBatchDeleteSupported(bool support_batch_delete_) +{ + std::lock_guard lock{mutex}; + + if (support_batch_delete.has_value() && (support_batch_delete.value() != support_batch_delete_)) + { + LOG_ERROR(getLogger("S3Capabilities"), + "Got different results ({} vs {}) from checking if the cloud storage supports batch delete (DeleteObjects), " + "the cloud storage API may be unstable", + support_batch_delete.value(), support_batch_delete_); + chassert(false && "Got different results from checking if the cloud storage supports batch delete"); + } + + support_batch_delete = support_batch_delete_; +} + +S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) +{ + std::optional support_batch_delete; + if (config.has(config_prefix + ".support_batch_delete")) + support_batch_delete = config.getBool(config_prefix + ".support_batch_delete"); + + bool support_proxy = config.getBool(config_prefix + ".support_proxy", config.has(config_prefix + ".proxy")); + + return S3Capabilities{support_batch_delete, support_proxy}; +} + +} diff --git a/src/IO/S3/S3Capabilities.h b/src/IO/S3/S3Capabilities.h new file mode 100644 index 00000000000..869453d412b --- /dev/null +++ b/src/IO/S3/S3Capabilities.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include +#include + + +namespace DB +{ + +/// Supported/unsupported features by different S3 implementations +/// Can be useful only for almost compatible with AWS S3 versions. 
+class S3Capabilities +{ +public: + explicit S3Capabilities(std::optional support_batch_delete_ = {}, bool support_proxy_ = false) + : support_proxy(support_proxy_), support_batch_delete(support_batch_delete_) + { + } + + S3Capabilities(const S3Capabilities & src); + + /// Google S3 implementation doesn't support batch delete + /// TODO: possibly we have to use Google SDK https://github.com/googleapis/google-cloud-cpp/tree/main/google/cloud/storage + /// because looks like it misses some features: + /// 1) batch delete (DeleteObjects) + /// 2) upload part copy (UploadPartCopy) + /// If `isBatchDeleteSupported()` returns `nullopt` it means that it isn't clear yet if it's supported or not + /// and should be detected automatically from responses of the cloud storage. + std::optional isBatchDeleteSupported() const; + void setIsBatchDeleteSupported(bool support_batch_delete_); + + /// Y.Cloud S3 implementation support proxy for connection + const bool support_proxy{false}; + +private: + /// `support_batch_delete` is guarded by mutex because function deleteFilesFromS3() can update this field from another thread. + /// If `support_batch_delete == nullopt` that means it's not clear yet if it's supported or not. + std::optional support_batch_delete TSA_GUARDED_BY(mutex); + + mutable std::mutex mutex; +}; + +S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); + +} diff --git a/src/IO/S3/deleteFileFromS3.cpp b/src/IO/S3/deleteFileFromS3.cpp new file mode 100644 index 00000000000..0554bb295bb --- /dev/null +++ b/src/IO/S3/deleteFileFromS3.cpp @@ -0,0 +1,274 @@ +#include + +#if USE_AWS_S3 + +#include +#include +#include +#include +#include +#include + + +namespace ProfileEvents +{ + extern const Event S3DeleteObjects; +} + + +namespace DB +{ + +void deleteFileFromS3( + const std::shared_ptr & s3_client, + const String & bucket, + const String & key, + bool if_exists, + BlobStorageLogWriterPtr blob_storage_log, + const String & local_path_for_blob_storage_log, + size_t file_size_for_blob_storage_log, + std::optional profile_event) +{ + S3::DeleteObjectRequest request; + request.SetBucket(bucket); + request.SetKey(key); + + ProfileEvents::increment(ProfileEvents::S3DeleteObjects); + if (profile_event && *profile_event != ProfileEvents::S3DeleteObjects) + ProfileEvents::increment(*profile_event); + + auto outcome = s3_client->DeleteObject(request); + + auto log = getLogger("deleteFileFromS3"); + + if (blob_storage_log) + { + LOG_TRACE(log, "Writing Delete operation for blob {}", key); + blob_storage_log->addEvent(BlobStorageLogElement::EventType::Delete, + bucket, key, + local_path_for_blob_storage_log, file_size_for_blob_storage_log, + outcome.IsSuccess() ? nullptr : &outcome.GetError()); + } + else + { + LOG_TRACE(log, "No blob storage log, not writing blob {}", key); + } + + if (outcome.IsSuccess()) + { + LOG_INFO(log, "Object with path {} was removed from S3", key); + } + else if (if_exists && S3::isNotFoundError(outcome.GetError().GetErrorType())) + { + /// In this case even if absence of key may be ok for us, the log will be polluted with error messages from aws sdk. + /// Looks like there is no way to suppress them. 
+ LOG_TRACE(log, "Object with path {} was skipped because it didn't exist", key); + } + else + { + const auto & err = outcome.GetError(); + throw S3Exception(err.GetErrorType(), "{} (Code: {}) while removing object with path {} from S3", + err.GetMessage(), static_cast(err.GetErrorType()), key); + } +} + + +void deleteFilesFromS3( + const std::shared_ptr & s3_client, + const String & bucket, + const Strings & keys, + bool if_exists, + S3Capabilities & s3_capabilities, + size_t batch_size, + BlobStorageLogWriterPtr blob_storage_log, + const Strings & local_paths_for_blob_storage_log, + const std::vector & file_sizes_for_blob_storage_log, + std::optional profile_event) +{ + chassert(local_paths_for_blob_storage_log.empty() || (local_paths_for_blob_storage_log.size() == keys.size())); + chassert(file_sizes_for_blob_storage_log.empty() || (file_sizes_for_blob_storage_log.size() == keys.size())); + + if (keys.empty()) + return; /// Nothing to delete. + + /// We're trying batch delete (DeleteObjects) first. + bool try_batch_delete = true; + { + if (keys.size() == 1) + try_batch_delete = false; /// We're deleting one file - there is no need for batch delete. + else if (batch_size < 2) + try_batch_delete = false; /// Can't do batch delete with such small batches. + else if (auto support_batch_delete = s3_capabilities.isBatchDeleteSupported(); + support_batch_delete.has_value() && !support_batch_delete.value()) + try_batch_delete = false; /// Support for batch delete is disabled. + } + + auto log = getLogger("deleteFileFromS3"); + const String empty_string; + + if (try_batch_delete) + { + bool need_retry_with_plain_delete_object = false; + size_t current_position = 0; + + while (current_position < keys.size()) + { + std::vector current_chunk; + String comma_separated_keys; + size_t first_position = current_position; + for (; current_position < keys.size() && current_chunk.size() < batch_size; ++current_position) + { + Aws::S3::Model::ObjectIdentifier obj; + obj.SetKey(keys[current_position]); + current_chunk.push_back(obj); + + if (!comma_separated_keys.empty()) + comma_separated_keys += ", "; + comma_separated_keys += keys[current_position]; + } + + Aws::S3::Model::Delete delkeys; + delkeys.SetObjects(current_chunk); + delkeys.SetQuiet(true); + + S3::DeleteObjectsRequest request; + request.SetBucket(bucket); + request.SetDelete(delkeys); + + ProfileEvents::increment(ProfileEvents::S3DeleteObjects); + if (profile_event && *profile_event != ProfileEvents::S3DeleteObjects) + ProfileEvents::increment(*profile_event); + + auto outcome = s3_client->DeleteObjects(request); + + if (blob_storage_log) + { + const auto * outcome_error = outcome.IsSuccess() ? nullptr : &outcome.GetError(); + auto time_now = std::chrono::system_clock::now(); + LOG_TRACE(log, "Writing Delete operation for blobs [{}]", comma_separated_keys); + for (size_t i = first_position; i < current_position; ++i) + { + const String & local_path_for_blob_storage_log = (i < local_paths_for_blob_storage_log.size()) ? local_paths_for_blob_storage_log[i] : empty_string; + size_t file_size_for_blob_storage_log = (i < file_sizes_for_blob_storage_log.size()) ? 
file_sizes_for_blob_storage_log[i] : 0; + + blob_storage_log->addEvent(BlobStorageLogElement::EventType::Delete, + bucket, keys[i], + local_path_for_blob_storage_log, file_size_for_blob_storage_log, + outcome_error, time_now); + } + } + else + { + LOG_TRACE(log, "No blob storage log, not writing blobs [{}]", comma_separated_keys); + } + + if (outcome.IsSuccess()) + { + /// DeleteObjects succeeded, that means some objects were removed (but maybe not all the objects). + /// Multiple threads can call deleteFilesFromS3() with a reference to the same `s3_capabilities`, + /// and the following line doesn't cause a race because `s3_capabilities` is protected with mutex. + s3_capabilities.setIsBatchDeleteSupported(true); + + const auto & errors = outcome.GetResult().GetErrors(); + if (errors.empty()) + { + /// All the objects were removed. + LOG_INFO(log, "Objects with paths [{}] were removed from S3", comma_separated_keys); + } + else + { + /// Mixed success/error response - some objects were removed, and some were not. + /// We need to extract more detailed information from the outcome. + std::unordered_set removed_keys{keys.begin(), keys.end()}; + String not_found_keys; + std::exception_ptr other_error; + + for (const auto & err : errors) + { + removed_keys.erase(err.GetKey()); + auto error_type = static_cast(Aws::S3::S3ErrorMapper::GetErrorForName(err.GetCode().c_str()).GetErrorType()); + if (if_exists && S3::isNotFoundError(error_type)) + { + if (not_found_keys.empty()) + not_found_keys += ", "; + not_found_keys += err.GetKey(); + } + else if (!other_error) + { + other_error = std::make_exception_ptr( + S3Exception{error_type, "{} (Code: {}) while removing object with path {} from S3", + err.GetMessage(), err.GetCode(), err.GetKey()}); + } + } + + if (!removed_keys.empty()) + { + String removed_keys_comma_separated; + for (const auto & key : removed_keys) + { + if (!removed_keys_comma_separated.empty()) + removed_keys_comma_separated += ", "; + removed_keys_comma_separated += key; + } + LOG_INFO(log, "Objects with paths [{}] were removed from S3", removed_keys_comma_separated); + } + + if (!not_found_keys.empty()) + { + /// In this case even if absence of key may be ok for us, the log will be polluted with error messages from aws sdk. + /// Looks like there is no way to suppress them. + LOG_TRACE(log, "Object with paths [{}] were skipped because they didn't exist", not_found_keys); + } + + if (other_error) + std::rethrow_exception(other_error); + } + } + else + { + /// DeleteObjects didn't succeed, that means either a) this operation isn't supported at all; + /// or b) all the objects didn't exist; or c) some failure occurred. + const auto & err = outcome.GetError(); + if ((err.GetExceptionName() == "InvalidRequest") || (err.GetExceptionName() == "InvalidArgument") + || (err.GetExceptionName() == "NotImplemented")) + { + LOG_TRACE(log, "DeleteObjects is not supported: {} (Code: {}). Retrying with plain DeleteObject.", + err.GetMessage(), static_cast(err.GetErrorType())); + /// Multiple threads can call deleteFilesFromS3() with a reference to the same `s3_capabilities`, + /// and the following line doesn't cause a race because `s3_capabilities` is protected with mutex. 
+ s3_capabilities.setIsBatchDeleteSupported(false); + need_retry_with_plain_delete_object = true; + break; + } + + if (if_exists && S3::isNotFoundError(err.GetErrorType())) + { + LOG_TRACE(log, "Object with paths [{}] were skipped because they didn't exist", comma_separated_keys); + } + else + { + throw S3Exception(err.GetErrorType(), "{} (Code: {}) while removing objects with paths [{}] from S3", + err.GetMessage(), static_cast(err.GetErrorType()), comma_separated_keys); + } + } + } + + if (!need_retry_with_plain_delete_object) + return; + } + + /// Batch delete (DeleteObjects) isn't supported so we'll delete all the files sequentially. + for (size_t i = 0; i != keys.size(); ++i) + { + const String & local_path_for_blob_storage_log = (i < local_paths_for_blob_storage_log.size()) ? local_paths_for_blob_storage_log[i] : empty_string; + size_t file_size_for_blob_storage_log = (i < file_sizes_for_blob_storage_log.size()) ? file_sizes_for_blob_storage_log[i] : 0; + + deleteFileFromS3(s3_client, bucket, keys[i], if_exists, + blob_storage_log, local_path_for_blob_storage_log, file_size_for_blob_storage_log, + profile_event); + } +} + +} + +#endif diff --git a/src/IO/S3/deleteFileFromS3.h b/src/IO/S3/deleteFileFromS3.h new file mode 100644 index 00000000000..7fdf2454220 --- /dev/null +++ b/src/IO/S3/deleteFileFromS3.h @@ -0,0 +1,51 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include +#include +#include + + +namespace DB +{ + +namespace S3 +{ + class Client; +} + +class S3Capabilities; +class BlobStorageLogWriter; +using BlobStorageLogWriterPtr = std::shared_ptr; + + +/// Deletes one file from S3. +void deleteFileFromS3( + const std::shared_ptr & s3_client, + const String & bucket, + const String & key, + bool if_exists = false, + BlobStorageLogWriterPtr blob_storage_log = nullptr, + const String & local_path_for_blob_storage_log = {}, + size_t file_size_for_blob_storage_log = 0, + std::optional profile_event = std::nullopt); + +/// Deletes multiple files from S3 using batch requests when it's possible. 
+void deleteFilesFromS3( + const std::shared_ptr & s3_client, + const String & bucket, + const Strings & keys, + bool if_exists, + S3Capabilities & s3_capabilities, + size_t batch_size = 1000, + BlobStorageLogWriterPtr blob_storage_log = nullptr, + const Strings & local_paths_for_blob_storage_log = {}, + const std::vector & file_sizes_for_blob_storage_log = {}, + std::optional profile_event = std::nullopt); + +} + +#endif diff --git a/src/IO/S3/getObjectInfo.cpp b/src/IO/S3/getObjectInfo.cpp index 37f7410f0f4..cd8bd3ac122 100644 --- a/src/IO/S3/getObjectInfo.cpp +++ b/src/IO/S3/getObjectInfo.cpp @@ -66,7 +66,8 @@ namespace bool isNotFoundError(Aws::S3::S3Errors error) { - return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND || error == Aws::S3::S3Errors::NO_SUCH_KEY; + return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND || error == Aws::S3::S3Errors::NO_SUCH_KEY + || error == Aws::S3::S3Errors::NO_SUCH_BUCKET; } ObjectInfo getObjectInfo( diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index de72641f8f8..53d44e2f325 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -332,15 +332,20 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format const auto * literal = arguments->children[0]->as(); const auto * function = arguments->children[0]->as(); const auto * subquery = arguments->children[0]->as(); - bool is_tuple = literal && literal->value.getType() == Field::Types::Tuple; - /// Do not add parentheses for tuple literal, otherwise extra parens will be added `-((3, 7, 3), 1)` -> `-(((3, 7, 3), 1))` - bool literal_need_parens = literal && !is_tuple; + bool is_tuple = (literal && literal->value.getType() == Field::Types::Tuple) + || (function && function->name == "tuple" && function->arguments && function->arguments->children.size() > 1); + bool is_array = (literal && literal->value.getType() == Field::Types::Array) + || (function && function->name == "array"); + + /// Do not add parentheses for tuple and array literal, otherwise extra parens will be added `-((3, 7, 3), 1)` -> `-(((3, 7, 3), 1))`, `-[1]` -> `-([1])` + bool literal_need_parens = literal && !is_tuple && !is_array; /// Negate always requires parentheses, otherwise -(-1) will be printed as --1 - /// Also extra parentheses are needed for subqueries, because NOT can be parsed as a function: + /// Also extra parentheses are needed for subqueries and tuple, because NOT can be parsed as a function: /// not(SELECT 1) cannot be parsed, while not((SELECT 1)) can. + /// not((1, 2, 3)) is a function of one argument, while not(1, 2, 3) is a function of three arguments. 
bool inside_parens = (name == "negate" && (literal_need_parens || (function && function->name == "negate"))) - || (subquery && name == "not"); + || (subquery && name == "not") || (is_tuple && name == "not"); /// We DO need parentheses around a single literal /// For example, SELECT (NOT 0) + (NOT 0) cannot be transformed into SELECT NOT 0 + NOT 0, since diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp index 9bf3c3e6cbb..81df330ffb5 100644 --- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp @@ -32,12 +32,35 @@ # include # include # include -# include +# include +# include # include -# include "ArrowBufferedStreams.h" - +# include # include +# include "ArrowBufferedStreams.h" + +# include + + +namespace +{ + +class MemoryPool : public orc::MemoryPool +{ +public: + char * malloc(uint64_t size) override + { + auto * ptr = ::malloc(size); + /// For nullable columns some of the values will not be initialized. + __msan_unpoison(ptr, size); + return static_cast(ptr); + } + + void free(char * p) override { ::free(p); } +}; + +} namespace DB { @@ -724,12 +747,17 @@ buildORCSearchArgument(const KeyCondition & key_condition, const Block & header, } static void getFileReader( - ReadBuffer & in, std::unique_ptr & file_reader, const FormatSettings & format_settings, std::atomic & is_stopped) + ReadBuffer & in, + std::unique_ptr & file_reader, + orc::MemoryPool & pool, + const FormatSettings & format_settings, + std::atomic & is_stopped) { if (is_stopped) return; orc::ReaderOptions options; + options.setMemoryPool(pool); auto input_stream = asORCInputStream(in, format_settings, is_stopped); file_reader = orc::createReader(std::move(input_stream), options); } @@ -875,6 +903,7 @@ static void updateIncludeTypeIds( NativeORCBlockInputFormat::NativeORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) : IInputFormat(std::move(header_), &in_) + , memory_pool(std::make_unique()) , block_missing_values(getPort().getHeader().columns()) , format_settings(format_settings_) , skip_stripes(format_settings.orc.skip_stripes) @@ -883,7 +912,7 @@ NativeORCBlockInputFormat::NativeORCBlockInputFormat(ReadBuffer & in_, Block hea void NativeORCBlockInputFormat::prepareFileReader() { - getFileReader(*in, file_reader, format_settings, is_stopped); + getFileReader(*in, file_reader, *memory_pool, format_settings, is_stopped); if (is_stopped) return; @@ -1027,7 +1056,8 @@ NamesAndTypesList NativeORCSchemaReader::readSchema() { std::unique_ptr file_reader; std::atomic is_stopped = 0; - getFileReader(in, file_reader, format_settings, is_stopped); + MemoryPool memory_pool; + getFileReader(in, file_reader, memory_pool, format_settings, is_stopped); const auto & schema = file_reader->getType(); diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h index 7eaff7f3020..c3e4f4fd7f6 100644 --- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h @@ -8,6 +8,7 @@ # include # include # include +# include # include namespace DB @@ -70,6 +71,8 @@ private: void prepareFileReader(); bool prepareStripeReader(); + std::unique_ptr memory_pool; + std::unique_ptr file_reader; std::unique_ptr stripe_reader; std::unique_ptr orc_column_to_ch_column; diff --git a/src/Storages/MaterializedView/RefreshSettings.cpp 
b/src/Storages/MaterializedView/RefreshSettings.cpp index a00f9b398c6..079b35d6152 100644 --- a/src/Storages/MaterializedView/RefreshSettings.cpp +++ b/src/Storages/MaterializedView/RefreshSettings.cpp @@ -1,8 +1,57 @@ +#include +#include #include namespace DB { +#define LIST_OF_REFRESH_SETTINGS(M, ALIAS) \ + M(Int64, refresh_retries, 2, "How many times to retry refresh query if it fails. If all attempts fail, wait for the next refresh time according to schedule. 0 to disable retries. -1 for infinite retries.", 0) \ + M(UInt64, refresh_retry_initial_backoff_ms, 100, "Delay before the first retry if refresh query fails (if refresh_retries setting is not zero). Each subsequent retry doubles the delay, up to refresh_retry_max_backoff_ms.", 0) \ + M(UInt64, refresh_retry_max_backoff_ms, 60'000, "Limit on the exponential growth of delay between refresh attempts, if they keep failing and refresh_retries is positive.", 0) \ + M(Bool, all_replicas, /* do not change or existing tables will break */ false, "If the materialized view is in a Replicated database, and APPEND is enabled, this flag controls whether all replicas or one replica will refresh.", 0) \ + +DECLARE_SETTINGS_TRAITS(RefreshSettingsTraits, LIST_OF_REFRESH_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(RefreshSettingsTraits, LIST_OF_REFRESH_SETTINGS) +struct RefreshSettingsImpl : public BaseSettings +{ +}; + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) RefreshSettings##TYPE NAME = &RefreshSettingsImpl ::NAME; + +namespace RefreshSetting +{ +LIST_OF_REFRESH_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN + +RefreshSettings::RefreshSettings() : impl(std::make_unique()) +{ +} + +RefreshSettings::RefreshSettings(const RefreshSettings & settings) : impl(std::make_unique(*settings.impl)) +{ +} + +RefreshSettings::RefreshSettings(RefreshSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + +RefreshSettings::~RefreshSettings() = default; + +RefreshSettings & RefreshSettings::operator=(const RefreshSettings & other) +{ + *impl = *other.impl; + return *this; +} + +REFRESH_SETTINGS_SUPPORTED_TYPES(RefreshSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) + +void RefreshSettings::applyChanges(const SettingsChanges & changes) +{ + impl->applyChanges(changes); +} } diff --git a/src/Storages/MaterializedView/RefreshSettings.h b/src/Storages/MaterializedView/RefreshSettings.h index 20a2991af95..867013d7406 100644 --- a/src/Storages/MaterializedView/RefreshSettings.h +++ b/src/Storages/MaterializedView/RefreshSettings.h @@ -1,18 +1,35 @@ #pragma once -#include +#include +#include namespace DB { +struct RefreshSettingsImpl; +class SettingsChanges; -#define LIST_OF_REFRESH_SETTINGS(M, ALIAS) \ - M(Int64, refresh_retries, 2, "How many times to retry refresh query if it fails. If all attempts fail, wait for the next refresh time according to schedule. 0 to disable retries. -1 for infinite retries.", 0) \ - M(UInt64, refresh_retry_initial_backoff_ms, 100, "Delay before the first retry if refresh query fails (if refresh_retries setting is not zero). 
Each subsequent retry doubles the delay, up to refresh_retry_max_backoff_ms.", 0) \ - M(UInt64, refresh_retry_max_backoff_ms, 60'000, "Limit on the exponential growth of delay between refresh attempts, if they keep failing and refresh_retries is positive.", 0) \ - M(Bool, all_replicas, /* do not change or existing tables will break */ false, "If the materialized view is in a Replicated database, and APPEND is enabled, this flag controls whether all replicas or one replica will refresh.", 0) \ +/// List of available types supported in RabbitMQSettings object +#define REFRESH_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, Int64) \ + M(CLASS_NAME, UInt64) -DECLARE_SETTINGS_TRAITS(RefreshSettingsTraits, LIST_OF_REFRESH_SETTINGS) +REFRESH_SETTINGS_SUPPORTED_TYPES(RefreshSettings, DECLARE_SETTING_TRAIT) -struct RefreshSettings : public BaseSettings {}; +struct RefreshSettings +{ + RefreshSettings(); + RefreshSettings(const RefreshSettings & settings); + RefreshSettings(RefreshSettings && settings) noexcept; + ~RefreshSettings(); + RefreshSettings & operator=(const RefreshSettings & other); + + REFRESH_SETTINGS_SUPPORTED_TYPES(RefreshSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + + void applyChanges(const SettingsChanges & changes); + +private: + std::unique_ptr impl; +}; } diff --git a/src/Storages/MaterializedView/RefreshTask.cpp b/src/Storages/MaterializedView/RefreshTask.cpp index 6a490c63bc0..3b893d4677a 100644 --- a/src/Storages/MaterializedView/RefreshTask.cpp +++ b/src/Storages/MaterializedView/RefreshTask.cpp @@ -38,6 +38,14 @@ namespace ServerSetting extern const ServerSettingsString default_replica_path; } +namespace RefreshSetting +{ + extern const RefreshSettingsBool all_replicas; + extern const RefreshSettingsInt64 refresh_retries; + extern const RefreshSettingsUInt64 refresh_retry_initial_backoff_ms; + extern const RefreshSettingsUInt64 refresh_retry_max_backoff_ms; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -187,7 +195,7 @@ void RefreshTask::checkAlterIsPossible(const DB::ASTRefreshStrategy & new_strate RefreshSettings s; if (new_strategy.settings) s.applyChanges(new_strategy.settings->changes); - if (s.all_replicas != refresh_settings.all_replicas) + if (s[RefreshSetting::all_replicas] != refresh_settings[RefreshSetting::all_replicas]) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Altering setting 'all_replicas' is not supported."); if (new_strategy.append != refresh_append) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Adding or removing APPEND is not supported."); @@ -456,7 +464,7 @@ void RefreshTask::refreshTask() else { error_message = getCurrentExceptionMessage(true); - LOG_ERROR(log, "{}: Refresh failed (attempt {}/{}): {}", view->getStorageID().getFullTableName(), start_znode.attempt_number, refresh_settings.refresh_retries + 1, error_message); + LOG_ERROR(log, "{}: Refresh failed (attempt {}/{}): {}", view->getStorageID().getFullTableName(), start_znode.attempt_number, refresh_settings[RefreshSetting::refresh_retries] + 1, error_message); } } @@ -676,10 +684,10 @@ static std::chrono::milliseconds backoff(Int64 retry_idx, const RefreshSettings UInt64 delay_ms; UInt64 multiplier = UInt64(1) << std::min(retry_idx, Int64(62)); /// Overflow check: a*b <= c iff a <= c/b iff a <= floor(c/b). 
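The overflow-check comment above justifies the guard applied in the hunk that follows: the delay is only multiplied when `initial <= max / multiplier` proves the product cannot overflow, otherwise it saturates at the maximum. A minimal standalone sketch of this capped exponential backoff, using the defaults of `refresh_retry_initial_backoff_ms` (100) and `refresh_retry_max_backoff_ms` (60'000) declared earlier in this diff (illustrative names, not ClickHouse code):

```cpp
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iostream>

/// Standalone illustration (not ClickHouse code) of the capped exponential backoff
/// with the same overflow guard: a * b <= c iff a <= c / b (integer division).
std::chrono::milliseconds backoff_sketch(int64_t retry_idx, uint64_t initial_ms, uint64_t max_ms)
{
    uint64_t multiplier = uint64_t(1) << std::min<int64_t>(retry_idx, 62);  // 2^retry_idx, clamped
    uint64_t delay_ms;
    if (initial_ms <= max_ms / multiplier)   // safe to multiply, no overflow possible
        delay_ms = initial_ms * multiplier;
    else
        delay_ms = max_ms;                   // saturate at the configured maximum
    return std::chrono::milliseconds(delay_ms);
}

int main()
{
    /// Defaults from the settings list above: initial 100 ms, max 60'000 ms.
    for (int64_t i = 0; i < 12; ++i)
        std::cout << backoff_sketch(i, 100, 60'000).count() << " ";  // 100 200 400 ... 51200 60000 60000
    std::cout << "\n";
}
```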
- if (refresh_settings.refresh_retry_initial_backoff_ms <= refresh_settings.refresh_retry_max_backoff_ms / multiplier) - delay_ms = refresh_settings.refresh_retry_initial_backoff_ms * multiplier; + if (refresh_settings[RefreshSetting::refresh_retry_initial_backoff_ms] <= refresh_settings[RefreshSetting::refresh_retry_max_backoff_ms] / multiplier) + delay_ms = refresh_settings[RefreshSetting::refresh_retry_initial_backoff_ms] * multiplier; else - delay_ms = refresh_settings.refresh_retry_max_backoff_ms; + delay_ms = refresh_settings[RefreshSetting::refresh_retry_max_backoff_ms]; return std::chrono::milliseconds(delay_ms); } @@ -687,7 +695,7 @@ std::tuple= 0 && znode.attempt_number > refresh_settings.refresh_retries) + if (refresh_settings[RefreshSetting::refresh_retries] >= 0 && znode.attempt_number > refresh_settings[RefreshSetting::refresh_retries]) { /// Skip to the next scheduled refresh, as if a refresh succeeded. znode.last_completed_timeslot = refresh_schedule.timeslotForCompletedRefresh(znode.last_completed_timeslot, znode.last_attempt_time, znode.last_attempt_time, false); diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index 1419575f732..5a725922e14 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -42,6 +42,7 @@ namespace ErrorCodes extern const int INCORRECT_DATA; extern const int INCORRECT_NUMBER_OF_COLUMNS; extern const int INCORRECT_QUERY; + extern const int INVALID_SETTING_VALUE; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } @@ -110,7 +111,7 @@ USearchIndexWithSerialization::USearchIndexWithSerialization( { USearchIndex::metric_t metric(dimensions, metric_kind, scalar_kind); - unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, unum::usearch::default_expansion_search()); + unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, default_expansion_search); config.enable_key_lookups = false; /// we don't do row-to-vector lookups auto result = USearchIndex::make(metric, config); @@ -407,6 +408,9 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity , metric_kind(metric_kind_) , expansion_search(context->getSettingsRef()[Setting::hnsw_candidate_list_size_for_search]) { + if (expansion_search == 0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'hnsw_candidate_list_size_for_search' must not be 0"); + } bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const @@ -447,7 +451,7 @@ std::vector MergeTreeIndexConditionVectorSimilarity::calculateApproximat /// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch' search method /// to accept a custom expansion_add setting. The config value is only used on the fly, i.e. not persisted in the index. - auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, (expansion_search == 0) ? unum::usearch::default_expansion_search() : expansion_search); + auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search); if (!search_result) throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. 
Error: {}", String(search_result.error.release())); @@ -558,7 +562,7 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta /// Call Usearch's own parameter validation method for HNSW-specific parameters UInt64 connectivity = index.arguments[3].safeGet(); UInt64 expansion_add = index.arguments[4].safeGet(); - UInt64 expansion_search = unum::usearch::default_expansion_search(); + UInt64 expansion_search = default_expansion_search; unum::usearch::index_dense_config_t config(connectivity, expansion_add, expansion_search); if (auto error = config.validate(); error) diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index ecaa331466f..9a81e168393 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -11,10 +11,18 @@ namespace DB { +/// Defaults for HNSW parameters. Instead of using the default parameters provided by USearch (default_connectivity(), +/// default_expansion_add(), default_expansion_search()), we experimentally came up with our own default parameters. They provide better +/// trade-offs with regards to index construction time, search precision and queries-per-second (speed). +static constexpr size_t default_connectivity = 32; +static constexpr size_t default_expansion_add = 128; +static constexpr size_t default_expansion_search = 256; + +/// Parameters for HNSW index construction. struct UsearchHnswParams { - size_t connectivity = unum::usearch::default_connectivity(); - size_t expansion_add = unum::usearch::default_expansion_add(); + size_t connectivity = default_connectivity; + size_t expansion_add = default_expansion_add; }; using USearchIndex = unum::usearch::index_dense_t; diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index 86d95aee242..1e736077f28 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -486,8 +487,7 @@ void MergeTreeColumnSettings::validate(const SettingsChanges & changes) } } -#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ - MergeTreeSettings ## TYPE NAME = & MergeTreeSettings ## Impl :: NAME; +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) MergeTreeSettings##TYPE NAME = &MergeTreeSettingsImpl ::NAME; namespace MergeTreeSetting { @@ -511,18 +511,7 @@ MergeTreeSettings::MergeTreeSettings(MergeTreeSettings && settings) noexcept MergeTreeSettings::~MergeTreeSettings() = default; -#define IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR(CLASS_NAME, TYPE) \ - const SettingField##TYPE & MergeTreeSettings::operator[](CLASS_NAME##TYPE t) const \ - { \ - return impl.get()->*t; \ - } \ -SettingField##TYPE & MergeTreeSettings::operator[](CLASS_NAME##TYPE t) \ - { \ - return impl.get()->*t; \ - } - MERGETREE_SETTINGS_SUPPORTED_TYPES(MergeTreeSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) -#undef IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR bool MergeTreeSettings::has(std::string_view name) const { diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 24e503fd5b3..2aac7586e9a 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -144,11 +144,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, auto client = 
getClient(url, *s3_settings, context, /* for_disk_s3 */false); auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); - auto s3_capabilities = S3Capabilities - { - .support_batch_delete = config.getBool("s3.support_batch_delete", true), - .support_proxy = config.getBool("s3.support_proxy", config.has("s3.proxy")), - }; + auto s3_capabilities = getCapabilitiesFromConfig(config, "s3"); return std::make_shared( std::move(client), std::move(s3_settings), url, s3_capabilities, diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.cpp b/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.cpp index b86c4a2832c..8bab744459d 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.cpp +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.cpp @@ -41,6 +41,11 @@ namespace Setting extern const SettingsBool cloud_mode; } +namespace ObjectStorageQueueSetting +{ + extern const ObjectStorageQueueSettingsObjectStorageQueueMode mode; +} + namespace { UInt64 getCurrentTime() @@ -227,7 +232,7 @@ ObjectStorageQueueTableMetadata ObjectStorageQueueMetadata::syncWithKeeper( std::vector metadata_paths; size_t buckets_num = 0; - if (settings.mode == ObjectStorageQueueMode::ORDERED) + if (settings[ObjectStorageQueueSetting::mode] == ObjectStorageQueueMode::ORDERED) { buckets_num = getBucketsNum(table_metadata); if (buckets_num == 0) diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.h b/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.h index 0a2f4347c8c..93e726b17ab 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.h +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadata.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -17,6 +16,7 @@ namespace Poco { class Logger; } namespace DB { class StorageObjectStorageQueue; +struct ObjectStorageQueueSettings; struct ObjectStorageQueueTableMetadata; struct StorageInMemoryMetadata; using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadataFactory.h b/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadataFactory.h index a9975c526ef..bd2455097ec 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadataFactory.h +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueMetadataFactory.h @@ -1,7 +1,6 @@ #pragma once -#include -#include #include +#include namespace DB { diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.cpp b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.cpp index de73a5edac0..cb5f909b004 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.cpp +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.cpp @@ -1,8 +1,10 @@ -#include -#include +#include +#include #include #include #include +#include +#include namespace DB @@ -13,8 +15,70 @@ namespace ErrorCodes { extern const int UNKNOWN_SETTING; } +#define OBJECT_STORAGE_QUEUE_RELATED_SETTINGS(M, ALIAS) \ + M(ObjectStorageQueueMode, mode, ObjectStorageQueueMode::ORDERED, \ + "With unordered mode, the set of all already processed files is tracked with persistent nodes in ZooKeeper."
\ + "With ordered mode, only the max name of the successfully consumed file is stored.", \ 0) \ + M(ObjectStorageQueueAction, after_processing, ObjectStorageQueueAction::KEEP, "Delete or keep the file after successful processing", 0) \ + M(String, keeper_path, "", "Zookeeper node path", 0) \ + M(UInt32, loading_retries, 10, "Retry loading up to the specified number of times", 0) \ + M(UInt32, processing_threads_num, 1, "Number of processing threads", 0) \ + M(UInt32, enable_logging_to_queue_log, 1, "Enable logging to system table system.(s3/azure_)queue_log", 0) \ + M(String, last_processed_path, "", "For Ordered mode. Files that have lexicographically smaller file name are considered already processed", 0) \ + M(UInt32, tracked_file_ttl_sec, 0, "Maximum number of seconds to store processed files in ZooKeeper node (store forever by default)", 0) \ + M(UInt32, polling_min_timeout_ms, 1000, "Minimal timeout before next polling", 0) \ + M(UInt32, polling_max_timeout_ms, 10000, "Maximum timeout before next polling", 0) \ + M(UInt32, polling_backoff_ms, 1000, "Polling backoff", 0) \ + M(UInt32, tracked_files_limit, 1000, "For unordered mode. Max set size for tracking processed files in ZooKeeper", 0) \ + M(UInt32, cleanup_interval_min_ms, 60000, "For unordered mode. Polling backoff min for cleanup", 0) \ + M(UInt32, cleanup_interval_max_ms, 60000, "For unordered mode. Polling backoff max for cleanup", 0) \ + M(UInt32, buckets, 0, "Number of buckets for Ordered mode parallel processing", 0) \ + M(UInt32, max_processed_files_before_commit, 100, "Number of files which can be processed before being committed to keeper", 0) \ + M(UInt32, max_processed_rows_before_commit, 0, "Number of rows which can be processed before being committed to keeper", 0) \ + M(UInt32, max_processed_bytes_before_commit, 0, "Number of bytes which can be processed before being committed to keeper", 0) \ + M(UInt32, max_processing_time_sec_before_commit, 0, "Timeout in seconds after which to commit processed files to keeper", 0) \ + +#define LIST_OF_OBJECT_STORAGE_QUEUE_SETTINGS(M, ALIAS) \ + OBJECT_STORAGE_QUEUE_RELATED_SETTINGS(M, ALIAS) \ + LIST_OF_ALL_FORMAT_SETTINGS(M, ALIAS) + +DECLARE_SETTINGS_TRAITS(ObjectStorageQueueSettingsTraits, LIST_OF_OBJECT_STORAGE_QUEUE_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(ObjectStorageQueueSettingsTraits, LIST_OF_OBJECT_STORAGE_QUEUE_SETTINGS) +struct ObjectStorageQueueSettingsImpl : public BaseSettings +{ +}; + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ + ObjectStorageQueueSettings##TYPE NAME = &ObjectStorageQueueSettingsImpl ::NAME; + +namespace ObjectStorageQueueSetting +{ +LIST_OF_OBJECT_STORAGE_QUEUE_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN + +ObjectStorageQueueSettings::ObjectStorageQueueSettings() : impl(std::make_unique()) +{ +} + +ObjectStorageQueueSettings::ObjectStorageQueueSettings(const ObjectStorageQueueSettings & settings) + : impl(std::make_unique(*settings.impl)) +{ +} + +ObjectStorageQueueSettings::ObjectStorageQueueSettings(ObjectStorageQueueSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + +ObjectStorageQueueSettings::~ObjectStorageQueueSettings() = default; + +OBJECT_STORAGE_QUEUE_SETTINGS_SUPPORTED_TYPES(ObjectStorageQueueSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) + + void ObjectStorageQueueSettings::loadFromQuery(ASTStorage & storage_def) { if (storage_def.settings) @@ -42,7 +106,7 @@ void ObjectStorageQueueSettings::loadFromQuery(ASTStorage &
storage_def) for (const auto & setting : ignore_settings) settings_changes.removeSetting(setting); - applyChanges(settings_changes); + impl->applyChanges(settings_changes); } catch (Exception & e) { diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.h b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.h index 550367b48a2..37c65dee0ca 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.h +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.h @@ -1,51 +1,59 @@ #pragma once -#include +#include #include #include +#include namespace DB { class ASTStorage; +struct ObjectStorageQueueSettingsImpl; +/// List of available types supported in ObjectStorageQueueSettings object +#define OBJECT_STORAGE_QUEUE_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, ArrowCompression) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, CapnProtoEnumComparingMode) \ + M(CLASS_NAME, Char) \ + M(CLASS_NAME, DateTimeInputFormat) \ + M(CLASS_NAME, DateTimeOutputFormat) \ + M(CLASS_NAME, DateTimeOverflowBehavior) \ + M(CLASS_NAME, Double) \ + M(CLASS_NAME, EscapingRule) \ + M(CLASS_NAME, Float) \ + M(CLASS_NAME, IdentifierQuotingRule) \ + M(CLASS_NAME, IdentifierQuotingStyle) \ + M(CLASS_NAME, Int64) \ + M(CLASS_NAME, IntervalOutputFormat) \ + M(CLASS_NAME, MsgPackUUIDRepresentation) \ + M(CLASS_NAME, ObjectStorageQueueAction) \ + M(CLASS_NAME, ObjectStorageQueueMode) \ + M(CLASS_NAME, ORCCompression) \ + M(CLASS_NAME, ParquetCompression) \ + M(CLASS_NAME, ParquetVersion) \ + M(CLASS_NAME, SchemaInferenceMode) \ + M(CLASS_NAME, String) \ + M(CLASS_NAME, UInt32) \ + M(CLASS_NAME, UInt64) \ + M(CLASS_NAME, UInt64Auto) \ + M(CLASS_NAME, URI) -#define OBJECT_STORAGE_QUEUE_RELATED_SETTINGS(M, ALIAS) \ - M(ObjectStorageQueueMode, \ - mode, \ - ObjectStorageQueueMode::ORDERED, \ - "With unordered mode, the set of all already processed files is tracked with persistent nodes in ZooKepeer." \ - "With ordered mode, only the max name of the successfully consumed file stored.", \ - 0) \ - M(ObjectStorageQueueAction, after_processing, ObjectStorageQueueAction::KEEP, "Delete or keep file in after successful processing", 0) \ - M(String, keeper_path, "", "Zookeeper node path", 0) \ - M(UInt32, loading_retries, 10, "Retry loading up to specified number of times", 0) \ - M(UInt32, processing_threads_num, 1, "Number of processing threads", 0) \ - M(UInt32, enable_logging_to_queue_log, 1, "Enable logging to system table system.(s3/azure_)queue_log", 0) \ - M(String, last_processed_path, "", "For Ordered mode. Files that have lexicographically smaller file name are considered already processed", 0) \ - M(UInt32, tracked_file_ttl_sec, 0, "Maximum number of seconds to store processed files in ZooKeeper node (store forever by default)", 0) \ - M(UInt32, polling_min_timeout_ms, 1000, "Minimal timeout before next polling", 0) \ - M(UInt32, polling_max_timeout_ms, 10000, "Maximum timeout before next polling", 0) \ - M(UInt32, polling_backoff_ms, 1000, "Polling backoff", 0) \ - M(UInt32, tracked_files_limit, 1000, "For unordered mode. Max set size for tracking processed files in ZooKeeper", 0) \ - M(UInt32, cleanup_interval_min_ms, 60000, "For unordered mode. Polling backoff min for cleanup", 0) \ - M(UInt32, cleanup_interval_max_ms, 60000, "For unordered mode. 
Polling backoff max for cleanup", 0) \ - M(UInt32, buckets, 0, "Number of buckets for Ordered mode parallel processing", 0) \ - M(UInt32, max_processed_files_before_commit, 100, "Number of files which can be processed before being committed to keeper", 0) \ - M(UInt32, max_processed_rows_before_commit, 0, "Number of rows which can be processed before being committed to keeper", 0) \ - M(UInt32, max_processed_bytes_before_commit, 0, "Number of bytes which can be processed before being committed to keeper", 0) \ - M(UInt32, max_processing_time_sec_before_commit, 0, "Timeout in seconds after which to commit files committed to keeper", 0) \ +OBJECT_STORAGE_QUEUE_SETTINGS_SUPPORTED_TYPES(ObjectStorageQueueSettings, DECLARE_SETTING_TRAIT) -#define LIST_OF_OBJECT_STORAGE_QUEUE_SETTINGS(M, ALIAS) \ - OBJECT_STORAGE_QUEUE_RELATED_SETTINGS(M, ALIAS) \ - LIST_OF_ALL_FORMAT_SETTINGS(M, ALIAS) - -DECLARE_SETTINGS_TRAITS(ObjectStorageQueueSettingsTraits, LIST_OF_OBJECT_STORAGE_QUEUE_SETTINGS) - - -struct ObjectStorageQueueSettings : public BaseSettings +struct ObjectStorageQueueSettings { - void loadFromQuery(ASTStorage & storage_def); -}; + ObjectStorageQueueSettings(); + ObjectStorageQueueSettings(const ObjectStorageQueueSettings & settings); + ObjectStorageQueueSettings(ObjectStorageQueueSettings && settings) noexcept; + ~ObjectStorageQueueSettings(); + OBJECT_STORAGE_QUEUE_SETTINGS_SUPPORTED_TYPES(ObjectStorageQueueSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + + void loadFromQuery(ASTStorage & storage_def); + +private: + std::unique_ptr impl; +}; } diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp index 5e143c1df1c..26966f9cbd2 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp @@ -23,6 +23,15 @@ namespace Setting extern const SettingsMaxThreads max_parsing_threads; } +namespace ObjectStorageQueueSetting +{ + extern const ObjectStorageQueueSettingsObjectStorageQueueAction after_processing; + extern const ObjectStorageQueueSettingsUInt32 max_processed_bytes_before_commit; + extern const ObjectStorageQueueSettingsUInt32 max_processed_files_before_commit; + extern const ObjectStorageQueueSettingsUInt32 max_processed_rows_before_commit; + extern const ObjectStorageQueueSettingsUInt32 max_processing_time_sec_before_commit; +} + namespace ErrorCodes { extern const int NOT_IMPLEMENTED; @@ -361,7 +370,7 @@ ObjectStorageQueueSource::ObjectStorageQueueSource( ObjectStoragePtr object_storage_, const ReadFromFormatInfo & read_from_format_info_, const std::optional & format_settings_, - const ObjectStorageQueueSettings & queue_settings_, + const CommitSettings & commit_settings_, std::shared_ptr files_metadata_, ContextPtr context_, size_t max_block_size_, @@ -380,7 +389,7 @@ ObjectStorageQueueSource::ObjectStorageQueueSource( , object_storage(object_storage_) , read_from_format_info(read_from_format_info_) , format_settings(format_settings_) - , queue_settings(queue_settings_) + , commit_settings(commit_settings_) , files_metadata(files_metadata_) , max_block_size(max_block_size_) , shutdown_called(shutdown_called_) @@ -565,8 +574,8 @@ Chunk ObjectStorageQueueSource::generateImpl() processed_rows_from_file = 0; processed_files.push_back(file_metadata); - if (queue_settings.max_processed_files_before_commit - && processed_files.size() == queue_settings.max_processed_files_before_commit) + if 
(commit_settings.max_processed_files_before_commit + && processed_files.size() == commit_settings.max_processed_files_before_commit) { LOG_TRACE(log, "Number of max processed files before commit reached " "(rows: {}, bytes: {}, files: {})", @@ -574,15 +583,15 @@ Chunk ObjectStorageQueueSource::generateImpl() break; } - if (queue_settings.max_processed_rows_before_commit - && total_processed_rows == queue_settings.max_processed_rows_before_commit) + if (commit_settings.max_processed_rows_before_commit + && total_processed_rows == commit_settings.max_processed_rows_before_commit) { LOG_TRACE(log, "Number of max processed rows before commit reached " "(rows: {}, bytes: {}, files: {})", total_processed_rows, total_processed_bytes, processed_files.size()); break; } - if (queue_settings.max_processed_bytes_before_commit && total_processed_bytes == queue_settings.max_processed_bytes_before_commit) + if (commit_settings.max_processed_bytes_before_commit && total_processed_bytes == commit_settings.max_processed_bytes_before_commit) { LOG_TRACE( log, @@ -593,8 +602,8 @@ Chunk ObjectStorageQueueSource::generateImpl() processed_files.size()); break; } - if (queue_settings.max_processing_time_sec_before_commit - && total_stopwatch.elapsedSeconds() >= queue_settings.max_processing_time_sec_before_commit) + if (commit_settings.max_processing_time_sec_before_commit + && total_stopwatch.elapsedSeconds() >= commit_settings.max_processing_time_sec_before_commit) { LOG_TRACE( log, @@ -648,15 +657,9 @@ void ObjectStorageQueueSource::commit(bool success, const std::string & exceptio void ObjectStorageQueueSource::applyActionAfterProcessing(const String & path) { - switch (queue_settings.after_processing.value) + if (files_metadata->getTableMetadata().after_processing == "delete") { - case ObjectStorageQueueAction::DELETE: - { - object_storage->removeObject(StoredObject(path)); - break; - } - case ObjectStorageQueueAction::KEEP: - break; + object_storage->removeObject(StoredObject(path)); } } diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.h b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.h index c085287e4f3..5d8bd5bda7c 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.h +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.h @@ -1,12 +1,13 @@ #pragma once #include "config.h" -#include +#include #include -#include #include #include -#include +#include +#include +#include namespace Poco { class Logger; } @@ -93,6 +94,14 @@ public: bool hasKeysForProcessor(const Processor & processor) const; }; + struct CommitSettings + { + size_t max_processed_files_before_commit; + size_t max_processed_rows_before_commit; + size_t max_processed_bytes_before_commit; + size_t max_processing_time_sec_before_commit; + }; + ObjectStorageQueueSource( String name_, size_t processor_id_, @@ -101,7 +110,7 @@ public: ObjectStoragePtr object_storage_, const ReadFromFormatInfo & read_from_format_info_, const std::optional & format_settings_, - const ObjectStorageQueueSettings & queue_settings_, + const CommitSettings & commit_settings_, std::shared_ptr files_metadata_, ContextPtr context_, size_t max_block_size_, @@ -130,7 +139,7 @@ private: const ObjectStoragePtr object_storage; ReadFromFormatInfo read_from_format_info; const std::optional format_settings; - const ObjectStorageQueueSettings queue_settings; + const CommitSettings commit_settings; const std::shared_ptr files_metadata; const size_t max_block_size; diff --git 
a/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp b/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp index c7e219038e7..e67379c393e 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp @@ -12,6 +12,19 @@ namespace DB { +namespace ObjectStorageQueueSetting +{ + extern const ObjectStorageQueueSettingsObjectStorageQueueAction after_processing; + extern const ObjectStorageQueueSettingsUInt32 buckets; + extern const ObjectStorageQueueSettingsString last_processed_path; + extern const ObjectStorageQueueSettingsUInt32 loading_retries; + extern const ObjectStorageQueueSettingsObjectStorageQueueMode mode; + extern const ObjectStorageQueueSettingsUInt32 processing_threads_num; + extern const ObjectStorageQueueSettingsUInt32 tracked_files_limit; + extern const ObjectStorageQueueSettingsUInt32 tracked_file_ttl_sec; + +} + namespace ErrorCodes { extern const int METADATA_MISMATCH; @@ -43,19 +56,19 @@ ObjectStorageQueueTableMetadata::ObjectStorageQueueTableMetadata( const std::string & format_) : format_name(format_) , columns(columns_.toString()) - , after_processing(engine_settings.after_processing.toString()) - , mode(engine_settings.mode.toString()) - , tracked_files_limit(engine_settings.tracked_files_limit) - , tracked_files_ttl_sec(engine_settings.tracked_file_ttl_sec) - , buckets(engine_settings.buckets) - , last_processed_path(engine_settings.last_processed_path) - , loading_retries(engine_settings.loading_retries) + , after_processing(engine_settings[ObjectStorageQueueSetting::after_processing].toString()) + , mode(engine_settings[ObjectStorageQueueSetting::mode].toString()) + , tracked_files_limit(engine_settings[ObjectStorageQueueSetting::tracked_files_limit]) + , tracked_files_ttl_sec(engine_settings[ObjectStorageQueueSetting::tracked_file_ttl_sec]) + , buckets(engine_settings[ObjectStorageQueueSetting::buckets]) + , last_processed_path(engine_settings[ObjectStorageQueueSetting::last_processed_path]) + , loading_retries(engine_settings[ObjectStorageQueueSetting::loading_retries]) { - processing_threads_num_changed = engine_settings.processing_threads_num.changed; - if (!processing_threads_num_changed && engine_settings.processing_threads_num <= 1) + processing_threads_num_changed = engine_settings[ObjectStorageQueueSetting::processing_threads_num].changed; + if (!processing_threads_num_changed && engine_settings[ObjectStorageQueueSetting::processing_threads_num] <= 1) processing_threads_num = std::max(getNumberOfCPUCoresToUse(), 16); else - processing_threads_num = engine_settings.processing_threads_num; + processing_threads_num = engine_settings[ObjectStorageQueueSetting::processing_threads_num]; } String ObjectStorageQueueTableMetadata::toString() const diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.h b/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.h index fc0b52c196b..433c6b26fe7 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.h +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.h @@ -1,15 +1,16 @@ #pragma once -#include -#include +#include #include +#include +#include #include #include -#include namespace DB { +struct ObjectStorageQueueSettings; class WriteBuffer; class ReadBuffer; diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp index c1794f494b4..5da5d56bf3b 
100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,23 @@ namespace Setting extern const SettingsBool use_concurrency_control; } +namespace ObjectStorageQueueSetting +{ + extern const ObjectStorageQueueSettingsUInt32 cleanup_interval_max_ms; + extern const ObjectStorageQueueSettingsUInt32 cleanup_interval_min_ms; + extern const ObjectStorageQueueSettingsUInt32 enable_logging_to_queue_log; + extern const ObjectStorageQueueSettingsString keeper_path; + extern const ObjectStorageQueueSettingsObjectStorageQueueMode mode; + extern const ObjectStorageQueueSettingsUInt32 max_processed_bytes_before_commit; + extern const ObjectStorageQueueSettingsUInt32 max_processed_files_before_commit; + extern const ObjectStorageQueueSettingsUInt32 max_processed_rows_before_commit; + extern const ObjectStorageQueueSettingsUInt32 max_processing_time_sec_before_commit; + extern const ObjectStorageQueueSettingsUInt32 polling_min_timeout_ms; + extern const ObjectStorageQueueSettingsUInt32 polling_max_timeout_ms; + extern const ObjectStorageQueueSettingsUInt32 polling_backoff_ms; + extern const ObjectStorageQueueSettingsUInt32 processing_threads_num; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -56,10 +74,10 @@ namespace zk_path_prefix = "/"; std::string result_zk_path; - if (queue_settings.keeper_path.changed) + if (queue_settings[ObjectStorageQueueSetting::keeper_path].changed) { /// We do not add table uuid here on purpose. - result_zk_path = fs::path(zk_path_prefix) / queue_settings.keeper_path.value; + result_zk_path = fs::path(zk_path_prefix) / queue_settings[ObjectStorageQueueSetting::keeper_path].value; } else { @@ -73,42 +91,42 @@ namespace ObjectStorageQueueSettings & queue_settings, bool is_attach) { - if (!is_attach && !queue_settings.mode.changed) + if (!is_attach && !queue_settings[ObjectStorageQueueSetting::mode].changed) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `mode` (Unordered/Ordered) is not specified, but is required."); } /// In case !is_attach, we leave Ordered mode as default for compatibility. 
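The `extern const ObjectStorageQueueSettingsUInt32 ...` declarations and `queue_settings[ObjectStorageQueueSetting::...]` accesses above follow the same scheme this diff applies to RefreshSettings, MergeTreeSettings and the other settings structs: the settings object hides a `...SettingsImpl` behind a `std::unique_ptr`, and the generated subscript operator resolves a pointer-to-member tag (`return impl.get()->*t;`, as seen in the `IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR` body removed from MergeTreeSettings.cpp). A minimal hand-rolled sketch of that mechanism, with made-up names and without the real DECLARE/IMPLEMENT macros:

```cpp
#include <cstdint>
#include <iostream>
#include <memory>

/// Miniature illustration of the pimpl + subscript-operator pattern; names are hypothetical.
struct QueueSettingsImpl
{
    uint64_t processing_threads_num = 1;
    uint64_t loading_retries = 10;
};

/// A "setting tag" is simply a pointer-to-member of the hidden impl struct.
using QueueSettingsUInt64 = uint64_t QueueSettingsImpl::*;

namespace QueueSetting
{
    /// Consumer .cpp files only see `extern const QueueSettingsUInt64 ...;` declarations.
    const QueueSettingsUInt64 processing_threads_num = &QueueSettingsImpl::processing_threads_num;
    const QueueSettingsUInt64 loading_retries = &QueueSettingsImpl::loading_retries;
}

struct QueueSettings
{
    QueueSettings() : impl(std::make_unique<QueueSettingsImpl>()) {}
    uint64_t & operator[](QueueSettingsUInt64 t) { return impl.get()->*t; }
    const uint64_t & operator[](QueueSettingsUInt64 t) const { return impl.get()->*t; }
private:
    std::unique_ptr<QueueSettingsImpl> impl;  /// the full definition lives only in the .cpp
};

int main()
{
    QueueSettings settings;
    settings[QueueSetting::processing_threads_num] = 8;
    std::cout << settings[QueueSetting::processing_threads_num] << "\n";  /// prints 8
}
```

The payoff is the same as in the real change: headers that consume the settings no longer need the heavy settings-list macros, only the forward-declared struct and the tag declarations.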
- if (!queue_settings.processing_threads_num) + if (!queue_settings[ObjectStorageQueueSetting::processing_threads_num]) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `processing_threads_num` cannot be set to zero"); } - if (queue_settings.cleanup_interval_min_ms > queue_settings.cleanup_interval_max_ms) + if (queue_settings[ObjectStorageQueueSetting::cleanup_interval_min_ms] > queue_settings[ObjectStorageQueueSetting::cleanup_interval_max_ms]) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `cleanup_interval_min_ms` ({}) must be less or equal to `cleanup_interval_max_ms` ({})", - queue_settings.cleanup_interval_min_ms, queue_settings.cleanup_interval_max_ms); + queue_settings[ObjectStorageQueueSetting::cleanup_interval_min_ms], queue_settings[ObjectStorageQueueSetting::cleanup_interval_max_ms]); } } std::shared_ptr getQueueLog( const ObjectStoragePtr & storage, const ContextPtr & context, - const ObjectStorageQueueSettings & table_settings) + bool enable_logging_to_queue_log) { const auto & settings = context->getSettingsRef(); switch (storage->getType()) { case DB::ObjectStorageType::S3: { - if (table_settings.enable_logging_to_queue_log || settings[Setting::s3queue_enable_logging_to_s3queue_log]) + if (enable_logging_to_queue_log || settings[Setting::s3queue_enable_logging_to_s3queue_log]) return context->getS3QueueLog(); return nullptr; } case DB::ObjectStorageType::Azure: { - if (table_settings.enable_logging_to_queue_log) + if (enable_logging_to_queue_log) return context->getAzureQueueLog(); return nullptr; } @@ -131,11 +149,20 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( LoadingStrictnessLevel mode) : IStorage(table_id_) , WithContext(context_) - , queue_settings(std::move(queue_settings_)) - , zk_path(chooseZooKeeperPath(table_id_, context_->getSettingsRef(), *queue_settings)) + , zk_path(chooseZooKeeperPath(table_id_, context_->getSettingsRef(), *queue_settings_)) + , enable_logging_to_queue_log((*queue_settings_)[ObjectStorageQueueSetting::enable_logging_to_queue_log]) + , polling_min_timeout_ms((*queue_settings_)[ObjectStorageQueueSetting::polling_min_timeout_ms]) + , polling_max_timeout_ms((*queue_settings_)[ObjectStorageQueueSetting::polling_max_timeout_ms]) + , polling_backoff_ms((*queue_settings_)[ObjectStorageQueueSetting::polling_backoff_ms]) + , commit_settings(CommitSettings{ + .max_processed_files_before_commit = (*queue_settings_)[ObjectStorageQueueSetting::max_processed_files_before_commit], + .max_processed_rows_before_commit = (*queue_settings_)[ObjectStorageQueueSetting::max_processed_rows_before_commit], + .max_processed_bytes_before_commit = (*queue_settings_)[ObjectStorageQueueSetting::max_processed_bytes_before_commit], + .max_processing_time_sec_before_commit = (*queue_settings_)[ObjectStorageQueueSetting::max_processing_time_sec_before_commit], + }) , configuration{configuration_} , format_settings(format_settings_) - , reschedule_processing_interval_ms(queue_settings->polling_min_timeout_ms) + , reschedule_processing_interval_ms((*queue_settings_)[ObjectStorageQueueSetting::polling_min_timeout_ms]) , log(getLogger(fmt::format("Storage{}Queue ({})", configuration->getEngineName(), table_id_.getFullTableName()))) { if (configuration->getPath().empty()) @@ -152,7 +179,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( } const bool is_attach = mode > LoadingStrictnessLevel::CREATE; - validateSettings(*queue_settings, is_attach); + validateSettings(*queue_settings_, is_attach); object_storage = 
configuration->createObjectStorage(context_, /* is_readonly */true); FormatFactory::instance().checkFormatName(configuration->format); @@ -173,10 +200,10 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( LOG_INFO(log, "Using zookeeper path: {}", zk_path.string()); auto table_metadata = ObjectStorageQueueMetadata::syncWithKeeper( - zk_path, *queue_settings, storage_metadata.getColumns(), configuration_->format, context_, is_attach, log); + zk_path, *queue_settings_, storage_metadata.getColumns(), configuration_->format, context_, is_attach, log); auto queue_metadata = std::make_unique( - zk_path, std::move(table_metadata), queue_settings->cleanup_interval_min_ms, queue_settings->cleanup_interval_max_ms); + zk_path, std::move(table_metadata), (*queue_settings_)[ObjectStorageQueueSetting::cleanup_interval_min_ms], (*queue_settings_)[ObjectStorageQueueSetting::cleanup_interval_max_ms]); files_metadata = ObjectStorageQueueMetadataFactory::instance().getOrCreate(zk_path, std::move(queue_metadata)); @@ -317,7 +344,7 @@ void StorageObjectStorageQueue::read( void ReadFromObjectStorageQueue::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { Pipes pipes; - const size_t adjusted_num_streams = storage->queue_settings->processing_threads_num; + const size_t adjusted_num_streams = storage->getTableMetadata().processing_threads_num; createIterator(nullptr); for (size_t i = 0; i < adjusted_num_streams; ++i) @@ -351,9 +378,10 @@ std::shared_ptr StorageObjectStorageQueue::createSourc getName(), processor_id, file_iterator, configuration, object_storage, info, format_settings, - *queue_settings, files_metadata, + commit_settings, + files_metadata, local_context, max_block_size, shutdown_called, table_is_being_dropped, - getQueueLog(object_storage, local_context, *queue_settings), + getQueueLog(object_storage, local_context, enable_logging_to_queue_log), getStorageID(), log, commit_once_processed); } @@ -400,12 +428,12 @@ void StorageObjectStorageQueue::threadFunc() if (streamToViews()) { /// Reset the reschedule interval. - reschedule_processing_interval_ms = queue_settings->polling_min_timeout_ms; + reschedule_processing_interval_ms = polling_min_timeout_ms; } else { /// Increase the reschedule interval. 
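The hunk below replaces the previously unbounded `+= polling_backoff_ms` with a value capped at `polling_max_timeout_ms`. A short standalone illustration of the resulting reschedule intervals, using the setting defaults from the list above (`polling_min_timeout_ms` = 1000, `polling_max_timeout_ms` = 10000, `polling_backoff_ms` = 1000); the loop itself is illustrative only, not ClickHouse code:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

int main()
{
    const uint64_t polling_min_timeout_ms = 1000;
    const uint64_t polling_max_timeout_ms = 10000;
    const uint64_t polling_backoff_ms = 1000;

    uint64_t reschedule_ms = polling_min_timeout_ms;
    for (int idle_rounds = 0; idle_rounds < 12; ++idle_rounds)
    {
        /// No data streamed: back off, but never beyond polling_max_timeout_ms.
        reschedule_ms = std::min(polling_max_timeout_ms, reschedule_ms + polling_backoff_ms);
        std::cout << reschedule_ms << " ";  // 2000 3000 ... 10000 10000 10000
    }
    std::cout << "\n";
}
```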
- reschedule_processing_interval_ms += queue_settings->polling_backoff_ms; + reschedule_processing_interval_ms = std::min(polling_max_timeout_ms, reschedule_processing_interval_ms + polling_backoff_ms); } LOG_DEBUG(log, "Stopped streaming to {} attached views", dependencies_count); @@ -446,6 +474,9 @@ bool StorageObjectStorageQueue::streamToViews() auto file_iterator = createFileIterator(queue_context, nullptr); size_t total_rows = 0; + const size_t processing_threads_num = getTableMetadata().processing_threads_num; + + LOG_TEST(log, "Using {} processing threads", processing_threads_num); while (!shutdown_called && !file_iterator->isFinished()) { @@ -466,10 +497,10 @@ bool StorageObjectStorageQueue::streamToViews() Pipes pipes; std::vector> sources; - pipes.reserve(queue_settings->processing_threads_num); - sources.reserve(queue_settings->processing_threads_num); + pipes.reserve(processing_threads_num); + sources.reserve(processing_threads_num); - for (size_t i = 0; i < queue_settings->processing_threads_num; ++i) + for (size_t i = 0; i < processing_threads_num; ++i) { auto source = createSource( i/* processor_id */, @@ -485,7 +516,7 @@ bool StorageObjectStorageQueue::streamToViews() auto pipe = Pipe::unitePipes(std::move(pipes)); block_io.pipeline.complete(std::move(pipe)); - block_io.pipeline.setNumThreads(queue_settings->processing_threads_num); + block_io.pipeline.setNumThreads(processing_threads_num); block_io.pipeline.setConcurrencyControl(queue_context->getSettingsRef()[Setting::use_concurrency_control]); std::atomic_size_t rows = 0; diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h index fc459c45f74..68186074007 100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -15,6 +14,7 @@ namespace DB { class ObjectStorageQueueMetadata; +struct ObjectStorageQueueSettings; class StorageObjectStorageQueue : public IStorage, WithContext { @@ -54,9 +54,14 @@ public: private: friend class ReadFromObjectStorageQueue; using FileIterator = ObjectStorageQueueSource::FileIterator; + using CommitSettings = ObjectStorageQueueSource::CommitSettings; - const std::unique_ptr queue_settings; const fs::path zk_path; + const bool enable_logging_to_queue_log; + const size_t polling_min_timeout_ms; + const size_t polling_max_timeout_ms; + const size_t polling_backoff_ms; + const CommitSettings commit_settings; std::shared_ptr files_metadata; ConfigurationPtr configuration; @@ -81,6 +86,7 @@ private: bool supportsSubcolumns() const override { return true; } bool supportsOptimizationToSubcolumns() const override { return false; } bool supportsDynamicSubcolumns() const override { return true; } + const ObjectStorageQueueTableMetadata & getTableMetadata() const { return files_metadata->getTableMetadata(); } std::shared_ptr createFileIterator(ContextPtr local_context, const ActionsDAG::Node * predicate); std::shared_ptr createSource( diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.cpp b/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.cpp index ef9ca78d984..3e067a9320e 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.cpp +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.cpp @@ -1,6 +1,9 @@ -#include "MaterializedPostgreSQLSettings.h" +#include #if USE_LIBPQXX + +#include +#include #include #include #include @@ 
-15,15 +18,72 @@ namespace ErrorCodes extern const int UNKNOWN_SETTING; } +#define LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS(M, ALIAS) \ + M(UInt64, materialized_postgresql_max_block_size, 65536, "Number of row collected before flushing data into table.", 0) \ + M(String, materialized_postgresql_tables_list, "", "List of tables for MaterializedPostgreSQL database engine", 0) \ + M(String, materialized_postgresql_schema_list, "", "List of schemas for MaterializedPostgreSQL database engine", 0) \ + M(String, materialized_postgresql_replication_slot, "", "A user-created replication slot", 0) \ + M(String, materialized_postgresql_snapshot, "", "User provided snapshot in case he manages replication slots himself", 0) \ + M(String, materialized_postgresql_schema, "", "PostgreSQL schema", 0) \ + M(Bool, materialized_postgresql_tables_list_with_schema, false, \ + "Consider by default that if there is a dot in tables list 'name.name', " \ + "then the first name is postgres schema and second is postgres table. This setting is needed to allow table names with dots", 0) \ + M(UInt64, materialized_postgresql_backoff_min_ms, 200, "Poll backoff start point", 0) \ + M(UInt64, materialized_postgresql_backoff_max_ms, 10000, "Poll backoff max point", 0) \ + M(UInt64, materialized_postgresql_backoff_factor, 2, "Poll backoff factor", 0) \ + M(Bool, materialized_postgresql_use_unique_replication_consumer_identifier, false, "Should a unique consumer be registered for table replication", 0) \ + +DECLARE_SETTINGS_TRAITS(MaterializedPostgreSQLSettingsTraits, LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(MaterializedPostgreSQLSettingsTraits, LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS) +struct MaterializedPostgreSQLSettingsImpl : public BaseSettings +{ +}; + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) MaterializedPostgreSQLSettings##TYPE NAME = &MaterializedPostgreSQLSettingsImpl ::NAME; + +namespace MaterializedPostgreSQLSetting +{ +LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN + +MaterializedPostgreSQLSettings::MaterializedPostgreSQLSettings() : impl(std::make_unique()) +{ +} + +MaterializedPostgreSQLSettings::MaterializedPostgreSQLSettings(const MaterializedPostgreSQLSettings & settings) + : impl(std::make_unique(*settings.impl)) +{ +} + +MaterializedPostgreSQLSettings::MaterializedPostgreSQLSettings(MaterializedPostgreSQLSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + +MaterializedPostgreSQLSettings::~MaterializedPostgreSQLSettings() = default; + +MATERIALIZED_POSTGRESQL_SETTINGS_SUPPORTED_TYPES(MaterializedPostgreSQLSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) + +void MaterializedPostgreSQLSettings::applyChange(const SettingChange & change) +{ + impl->applyChange(change); +} + +bool MaterializedPostgreSQLSettings::has(std::string_view name) const +{ + return impl->has(name); +} + void MaterializedPostgreSQLSettings::loadFromQuery(ASTStorage & storage_def) { if (storage_def.settings) { try { - applyChanges(storage_def.settings->changes); + impl->applyChanges(storage_def.settings->changes); } catch (Exception & e) { diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h b/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h index 18473e1bb26..2f07cac8b3e 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h @@ -3,34 +3,40 @@ #include "config.h" #if 
USE_LIBPQXX -#include + +#include +#include namespace DB { - class ASTStorage; +class ASTStorage; +struct SettingChange; +struct MaterializedPostgreSQLSettingsImpl; + +#define MATERIALIZED_POSTGRESQL_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, String) \ + M(CLASS_NAME, UInt64) + +MATERIALIZED_POSTGRESQL_SETTINGS_SUPPORTED_TYPES(MaterializedPostgreSQLSettings, DECLARE_SETTING_TRAIT) -#define LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS(M, ALIAS) \ - M(UInt64, materialized_postgresql_max_block_size, 65536, "Number of row collected before flushing data into table.", 0) \ - M(String, materialized_postgresql_tables_list, "", "List of tables for MaterializedPostgreSQL database engine", 0) \ - M(String, materialized_postgresql_schema_list, "", "List of schemas for MaterializedPostgreSQL database engine", 0) \ - M(String, materialized_postgresql_replication_slot, "", "A user-created replication slot", 0) \ - M(String, materialized_postgresql_snapshot, "", "User provided snapshot in case he manages replication slots himself", 0) \ - M(String, materialized_postgresql_schema, "", "PostgreSQL schema", 0) \ - M(Bool, materialized_postgresql_tables_list_with_schema, false, \ - "Consider by default that if there is a dot in tables list 'name.name', " \ - "then the first name is postgres schema and second is postgres table. This setting is needed to allow table names with dots", 0) \ - M(UInt64, materialized_postgresql_backoff_min_ms, 200, "Poll backoff start point", 0) \ - M(UInt64, materialized_postgresql_backoff_max_ms, 10000, "Poll backoff max point", 0) \ - M(UInt64, materialized_postgresql_backoff_factor, 2, "Poll backoff factor", 0) \ - M(Bool, materialized_postgresql_use_unique_replication_consumer_identifier, false, "Should a unique consumer be registered for table replication", 0) \ - -DECLARE_SETTINGS_TRAITS(MaterializedPostgreSQLSettingsTraits, LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS) - -struct MaterializedPostgreSQLSettings : public BaseSettings +struct MaterializedPostgreSQLSettings { + MaterializedPostgreSQLSettings(); + MaterializedPostgreSQLSettings(const MaterializedPostgreSQLSettings & settings); + MaterializedPostgreSQLSettings(MaterializedPostgreSQLSettings && settings) noexcept; + ~MaterializedPostgreSQLSettings(); + + MATERIALIZED_POSTGRESQL_SETTINGS_SUPPORTED_TYPES(MaterializedPostgreSQLSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + + void applyChange(const SettingChange & change); + bool has(std::string_view name) const; void loadFromQuery(ASTStorage & storage_def); + +private: + std::unique_ptr impl; }; } diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp index 81b8663512d..74b5203fa8e 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp @@ -1,4 +1,3 @@ -#include "PostgreSQLReplicationHandler.h" #include @@ -9,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -26,6 +27,21 @@ namespace DB static const auto CLEANUP_RESCHEDULE_MS = 600000 * 3; /// 30 min static constexpr size_t replication_slot_name_max_size = 64; +namespace MaterializedPostgreSQLSetting +{ + extern const MaterializedPostgreSQLSettingsUInt64 materialized_postgresql_backoff_factor; + extern const MaterializedPostgreSQLSettingsUInt64 materialized_postgresql_backoff_max_ms; + extern const MaterializedPostgreSQLSettingsUInt64 materialized_postgresql_backoff_min_ms; + extern const 
MaterializedPostgreSQLSettingsUInt64 materialized_postgresql_max_block_size; + extern const MaterializedPostgreSQLSettingsString materialized_postgresql_replication_slot; + extern const MaterializedPostgreSQLSettingsString materialized_postgresql_schema; + extern const MaterializedPostgreSQLSettingsString materialized_postgresql_schema_list; + extern const MaterializedPostgreSQLSettingsString materialized_postgresql_snapshot; + extern const MaterializedPostgreSQLSettingsString materialized_postgresql_tables_list; + extern const MaterializedPostgreSQLSettingsBool materialized_postgresql_tables_list_with_schema; + extern const MaterializedPostgreSQLSettingsBool materialized_postgresql_use_unique_replication_consumer_identifier; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -103,10 +119,10 @@ namespace const String & clickhouse_uuid, const MaterializedPostgreSQLSettings & replication_settings) { - String slot_name = replication_settings.materialized_postgresql_replication_slot; + String slot_name = replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_replication_slot]; if (slot_name.empty()) { - if (replication_settings.materialized_postgresql_use_unique_replication_consumer_identifier) + if (replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_use_unique_replication_consumer_identifier]) slot_name = clickhouse_uuid; else slot_name = postgres_table.empty() ? postgres_database : fmt::format("{}_{}_ch_replication_slot", postgres_database, postgres_table); @@ -131,22 +147,22 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( , log(getLogger("PostgreSQLReplicationHandler")) , is_attach(is_attach_) , postgres_database(postgres_database_) - , postgres_schema(replication_settings.materialized_postgresql_schema) + , postgres_schema(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_schema]) , current_database_name(clickhouse_database_) , connection_info(connection_info_) - , max_block_size(replication_settings.materialized_postgresql_max_block_size) + , max_block_size(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_max_block_size]) , is_materialized_postgresql_database(is_materialized_postgresql_database_) - , tables_list(replication_settings.materialized_postgresql_tables_list) - , schema_list(replication_settings.materialized_postgresql_schema_list) - , schema_as_a_part_of_table_name(!schema_list.empty() || replication_settings.materialized_postgresql_tables_list_with_schema) - , user_managed_slot(!replication_settings.materialized_postgresql_replication_slot.value.empty()) - , user_provided_snapshot(replication_settings.materialized_postgresql_snapshot) + , tables_list(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_tables_list]) + , schema_list(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_schema_list]) + , schema_as_a_part_of_table_name(!schema_list.empty() || replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_tables_list_with_schema]) + , user_managed_slot(!replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_replication_slot].value.empty()) + , user_provided_snapshot(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_snapshot]) , replication_slot(getReplicationSlotName(postgres_database_, postgres_table_, clickhouse_uuid_, replication_settings)) , tmp_replication_slot(replication_slot + "_tmp") , 
publication_name(getPublicationName(postgres_database_, postgres_table_)) - , reschedule_backoff_min_ms(replication_settings.materialized_postgresql_backoff_min_ms) - , reschedule_backoff_max_ms(replication_settings.materialized_postgresql_backoff_max_ms) - , reschedule_backoff_factor(replication_settings.materialized_postgresql_backoff_factor) + , reschedule_backoff_min_ms(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_backoff_min_ms]) + , reschedule_backoff_max_ms(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_backoff_max_ms]) + , reschedule_backoff_factor(replication_settings[MaterializedPostgreSQLSetting::materialized_postgresql_backoff_factor]) , milliseconds_to_wait(reschedule_backoff_min_ms) { if (!schema_list.empty() && !tables_list.empty()) diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h index 8257f92ae1f..eae715abcb5 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h @@ -1,7 +1,6 @@ #pragma once #include "MaterializedPostgreSQLConsumer.h" -#include "MaterializedPostgreSQLSettings.h" #include #include #include @@ -10,6 +9,7 @@ namespace DB { +struct MaterializedPostgreSQLSettings; class StorageMaterializedPostgreSQL; struct SettingChange; diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index b798c2475a7..28e44f3a276 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -1,4 +1,5 @@ -#include "StorageMaterializedPostgreSQL.h" +#include +#include #if USE_LIBPQXX #include @@ -46,6 +47,11 @@ namespace Setting extern const SettingsUInt64 postgresql_connection_attempt_timeout; } +namespace MaterializedPostgreSQLSetting +{ + extern const MaterializedPostgreSQLSettingsString materialized_postgresql_tables_list; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -79,7 +85,7 @@ StorageMaterializedPostgreSQL::StorageMaterializedPostgreSQL( setInMemoryMetadata(storage_metadata); setVirtuals(createVirtuals()); - replication_settings->materialized_postgresql_tables_list = remote_table_name_; + (*replication_settings)[MaterializedPostgreSQLSetting::materialized_postgresql_tables_list] = remote_table_name_; replication_handler = std::make_unique( remote_database_name, diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h index 8774a8963af..40131f676ed 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h @@ -4,7 +4,6 @@ #if USE_LIBPQXX #include "PostgreSQLReplicationHandler.h" -#include "MaterializedPostgreSQLSettings.h" #include #include @@ -17,6 +16,7 @@ namespace DB { +struct MaterializedPostgreSQLSettings; /** TODO list: * - Actually I think we can support ddl even though logical replication does not fully support it. 
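For readability, the slot-name precedence implemented by `getReplicationSlotName()` above (an explicit `materialized_postgresql_replication_slot` wins; otherwise the ClickHouse UUID is used when `materialized_postgresql_use_unique_replication_consumer_identifier` is set; otherwise a name is derived from the PostgreSQL database and table) can be restated as a small standalone function. This is an illustrative re-statement under those assumptions, not the actual ClickHouse code:

```cpp
#include <iostream>
#include <string>

/// Illustrative re-statement of the precedence in getReplicationSlotName(); not ClickHouse code.
std::string chooseSlotName(
    const std::string & user_slot,        /// materialized_postgresql_replication_slot
    bool use_unique_identifier,           /// materialized_postgresql_use_unique_replication_consumer_identifier
    const std::string & clickhouse_uuid,
    const std::string & pg_database,
    const std::string & pg_table)
{
    if (!user_slot.empty())
        return user_slot;                 /// a user-managed slot always wins
    if (use_unique_identifier)
        return clickhouse_uuid;           /// one slot per ClickHouse table UUID
    /// Database-level replication has no single table name; table-level replication gets a suffix.
    return pg_table.empty() ? pg_database : pg_database + "_" + pg_table + "_ch_replication_slot";
}

int main()
{
    std::cout << chooseSlotName("", false, "uuid-0000", "pgdb", "orders") << "\n";
    /// prints: pgdb_orders_ch_replication_slot
}
```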
diff --git a/src/Storages/RabbitMQ/RabbitMQSettings.cpp b/src/Storages/RabbitMQ/RabbitMQSettings.cpp index 93495cdd8ae..f53e6c1feb1 100644 --- a/src/Storages/RabbitMQ/RabbitMQSettings.cpp +++ b/src/Storages/RabbitMQ/RabbitMQSettings.cpp @@ -1,8 +1,12 @@ -#include +#include +#include +#include #include -#include #include +#include +#include #include +#include namespace DB { @@ -12,15 +16,84 @@ namespace ErrorCodes extern const int UNKNOWN_SETTING; } +#define RABBITMQ_RELATED_SETTINGS(M, ALIAS) \ + M(String, rabbitmq_host_port, "", "A host-port to connect to RabbitMQ server.", 0) \ + M(String, rabbitmq_exchange_name, "clickhouse-exchange", "The exchange name, to which messages are sent.", 0) \ + M(String, rabbitmq_format, "", "The message format.", 0) \ + M(String, rabbitmq_exchange_type, "default", "The exchange type.", 0) \ + M(String, rabbitmq_routing_key_list, "5672", "A string of routing keys, separated by dots.", 0) \ + M(String, rabbitmq_schema, "", "Schema identifier (used by schema-based formats) for RabbitMQ engine", 0) \ + M(UInt64, rabbitmq_num_consumers, 1, "The number of consumer channels per table.", 0) \ + M(UInt64, rabbitmq_num_queues, 1, "The number of queues per consumer.", 0) \ + M(String, rabbitmq_queue_base, "", "Base for queue names to be able to reopen non-empty queues in case of failure.", 0) \ + M(Bool, rabbitmq_persistent, false, "For insert query messages will be made 'persistent', durable.", 0) \ + M(Bool, rabbitmq_secure, false, "Use SSL connection", 0) \ + M(String, rabbitmq_address, "", "Address for connection", 0) \ + M(UInt64, rabbitmq_skip_broken_messages, 0, "Skip at least this number of broken messages from RabbitMQ per block", 0) \ + M(UInt64, rabbitmq_max_block_size, 0, "Number of row collected before flushing data from RabbitMQ.", 0) \ + M(UInt64, rabbitmq_flush_interval_ms, 0, "Timeout for flushing data from RabbitMQ.", 0) \ + M(String, rabbitmq_vhost, "/", "RabbitMQ vhost.", 0) \ + M(String, rabbitmq_queue_settings_list, "", "A list of rabbitmq queue settings", 0) \ + M(UInt64, rabbitmq_empty_queue_backoff_start_ms, 10, "A minimum backoff point to reschedule read if the rabbitmq queue is empty", 0) \ + M(UInt64, rabbitmq_empty_queue_backoff_end_ms, 10000, "A maximum backoff point to reschedule read if the rabbitmq queue is empty", 0) \ + M(UInt64, rabbitmq_empty_queue_backoff_step_ms, 100, "A backoff step to reschedule read if the rabbitmq queue is empty", 0) \ + M(Bool, rabbitmq_queue_consume, false, "Use user-defined queues and do not make any RabbitMQ setup: declaring exchanges, queues, bindings", 0) \ + M(String, rabbitmq_username, "", "RabbitMQ username", 0) \ + M(String, rabbitmq_password, "", "RabbitMQ password", 0) \ + M(Bool, reject_unhandled_messages, false, "Allow messages to be rejected in case they cannot be processed. This also automatically implies if there is a x-deadletter-exchange queue setting added", 0) \ + M(Bool, rabbitmq_commit_on_select, false, "Commit messages when select query is made", 0) \ + M(UInt64, rabbitmq_max_rows_per_message, 1, "The maximum number of rows produced in one message for row-based formats.", 0) \ + M(StreamingHandleErrorMode, rabbitmq_handle_error_mode, StreamingHandleErrorMode::DEFAULT, "How to handle errors for RabbitMQ engine. 
Possible values: default (throw an exception after rabbitmq_skip_broken_messages broken messages), stream (save broken messages and errors in virtual columns _raw_message, _error).", 0) \ + +#define OBSOLETE_RABBITMQ_SETTINGS(M, ALIAS) \ + MAKE_OBSOLETE(M, Char, rabbitmq_row_delimiter, '\0') \ + +#define LIST_OF_RABBITMQ_SETTINGS(M, ALIAS) \ + RABBITMQ_RELATED_SETTINGS(M, ALIAS) \ + OBSOLETE_RABBITMQ_SETTINGS(M, ALIAS) \ + LIST_OF_ALL_FORMAT_SETTINGS(M, ALIAS) \ + +DECLARE_SETTINGS_TRAITS(RabbitMQSettingsTraits, LIST_OF_RABBITMQ_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(RabbitMQSettingsTraits, LIST_OF_RABBITMQ_SETTINGS) +struct RabbitMQSettingsImpl : public BaseSettings +{ +}; + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) RabbitMQSettings##TYPE NAME = &RabbitMQSettingsImpl ::NAME; + +namespace RabbitMQSetting +{ +LIST_OF_RABBITMQ_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN + +RabbitMQSettings::RabbitMQSettings() : impl(std::make_unique()) +{ +} + +RabbitMQSettings::RabbitMQSettings(const RabbitMQSettings & settings) : impl(std::make_unique(*settings.impl)) +{ +} + +RabbitMQSettings::RabbitMQSettings(RabbitMQSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + +RabbitMQSettings::~RabbitMQSettings() = default; + +RABBITMQ_SETTINGS_SUPPORTED_TYPES(RabbitMQSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) + + void RabbitMQSettings::loadFromQuery(ASTStorage & storage_def) { if (storage_def.settings) { try { - applyChanges(storage_def.settings->changes); + impl->applyChanges(storage_def.settings->changes); } catch (Exception & e) { @@ -36,4 +109,30 @@ void RabbitMQSettings::loadFromQuery(ASTStorage & storage_def) storage_def.set(storage_def.settings, settings_ast); } } + +void RabbitMQSettings::loadFromNamedCollection(const MutableNamedCollectionPtr & named_collection) +{ + for (const auto & setting : impl->all()) + { + const auto & setting_name = setting.getName(); + if (named_collection->has(setting_name)) + impl->set(setting_name, named_collection->get(setting_name)); + } +} + +SettingsChanges RabbitMQSettings::getFormatSettings() const +{ + SettingsChanges values; + + for (const auto & setting : *impl) + { + const auto & setting_name = setting.getName(); + + /// check for non-rabbitmq-related settings + if (!setting_name.starts_with("rabbitmq_")) + values.emplace_back(setting_name, setting.getValue()); + } + + return values; +} } diff --git a/src/Storages/RabbitMQ/RabbitMQSettings.h b/src/Storages/RabbitMQ/RabbitMQSettings.h index 8c13f3dbef1..6bdadb5de76 100644 --- a/src/Storages/RabbitMQ/RabbitMQSettings.h +++ b/src/Storages/RabbitMQ/RabbitMQSettings.h @@ -1,56 +1,60 @@ #pragma once -#include -#include +#include #include +#include +#include +#include namespace DB { - class ASTStorage; +class ASTStorage; +struct RabbitMQSettingsImpl; +/// List of available types supported in RabbitMQSettings object +#define RABBITMQ_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, ArrowCompression) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, CapnProtoEnumComparingMode) \ + M(CLASS_NAME, Char) \ + M(CLASS_NAME, DateTimeInputFormat) \ + M(CLASS_NAME, DateTimeOutputFormat) \ + M(CLASS_NAME, DateTimeOverflowBehavior) \ + M(CLASS_NAME, Double) \ + M(CLASS_NAME, EscapingRule) \ + M(CLASS_NAME, Float) \ + M(CLASS_NAME, IdentifierQuotingRule) \ + M(CLASS_NAME, IdentifierQuotingStyle) \ + M(CLASS_NAME, Int64) \ + M(CLASS_NAME, IntervalOutputFormat) \ + M(CLASS_NAME, MsgPackUUIDRepresentation) \ + 
M(CLASS_NAME, ORCCompression) \ + M(CLASS_NAME, ParquetCompression) \ + M(CLASS_NAME, ParquetVersion) \ + M(CLASS_NAME, SchemaInferenceMode) \ + M(CLASS_NAME, StreamingHandleErrorMode) \ + M(CLASS_NAME, String) \ + M(CLASS_NAME, UInt64) \ + M(CLASS_NAME, UInt64Auto) \ + M(CLASS_NAME, URI) -#define RABBITMQ_RELATED_SETTINGS(M, ALIAS) \ - M(String, rabbitmq_host_port, "", "A host-port to connect to RabbitMQ server.", 0) \ - M(String, rabbitmq_exchange_name, "clickhouse-exchange", "The exchange name, to which messages are sent.", 0) \ - M(String, rabbitmq_format, "", "The message format.", 0) \ - M(String, rabbitmq_exchange_type, "default", "The exchange type.", 0) \ - M(String, rabbitmq_routing_key_list, "5672", "A string of routing keys, separated by dots.", 0) \ - M(String, rabbitmq_schema, "", "Schema identifier (used by schema-based formats) for RabbitMQ engine", 0) \ - M(UInt64, rabbitmq_num_consumers, 1, "The number of consumer channels per table.", 0) \ - M(UInt64, rabbitmq_num_queues, 1, "The number of queues per consumer.", 0) \ - M(String, rabbitmq_queue_base, "", "Base for queue names to be able to reopen non-empty queues in case of failure.", 0) \ - M(Bool, rabbitmq_persistent, false, "For insert query messages will be made 'persistent', durable.", 0) \ - M(Bool, rabbitmq_secure, false, "Use SSL connection", 0) \ - M(String, rabbitmq_address, "", "Address for connection", 0) \ - M(UInt64, rabbitmq_skip_broken_messages, 0, "Skip at least this number of broken messages from RabbitMQ per block", 0) \ - M(UInt64, rabbitmq_max_block_size, 0, "Number of row collected before flushing data from RabbitMQ.", 0) \ - M(UInt64, rabbitmq_flush_interval_ms, 0, "Timeout for flushing data from RabbitMQ.", 0) \ - M(String, rabbitmq_vhost, "/", "RabbitMQ vhost.", 0) \ - M(String, rabbitmq_queue_settings_list, "", "A list of rabbitmq queue settings", 0) \ - M(UInt64, rabbitmq_empty_queue_backoff_start_ms, 10, "A minimum backoff point to reschedule read if the rabbitmq queue is empty", 0) \ - M(UInt64, rabbitmq_empty_queue_backoff_end_ms, 10000, "A maximum backoff point to reschedule read if the rabbitmq queue is empty", 0) \ - M(UInt64, rabbitmq_empty_queue_backoff_step_ms, 100, "A backoff step to reschedule read if the rabbitmq queue is empty", 0) \ - M(Bool, rabbitmq_queue_consume, false, "Use user-defined queues and do not make any RabbitMQ setup: declaring exchanges, queues, bindings", 0) \ - M(String, rabbitmq_username, "", "RabbitMQ username", 0) \ - M(String, rabbitmq_password, "", "RabbitMQ password", 0) \ - M(Bool, reject_unhandled_messages, false, "Allow messages to be rejected in case they cannot be processed. This also automatically implies if there is a x-deadletter-exchange queue setting added", 0) \ - M(Bool, rabbitmq_commit_on_select, false, "Commit messages when select query is made", 0) \ - M(UInt64, rabbitmq_max_rows_per_message, 1, "The maximum number of rows produced in one message for row-based formats.", 0) \ - M(StreamingHandleErrorMode, rabbitmq_handle_error_mode, StreamingHandleErrorMode::DEFAULT, "How to handle errors for RabbitMQ engine. 
Possible values: default (throw an exception after rabbitmq_skip_broken_messages broken messages), stream (save broken messages and errors in virtual columns _raw_message, _error).", 0) \ +RABBITMQ_SETTINGS_SUPPORTED_TYPES(RabbitMQSettings, DECLARE_SETTING_TRAIT) -#define OBSOLETE_RABBITMQ_SETTINGS(M, ALIAS) \ - MAKE_OBSOLETE(M, Char, rabbitmq_row_delimiter, '\0') \ - - -#define LIST_OF_RABBITMQ_SETTINGS(M, ALIAS) \ - RABBITMQ_RELATED_SETTINGS(M, ALIAS) \ - OBSOLETE_RABBITMQ_SETTINGS(M, ALIAS) \ - LIST_OF_ALL_FORMAT_SETTINGS(M, ALIAS) \ - -DECLARE_SETTINGS_TRAITS(RabbitMQSettingsTraits, LIST_OF_RABBITMQ_SETTINGS) - -struct RabbitMQSettings : public BaseSettings +struct RabbitMQSettings { + RabbitMQSettings(); + RabbitMQSettings(const RabbitMQSettings & settings); + RabbitMQSettings(RabbitMQSettings && settings) noexcept; + ~RabbitMQSettings(); + + RABBITMQ_SETTINGS_SUPPORTED_TYPES(RabbitMQSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + void loadFromQuery(ASTStorage & storage_def); + void loadFromNamedCollection(const MutableNamedCollectionPtr & named_collection); + + SettingsChanges getFormatSettings() const; + +private: + std::unique_ptr impl; }; } diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index ce667847cdb..0f3ac2d5289 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,8 @@ #include #include +#include + namespace DB { namespace Setting @@ -43,6 +46,37 @@ namespace Setting extern const SettingsBool stream_like_engine_allow_direct_select; } +namespace RabbitMQSetting +{ + extern const RabbitMQSettingsString rabbitmq_address; + extern const RabbitMQSettingsBool rabbitmq_commit_on_select; + extern const RabbitMQSettingsUInt64 rabbitmq_empty_queue_backoff_end_ms; + extern const RabbitMQSettingsUInt64 rabbitmq_empty_queue_backoff_start_ms; + extern const RabbitMQSettingsUInt64 rabbitmq_empty_queue_backoff_step_ms; + extern const RabbitMQSettingsString rabbitmq_exchange_name; + extern const RabbitMQSettingsString rabbitmq_exchange_type; + extern const RabbitMQSettingsUInt64 rabbitmq_flush_interval_ms; + extern const RabbitMQSettingsString rabbitmq_format; + extern const RabbitMQSettingsStreamingHandleErrorMode rabbitmq_handle_error_mode; + extern const RabbitMQSettingsString rabbitmq_host_port; + extern const RabbitMQSettingsUInt64 rabbitmq_max_block_size; + extern const RabbitMQSettingsUInt64 rabbitmq_max_rows_per_message; + extern const RabbitMQSettingsUInt64 rabbitmq_num_consumers; + extern const RabbitMQSettingsUInt64 rabbitmq_num_queues; + extern const RabbitMQSettingsString rabbitmq_password; + extern const RabbitMQSettingsBool rabbitmq_persistent; + extern const RabbitMQSettingsString rabbitmq_queue_base; + extern const RabbitMQSettingsBool rabbitmq_queue_consume; + extern const RabbitMQSettingsString rabbitmq_queue_settings_list; + extern const RabbitMQSettingsString rabbitmq_routing_key_list; + extern const RabbitMQSettingsString rabbitmq_schema; + extern const RabbitMQSettingsBool rabbitmq_secure; + extern const RabbitMQSettingsUInt64 rabbitmq_skip_broken_messages; + extern const RabbitMQSettingsString rabbitmq_username; + extern const RabbitMQSettingsString rabbitmq_vhost; + extern const RabbitMQSettingsBool reject_unhandled_messages; +} + static const uint32_t QUEUE_SIZE = 100000; static const auto MAX_FAILED_READ_ATTEMPTS = 10; static const auto RESCHEDULE_MS = 500; @@ -84,26 
+118,26 @@ StorageRabbitMQ::StorageRabbitMQ( : IStorage(table_id_) , WithContext(context_->getGlobalContext()) , rabbitmq_settings(std::move(rabbitmq_settings_)) - , exchange_name(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_exchange_name)) - , format_name(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_format)) - , exchange_type(defineExchangeType(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_exchange_type))) - , routing_keys(parseSettings(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_routing_key_list))) - , schema_name(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_schema)) - , num_consumers(rabbitmq_settings->rabbitmq_num_consumers.value) - , num_queues(rabbitmq_settings->rabbitmq_num_queues.value) - , queue_base(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_queue_base)) - , queue_settings_list(parseSettings(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_queue_settings_list))) - , max_rows_per_message(rabbitmq_settings->rabbitmq_max_rows_per_message) + , exchange_name(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_exchange_name])) + , format_name(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_format])) + , exchange_type(defineExchangeType(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_exchange_type]))) + , routing_keys(parseSettings(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_routing_key_list]))) + , schema_name(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_schema])) + , num_consumers((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_num_consumers].value) + , num_queues((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_num_queues].value) + , queue_base(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_queue_base])) + , queue_settings_list(parseSettings(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_queue_settings_list]))) + , max_rows_per_message((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_max_rows_per_message]) , log(getLogger("StorageRabbitMQ (" + table_id_.table_name + ")")) - , persistent(rabbitmq_settings->rabbitmq_persistent.value) - , use_user_setup(rabbitmq_settings->rabbitmq_queue_consume.value) + , persistent((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_persistent].value) + , use_user_setup((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_queue_consume].value) , hash_exchange(num_consumers > 1 || num_queues > 1) , semaphore(0, static_cast(num_consumers)) , unique_strbase(getRandomName()) , queue_size(std::max(QUEUE_SIZE, static_cast(getMaxBlockSize()))) - , milliseconds_to_wait(rabbitmq_settings->rabbitmq_empty_queue_backoff_start_ms) + , milliseconds_to_wait((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_empty_queue_backoff_start_ms]) { - reject_unhandled_messages = rabbitmq_settings->reject_unhandled_messages + reject_unhandled_messages = (*rabbitmq_settings)[RabbitMQSetting::reject_unhandled_messages] || queue_settings_list.end() != std::find_if(queue_settings_list.begin(), queue_settings_list.end(), [](const String & name) { return name.starts_with(deadletter_exchange_setting); }); @@ -111,11 +145,11 @@ StorageRabbitMQ::StorageRabbitMQ( const auto & config = getContext()->getConfigRef(); std::pair parsed_address; - auto setting_rabbitmq_username = rabbitmq_settings->rabbitmq_username.value; - auto setting_rabbitmq_password = 
rabbitmq_settings->rabbitmq_password.value; + auto setting_rabbitmq_username = (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_username].value; + auto setting_rabbitmq_password = (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_password].value; String username, password; - if (rabbitmq_settings->rabbitmq_host_port.changed) + if ((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_host_port].changed) { username = setting_rabbitmq_username.empty() ? config.getString("rabbitmq.username", "") : setting_rabbitmq_username; password = setting_rabbitmq_password.empty() ? config.getString("rabbitmq.password", "") : setting_rabbitmq_password; @@ -124,7 +158,7 @@ StorageRabbitMQ::StorageRabbitMQ( ErrorCodes::BAD_ARGUMENTS, "No username or password. They can be specified either in config or in storage settings"); - parsed_address = parseAddress(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_host_port), 5672); + parsed_address = parseAddress(getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_host_port]), 5672); if (parsed_address.first.empty()) throw Exception( ErrorCodes::BAD_ARGUMENTS, @@ -132,7 +166,7 @@ StorageRabbitMQ::StorageRabbitMQ( context_->getRemoteHostFilter().checkHostAndPort(parsed_address.first, toString(parsed_address.second)); } - else if (!rabbitmq_settings->rabbitmq_address.changed) + else if (!(*rabbitmq_settings)[RabbitMQSetting::rabbitmq_address].changed) throw Exception(ErrorCodes::BAD_ARGUMENTS, "RabbitMQ requires either `rabbitmq_host_port` or `rabbitmq_address` setting"); configuration = @@ -141,9 +175,9 @@ StorageRabbitMQ::StorageRabbitMQ( .port = parsed_address.second, .username = username, .password = password, - .vhost = config.getString("rabbitmq.vhost", getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_vhost)), - .secure = rabbitmq_settings->rabbitmq_secure.value, - .connection_string = getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_address) + .vhost = config.getString("rabbitmq.vhost", getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_vhost])), + .secure = (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_secure].value, + .connection_string = getContext()->getMacros()->expand((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_address]) }; if (configuration.secure) @@ -156,7 +190,7 @@ StorageRabbitMQ::StorageRabbitMQ( storage_metadata.setColumns(columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - setVirtuals(createVirtuals(rabbitmq_settings->rabbitmq_handle_error_mode)); + setVirtuals(createVirtuals((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_handle_error_mode])); rabbitmq_context = addSettings(getContext()); rabbitmq_context->makeQueryContext(); @@ -212,6 +246,8 @@ StorageRabbitMQ::StorageRabbitMQ( init_task->deactivate(); } +StorageRabbitMQ::~StorageRabbitMQ() = default; + VirtualColumnsDescription StorageRabbitMQ::createVirtuals(StreamingHandleErrorMode handle_error_mode) { VirtualColumnsDescription desc; @@ -280,8 +316,8 @@ ContextMutablePtr StorageRabbitMQ::addSettings(ContextPtr local_context) const auto modified_context = Context::createCopy(local_context); modified_context->setSetting("input_format_skip_unknown_fields", true); modified_context->setSetting("input_format_allow_errors_ratio", 0.); - if (rabbitmq_settings->rabbitmq_handle_error_mode == StreamingHandleErrorMode::DEFAULT) - modified_context->setSetting("input_format_allow_errors_num", rabbitmq_settings->rabbitmq_skip_broken_messages.value); + if 
((*rabbitmq_settings)[RabbitMQSetting::rabbitmq_handle_error_mode] == StreamingHandleErrorMode::DEFAULT) + modified_context->setSetting("input_format_allow_errors_num", (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_skip_broken_messages].value); else modified_context->setSetting("input_format_allow_errors_num", Field(0)); @@ -291,14 +327,8 @@ ContextMutablePtr StorageRabbitMQ::addSettings(ContextPtr local_context) const if (!schema_name.empty()) modified_context->setSetting("format_schema", schema_name); - for (const auto & setting : *rabbitmq_settings) - { - const auto & setting_name = setting.getName(); - - /// check for non-rabbitmq-related settings - if (!setting_name.starts_with("rabbitmq_")) - modified_context->setSetting(setting_name, setting.getValue()); - } + /// check for non-rabbitmq-related settings + modified_context->applySettingsChanges(rabbitmq_settings->getFormatSettings()); return modified_context; } @@ -396,8 +426,8 @@ void StorageRabbitMQ::deactivateTask(BackgroundSchedulePool::TaskHolder & task, size_t StorageRabbitMQ::getMaxBlockSize() const { - return rabbitmq_settings->rabbitmq_max_block_size.changed - ? rabbitmq_settings->rabbitmq_max_block_size.value + return (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_max_block_size].changed + ? (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_max_block_size].value : (getContext()->getSettingsRef()[Setting::max_insert_block_size].value / num_consumers); } @@ -773,16 +803,16 @@ void StorageRabbitMQ::read( Pipes pipes; pipes.reserve(num_created_consumers); - uint64_t max_execution_time_ms = rabbitmq_settings->rabbitmq_flush_interval_ms.changed - ? rabbitmq_settings->rabbitmq_flush_interval_ms + uint64_t max_execution_time_ms = (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_flush_interval_ms].changed + ? (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_flush_interval_ms] : static_cast(getContext()->getSettingsRef()[Setting::stream_flush_interval_ms].totalMilliseconds()); for (size_t i = 0; i < num_created_consumers; ++i) { auto rabbit_source = std::make_shared( *this, storage_snapshot, modified_context, column_names, /* max_block_size */1, - max_execution_time_ms, rabbitmq_settings->rabbitmq_handle_error_mode, reject_unhandled_messages, - /* ack_in_suffix */rabbitmq_settings->rabbitmq_commit_on_select, log); + max_execution_time_ms, (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_handle_error_mode], reject_unhandled_messages, + /* ack_in_suffix */(*rabbitmq_settings)[RabbitMQSetting::rabbitmq_commit_on_select], log); auto converting_dag = ActionsDAG::makeConvertingActions( rabbit_source->getPort().getHeader().getColumnsWithTypeAndName(), @@ -1041,8 +1071,8 @@ void StorageRabbitMQ::streamingToViewsFunc() else { /// Reschedule with backoff. 
- if (milliseconds_to_wait < rabbitmq_settings->rabbitmq_empty_queue_backoff_end_ms) - milliseconds_to_wait += rabbitmq_settings->rabbitmq_empty_queue_backoff_step_ms; + if (milliseconds_to_wait < (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_empty_queue_backoff_end_ms]) + milliseconds_to_wait += (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_empty_queue_backoff_step_ms]; LOG_DEBUG(log, "Rescheduling background streaming process in {}", milliseconds_to_wait); streaming_task->scheduleAfter(milliseconds_to_wait); @@ -1089,7 +1119,7 @@ void StorageRabbitMQ::streamToViewsImpl() break; } - milliseconds_to_wait = rabbitmq_settings->rabbitmq_empty_queue_backoff_start_ms; + milliseconds_to_wait = (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_empty_queue_backoff_start_ms]; } } } @@ -1110,15 +1140,15 @@ bool StorageRabbitMQ::tryStreamToViews() sources.reserve(num_created_consumers); pipes.reserve(num_created_consumers); - uint64_t max_execution_time_ms = rabbitmq_settings->rabbitmq_flush_interval_ms.changed - ? rabbitmq_settings->rabbitmq_flush_interval_ms + uint64_t max_execution_time_ms = (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_flush_interval_ms].changed + ? (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_flush_interval_ms] : static_cast(getContext()->getSettingsRef()[Setting::stream_flush_interval_ms].totalMilliseconds()); for (size_t i = 0; i < num_created_consumers; ++i) { auto source = std::make_shared( *this, storage_snapshot, rabbitmq_context, Names{}, block_size, - max_execution_time_ms, rabbitmq_settings->rabbitmq_handle_error_mode, + max_execution_time_ms, (*rabbitmq_settings)[RabbitMQSetting::rabbitmq_handle_error_mode], reject_unhandled_messages, /* ack_in_suffix */false, log); sources.emplace_back(source); @@ -1274,26 +1304,19 @@ void registerStorageRabbitMQ(StorageFactory & factory) auto rabbitmq_settings = std::make_unique(); if (auto named_collection = tryGetNamedCollectionWithOverrides(args.engine_args, args.getLocalContext())) - { - for (const auto & setting : rabbitmq_settings->all()) - { - const auto & setting_name = setting.getName(); - if (named_collection->has(setting_name)) - rabbitmq_settings->set(setting_name, named_collection->get(setting_name)); - } - } + rabbitmq_settings->loadFromNamedCollection(named_collection); else if (!args.storage_def->settings) throw Exception(ErrorCodes::BAD_ARGUMENTS, "RabbitMQ engine must have settings"); if (args.storage_def->settings) rabbitmq_settings->loadFromQuery(*args.storage_def); - if (!rabbitmq_settings->rabbitmq_host_port.changed - && !rabbitmq_settings->rabbitmq_address.changed) + if (!(*rabbitmq_settings)[RabbitMQSetting::rabbitmq_host_port].changed + && !(*rabbitmq_settings)[RabbitMQSetting::rabbitmq_address].changed) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "You must specify either `rabbitmq_host_port` or `rabbitmq_address` settings"); - if (!rabbitmq_settings->rabbitmq_format.changed) + if (!(*rabbitmq_settings)[RabbitMQSetting::rabbitmq_format].changed) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "You must specify `rabbitmq_format` setting"); return std::make_shared(args.table_id, args.getContext(), args.columns, args.comment, std::move(rabbitmq_settings), args.mode); diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index fed80a4357b..e80c9d2a0f5 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -1,12 +1,12 @@ #pragma once #include +#include #include #include #include #include 
#include -#include #include #include #include @@ -16,7 +16,7 @@ namespace DB { - +struct RabbitMQSettings; using RabbitMQConsumerPtr = std::shared_ptr; class StorageRabbitMQ final: public IStorage, WithContext @@ -30,6 +30,8 @@ public: std::unique_ptr rabbitmq_settings_, LoadingStrictnessLevel mode); + ~StorageRabbitMQ() override; + std::string getName() const override { return "RabbitMQ"; } bool noPushingToViews() const override { return true; } diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp index 3dfeee8d912..b7bdf6dd2fc 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp @@ -35,6 +35,11 @@ namespace Setting extern const SettingsUInt64 min_insert_block_size_rows; } +namespace RocksDBSetting +{ + extern const RocksDBSettingsUInt64 bulk_insert_block_size; +} + namespace ErrorCodes { extern const int ROCKSDB_ERROR; @@ -91,7 +96,7 @@ EmbeddedRocksDBBulkSink::EmbeddedRocksDBBulkSink( serializations = getHeader().getSerializations(); min_block_size_rows - = std::max(storage.getSettings().bulk_insert_block_size, getContext()->getSettingsRef()[Setting::min_insert_block_size_rows]); + = std::max(storage.getSettings()[RocksDBSetting::bulk_insert_block_size], getContext()->getSettingsRef()[Setting::min_insert_block_size_rows]); /// If max_insert_threads > 1 we may have multiple EmbeddedRocksDBBulkSink and getContext()->getCurrentQueryId() is not guarantee to /// to have a distinct path. Also we cannot use query id as directory name here, because it could be defined by user and not suitable diff --git a/src/Storages/RocksDB/RocksDBSettings.cpp b/src/Storages/RocksDB/RocksDBSettings.cpp index 7de2077eb47..fef15660cc8 100644 --- a/src/Storages/RocksDB/RocksDBSettings.cpp +++ b/src/Storages/RocksDB/RocksDBSettings.cpp @@ -1,6 +1,8 @@ -#include "RocksDBSettings.h" +#include +#include #include #include +#include namespace DB { @@ -10,17 +12,59 @@ namespace ErrorCodes extern const int UNKNOWN_SETTING; } -IMPLEMENT_SETTINGS_TRAITS(RockDBSettingsTraits, LIST_OF_ROCKSDB_SETTINGS) +/** StorageEmbeddedRocksdb table settings + */ +#define LIST_OF_ROCKSDB_SETTINGS(M, ALIAS) \ + M(Bool, optimize_for_bulk_insert, true, "Table is optimized for bulk insertions (insert pipeline will create SST files and import to rocksdb database instead of writing to memtables)", 0) \ + M(UInt64, bulk_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "Size of block for bulk insert, if it's smaller than query setting min_insert_block_size_rows then it will be overridden by min_insert_block_size_rows", 0) \ + +DECLARE_SETTINGS_TRAITS(RocksDBSettingsTraits, LIST_OF_ROCKSDB_SETTINGS) +IMPLEMENT_SETTINGS_TRAITS(RocksDBSettingsTraits, LIST_OF_ROCKSDB_SETTINGS) + +struct RocksDBSettingsImpl : public BaseSettings +{ +}; + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) RocksDBSettings##TYPE NAME = &RocksDBSettingsImpl ::NAME; + +namespace RocksDBSetting +{ +LIST_OF_ROCKSDB_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN -void RocksDBSettings::loadFromQuery(ASTStorage & storage_def, ContextPtr /*context*/) +RocksDBSettings::RocksDBSettings() : impl(std::make_unique()) +{ +} + +RocksDBSettings::RocksDBSettings(const RocksDBSettings & settings) : impl(std::make_unique(*settings.impl)) +{ +} + +RocksDBSettings::RocksDBSettings(RocksDBSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + 
+RocksDBSettings::~RocksDBSettings() = default; + +ROCKSDB_SETTINGS_SUPPORTED_TYPES(RocksDBSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) + +void RocksDBSettings::applyChanges(const SettingsChanges & changes) +{ + impl->applyChanges(changes); +} + +void RocksDBSettings::loadFromQuery(const ASTStorage & storage_def) { if (storage_def.settings) { try { auto changes = storage_def.settings->changes; - applyChanges(changes); + impl->applyChanges(changes); } catch (Exception & e) { @@ -30,12 +74,4 @@ void RocksDBSettings::loadFromQuery(ASTStorage & storage_def, ContextPtr /*conte } } } - -std::vector RocksDBSettings::getAllRegisteredNames() const -{ - std::vector all_settings; - for (const auto & setting_field : all()) - all_settings.push_back(setting_field.getName()); - return all_settings; -} } diff --git a/src/Storages/RocksDB/RocksDBSettings.h b/src/Storages/RocksDB/RocksDBSettings.h index 1b168c56d89..7205c025675 100644 --- a/src/Storages/RocksDB/RocksDBSettings.h +++ b/src/Storages/RocksDB/RocksDBSettings.h @@ -1,39 +1,34 @@ #pragma once -#include -#include -#include -#include -#include - - -namespace Poco::Util -{ -class AbstractConfiguration; -} - +#include +#include namespace DB { class ASTStorage; -struct Settings; +struct RocksDBSettingsImpl; +class SettingsChanges; +/// List of available types supported in RocksDBSettings object +#define ROCKSDB_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, UInt64) -/** StorageEmbeddedRocksdb table settings - */ +ROCKSDB_SETTINGS_SUPPORTED_TYPES(RocksDBSettings, DECLARE_SETTING_TRAIT) -#define ROCKSDB_SETTINGS(M, ALIAS) \ - M(Bool, optimize_for_bulk_insert, true, "Table is optimized for bulk insertions (insert pipeline will create SST files and import to rocksdb database instead of writing to memtables)", 0) \ - M(UInt64, bulk_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "Size of block for bulk insert, if it's smaller than query setting min_insert_block_size_rows then it will be overridden by min_insert_block_size_rows", 0) \ - -#define LIST_OF_ROCKSDB_SETTINGS(M, ALIAS) ROCKSDB_SETTINGS(M, ALIAS) - -DECLARE_SETTINGS_TRAITS(RockDBSettingsTraits, LIST_OF_ROCKSDB_SETTINGS) - -struct RocksDBSettings : public BaseSettings, public IHints<2> +struct RocksDBSettings { - void loadFromQuery(ASTStorage & storage_def, ContextPtr context); - std::vector getAllRegisteredNames() const override; -}; + RocksDBSettings(); + RocksDBSettings(const RocksDBSettings & settings); + RocksDBSettings(RocksDBSettings && settings) noexcept; + ~RocksDBSettings(); + ROCKSDB_SETTINGS_SUPPORTED_TYPES(RocksDBSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + + void applyChanges(const SettingsChanges & changes); + void loadFromQuery(const ASTStorage & storage_def); + +private: + std::unique_ptr impl; +}; } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 86346f25d08..d0e0d37679c 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -57,6 +57,11 @@ namespace Setting extern const SettingsBool optimize_trivial_approximate_count_query; } +namespace RocksDBSetting +{ + extern const RocksDBSettingsBool optimize_for_bulk_insert; +} + namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -663,7 +668,7 @@ void ReadFromEmbeddedRocksDB::applyFilters(ActionDAGNodes added_filter_nodes) SinkToStoragePtr StorageEmbeddedRocksDB::write( const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context, 
bool /*async_insert*/) { - if (getSettings().optimize_for_bulk_insert) + if (getSettings()[RocksDBSetting::optimize_for_bulk_insert]) { LOG_DEBUG(log, "Using bulk insert"); return std::make_shared(query_context, *this, metadata_snapshot); @@ -710,7 +715,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) throw Exception(ErrorCodes::BAD_ARGUMENTS, "StorageEmbeddedRocksDB must require one column in primary key"); } auto settings = std::make_unique(); - settings->loadFromQuery(*args.storage_def, args.getContext()); + settings->loadFromQuery(*args.storage_def); if (args.storage_def->settings) metadata.settings_changes = args.storage_def->settings->ptr(); else @@ -720,7 +725,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// SETTING queries. So we just add a setting with its default value. auto settings_changes = std::make_shared(); settings_changes->is_standalone = false; - settings_changes->changes.insertSetting("optimize_for_bulk_insert", settings->optimize_for_bulk_insert.value); + settings_changes->changes.insertSetting("optimize_for_bulk_insert", (*settings)[RocksDBSetting::optimize_for_bulk_insert].value); metadata.settings_changes = settings_changes; } return std::make_shared(args.table_id, args.relative_data_path, metadata, args.mode, args.getContext(), std::move(settings), primary_key_names[0], ttl, std::move(rocksdb_dir), read_only); diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 55c975712e3..d047b28e076 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -53,6 +53,11 @@ namespace ServerSetting extern const ServerSettingsUInt64 max_materialized_views_count_for_table; } +namespace RefreshSetting +{ + extern const RefreshSettingsBool all_replicas; +} + namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -177,7 +182,7 @@ StorageMaterializedView::StorageMaterializedView( RefreshSettings s; if (query.refresh_strategy->settings) s.applyChanges(query.refresh_strategy->settings->changes); - refresh_coordinated = !s.all_replicas; + refresh_coordinated = !s[RefreshSetting::all_replicas]; } else { diff --git a/src/Storages/StorageTimeSeries.cpp b/src/Storages/StorageTimeSeries.cpp index a42758d6822..50346709a83 100644 --- a/src/Storages/StorageTimeSeries.cpp +++ b/src/Storages/StorageTimeSeries.cpp @@ -179,9 +179,9 @@ StorageTimeSeries::StorageTimeSeries( StorageTimeSeries::~StorageTimeSeries() = default; -TimeSeriesSettings StorageTimeSeries::getStorageSettings() const +const TimeSeriesSettings & StorageTimeSeries::getStorageSettings() const { - return *getStorageSettingsPtr(); + return *storage_settings; } void StorageTimeSeries::startup() diff --git a/src/Storages/StorageTimeSeries.h b/src/Storages/StorageTimeSeries.h index 35db3131a0b..6be2e9184b7 100644 --- a/src/Storages/StorageTimeSeries.h +++ b/src/Storages/StorageTimeSeries.h @@ -41,8 +41,7 @@ public: std::string getName() const override { return "TimeSeries"; } - TimeSeriesSettings getStorageSettings() const; - TimeSeriesSettingsPtr getStorageSettingsPtr() const { return storage_settings; } + const TimeSeriesSettings & getStorageSettings() const; StorageID getTargetTableId(ViewTarget::Kind target_kind) const; StoragePtr getTargetTable(ViewTarget::Kind target_kind, const ContextPtr & local_context) const; diff --git a/src/Storages/TimeSeries/PrometheusRemoteReadProtocol.cpp b/src/Storages/TimeSeries/PrometheusRemoteReadProtocol.cpp index 52fa24f48e0..df0f6b8bc5c 100644 --- 
a/src/Storages/TimeSeries/PrometheusRemoteReadProtocol.cpp +++ b/src/Storages/TimeSeries/PrometheusRemoteReadProtocol.cpp @@ -31,6 +31,12 @@ namespace DB { +namespace TimeSeriesSetting +{ + extern const TimeSeriesSettingsBool filter_by_min_time_and_max_time; + extern const TimeSeriesSettingsMap tags_to_columns; +} + namespace ErrorCodes { extern const int BAD_REQUEST_PARAMETER; @@ -160,7 +166,7 @@ namespace std::unordered_map makeColumnNameByTagNameMap(const TimeSeriesSettings & storage_settings) { std::unordered_map res; - const Map & tags_to_columns = storage_settings.tags_to_columns; + const Map & tags_to_columns = storage_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -196,7 +202,7 @@ namespace exp_list->children.push_back( makeASTColumn(tags_table_id, TimeSeriesColumnNames::MetricName)); - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -255,7 +261,7 @@ namespace /// WHERE if (auto where = makeASTFilterForReadingTimeSeries(label_matcher, min_timestamp_ms, max_timestamp_ms, data_table_id, tags_table_id, - column_name_by_tag_name, time_series_settings.filter_by_min_time_and_max_time)) + column_name_by_tag_name, time_series_settings[TimeSeriesSetting::filter_by_min_time_and_max_time])) { select_query->setExpression(ASTSelectQuery::Expression::WHERE, std::move(where)); } @@ -267,7 +273,7 @@ namespace exp_list->children.push_back( makeASTColumn(tags_table_id, TimeSeriesColumnNames::MetricName)); - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -329,7 +335,7 @@ namespace /// Columns corresponding to specific tags specified in the "tags_to_columns" setting. 
std::unordered_map column_by_tag_name; - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -447,12 +453,12 @@ void PrometheusRemoteReadProtocol::readTimeSeries(google::protobuf::RepeatedPtrF out_time_series.Clear(); auto time_series_storage_id = time_series_storage->getStorageID(); - auto time_series_settings = time_series_storage->getStorageSettingsPtr(); + const auto & time_series_settings = time_series_storage->getStorageSettings(); auto data_table_id = time_series_storage->getTargetTableId(ViewTarget::Data); auto tags_table_id = time_series_storage->getTargetTableId(ViewTarget::Tags); ASTPtr select_query = buildSelectQueryForReadingTimeSeries( - start_timestamp_ms, end_timestamp_ms, label_matcher, *time_series_settings, data_table_id, tags_table_id); + start_timestamp_ms, end_timestamp_ms, label_matcher, time_series_settings, data_table_id, tags_table_id); LOG_TRACE(log, "{}: Executing query {}", time_series_storage_id.getNameForLogs(), select_query); @@ -468,7 +474,7 @@ void PrometheusRemoteReadProtocol::readTimeSeries(google::protobuf::RepeatedPtrF time_series_storage_id.getNameForLogs(), block.columns(), block.rows()); if (block) - convertBlockToProtobuf(std::move(block), out_time_series, time_series_storage_id, *time_series_settings); + convertBlockToProtobuf(std::move(block), out_time_series, time_series_storage_id, time_series_settings); } LOG_TRACE(log, "{}: {} time series read", diff --git a/src/Storages/TimeSeries/PrometheusRemoteWriteProtocol.cpp b/src/Storages/TimeSeries/PrometheusRemoteWriteProtocol.cpp index 3be5e8d5c39..815f642a80d 100644 --- a/src/Storages/TimeSeries/PrometheusRemoteWriteProtocol.cpp +++ b/src/Storages/TimeSeries/PrometheusRemoteWriteProtocol.cpp @@ -32,6 +32,13 @@ namespace DB { +namespace TimeSeriesSetting +{ + extern const TimeSeriesSettingsBool store_min_time_and_max_time; + extern const TimeSeriesSettingsMap tags_to_columns; + extern const TimeSeriesSettingsBool use_all_tags_column_to_generate_id; +} + namespace ErrorCodes { extern const int ILLEGAL_TIME_SERIES_TAGS; @@ -261,7 +268,7 @@ namespace /// Columns corresponding to specific tags specified in the "tags_to_columns" setting. 
std::unordered_map columns_by_tag_name; - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -286,7 +293,7 @@ namespace IColumn * all_tags_names = nullptr; IColumn * all_tags_values = nullptr; IColumn::Offsets * all_tags_offsets = nullptr; - if (time_series_settings.use_all_tags_column_to_generate_id) + if (time_series_settings[TimeSeriesSetting::use_all_tags_column_to_generate_id]) { const auto & all_tags_description = get_column_description(TimeSeriesColumnNames::AllTags); validator.validateColumnForTagsMap(all_tags_description); @@ -301,7 +308,7 @@ namespace IColumn * max_time_column = nullptr; UInt32 min_time_scale = 0; UInt32 max_time_scale = 0; - if (time_series_settings.store_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::store_min_time_and_max_time]) { const auto & min_time_description = get_column_description(TimeSeriesColumnNames::MinTime); const auto & max_time_description = get_column_description(TimeSeriesColumnNames::MaxTime); @@ -336,7 +343,7 @@ namespace } else { - if (time_series_settings.use_all_tags_column_to_generate_id) + if (time_series_settings[TimeSeriesSetting::use_all_tags_column_to_generate_id]) { all_tags_names->insertData(tag_name.data(), tag_name.length()); all_tags_values->insertData(tag_value.data(), tag_value.length()); @@ -359,10 +366,10 @@ namespace tags_offsets.push_back(tags_names.size()); - if (time_series_settings.use_all_tags_column_to_generate_id) + if (time_series_settings[TimeSeriesSetting::use_all_tags_column_to_generate_id]) all_tags_offsets->push_back(all_tags_names->size()); - if (time_series_settings.store_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::store_min_time_and_max_time]) { auto [min_time, max_time] = findMinTimeAndMaxTime(element.samples()); min_time_column->insert(scaleTimestamp(min_time, min_time_scale)); @@ -571,9 +578,9 @@ void PrometheusRemoteWriteProtocol::writeTimeSeries(const google::protobuf::Repe time_series_storage_id.getNameForLogs(), time_series.size()); auto time_series_storage_metadata = time_series_storage->getInMemoryMetadataPtr(); - auto time_series_settings = time_series_storage->getStorageSettingsPtr(); + const auto & time_series_settings = time_series_storage->getStorageSettings(); - auto blocks = toBlocks(time_series, getContext(), time_series_storage_id, *time_series_storage_metadata, *time_series_settings); + auto blocks = toBlocks(time_series, getContext(), time_series_storage_id, *time_series_storage_metadata, time_series_settings); insertToTargetTables(std::move(blocks), *time_series_storage, getContext(), log.get()); LOG_TRACE(log, "{}: {} time series written", @@ -588,9 +595,9 @@ void PrometheusRemoteWriteProtocol::writeMetricsMetadata(const google::protobuf: time_series_storage_id.getNameForLogs(), metrics_metadata.size()); auto time_series_storage_metadata = time_series_storage->getInMemoryMetadataPtr(); - auto time_series_settings = time_series_storage->getStorageSettingsPtr(); + const auto & time_series_settings = time_series_storage->getStorageSettings(); - auto blocks = toBlocks(metrics_metadata, time_series_storage_id, *time_series_storage_metadata, *time_series_settings); + auto blocks = toBlocks(metrics_metadata, time_series_storage_id, *time_series_storage_metadata, time_series_settings); 
insertToTargetTables(std::move(blocks), *time_series_storage, getContext(), log.get()); LOG_TRACE(log, "{}: {} metrics metadata written", diff --git a/src/Storages/TimeSeries/TimeSeriesColumnsValidator.cpp b/src/Storages/TimeSeries/TimeSeriesColumnsValidator.cpp index a2308857e2e..5cd6127ad57 100644 --- a/src/Storages/TimeSeries/TimeSeriesColumnsValidator.cpp +++ b/src/Storages/TimeSeries/TimeSeriesColumnsValidator.cpp @@ -12,6 +12,11 @@ namespace DB { +namespace TimeSeriesSetting +{ + extern const TimeSeriesSettingsMap tags_to_columns; +} + namespace ErrorCodes { extern const int ILLEGAL_COLUMN; @@ -63,7 +68,7 @@ void TimeSeriesColumnsValidator::validateColumnsImpl(const ColumnsDescription & /// Validate columns for the "tags" table. validateColumnForMetricName(get_column_description(TimeSeriesColumnNames::MetricName)); - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -127,7 +132,7 @@ void TimeSeriesColumnsValidator::validateTargetColumnsImpl(ViewTarget::Kind targ { validateColumnForMetricName(get_column_description(TimeSeriesColumnNames::MetricName)); - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); diff --git a/src/Storages/TimeSeries/TimeSeriesDefinitionNormalizer.cpp b/src/Storages/TimeSeries/TimeSeriesDefinitionNormalizer.cpp index 253a80e22de..9f578043e54 100644 --- a/src/Storages/TimeSeries/TimeSeriesDefinitionNormalizer.cpp +++ b/src/Storages/TimeSeries/TimeSeriesDefinitionNormalizer.cpp @@ -15,6 +15,14 @@ namespace DB { +namespace TimeSeriesSetting +{ + extern const TimeSeriesSettingsBool aggregate_min_time_and_max_time; + extern const TimeSeriesSettingsBool store_min_time_and_max_time; + extern const TimeSeriesSettingsMap tags_to_columns; + extern const TimeSeriesSettingsBool use_all_tags_column_to_generate_id; +} + namespace ErrorCodes { extern const int INCOMPATIBLE_COLUMNS; @@ -85,7 +93,7 @@ void TimeSeriesDefinitionNormalizer::reorderColumns(ASTCreateQuery & create) con /// Reorder columns for the "tags" table. 
add_column_in_correct_order(TimeSeriesColumnNames::MetricName); - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -96,7 +104,7 @@ void TimeSeriesDefinitionNormalizer::reorderColumns(ASTCreateQuery & create) con add_column_in_correct_order(TimeSeriesColumnNames::Tags); add_column_in_correct_order(TimeSeriesColumnNames::AllTags); - if (time_series_settings.store_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::store_min_time_and_max_time]) { add_column_in_correct_order(TimeSeriesColumnNames::MinTime); add_column_in_correct_order(TimeSeriesColumnNames::MaxTime); @@ -198,7 +206,7 @@ void TimeSeriesDefinitionNormalizer::addMissingColumns(ASTCreateQuery & create) make_new_column(TimeSeriesColumnNames::MetricName, get_lc_string_type()); } - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -221,7 +229,7 @@ void TimeSeriesDefinitionNormalizer::addMissingColumns(ASTCreateQuery & create) make_new_column(TimeSeriesColumnNames::AllTags, get_string_to_string_map_type()); } - if (time_series_settings.store_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::store_min_time_and_max_time]) { /// We use Nullable(DateTime64(3)) as the default type of the `min_time` and `max_time` columns. /// It's nullable because it allows the aggregation (see aggregate_min_time_and_max_time) work correctly even @@ -284,13 +292,13 @@ ASTPtr TimeSeriesDefinitionNormalizer::chooseIDAlgorithm(const ASTColumnDeclarat ASTs arguments_for_hash_function; arguments_for_hash_function.push_back(std::make_shared(TimeSeriesColumnNames::MetricName)); - if (time_series_settings.use_all_tags_column_to_generate_id) + if (time_series_settings[TimeSeriesSetting::use_all_tags_column_to_generate_id]) { arguments_for_hash_function.push_back(std::make_shared(TimeSeriesColumnNames::AllTags)); } else { - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -424,7 +432,7 @@ void TimeSeriesDefinitionNormalizer::setInnerEngineByDefault(ViewTarget::Kind in case ViewTarget::Tags: { String engine_name; - if (time_series_settings.aggregate_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::aggregate_min_time_and_max_time]) engine_name = "AggregatingMergeTree"; else engine_name = "ReplacingMergeTree"; @@ -441,7 +449,7 @@ void TimeSeriesDefinitionNormalizer::setInnerEngineByDefault(ViewTarget::Kind in order_by_list.push_back(std::make_shared(TimeSeriesColumnNames::MetricName)); order_by_list.push_back(std::make_shared(TimeSeriesColumnNames::ID)); - if (time_series_settings.store_min_time_and_max_time && !time_series_settings.aggregate_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::store_min_time_and_max_time] && !time_series_settings[TimeSeriesSetting::aggregate_min_time_and_max_time]) { order_by_list.push_back(std::make_shared(TimeSeriesColumnNames::MinTime)); 
order_by_list.push_back(std::make_shared(TimeSeriesColumnNames::MaxTime)); diff --git a/src/Storages/TimeSeries/TimeSeriesInnerTablesCreator.cpp b/src/Storages/TimeSeries/TimeSeriesInnerTablesCreator.cpp index 5f616982a6f..3461ecf7ee1 100644 --- a/src/Storages/TimeSeries/TimeSeriesInnerTablesCreator.cpp +++ b/src/Storages/TimeSeries/TimeSeriesInnerTablesCreator.cpp @@ -22,6 +22,14 @@ namespace DB { +namespace TimeSeriesSetting +{ + extern const TimeSeriesSettingsBool aggregate_min_time_and_max_time; + extern const TimeSeriesSettingsBool store_min_time_and_max_time; + extern const TimeSeriesSettingsMap tags_to_columns; + extern const TimeSeriesSettingsBool use_all_tags_column_to_generate_id; +} + TimeSeriesInnerTablesCreator::TimeSeriesInnerTablesCreator(ContextPtr context_, StorageID time_series_storage_id_, std::reference_wrapper time_series_columns_, @@ -70,7 +78,7 @@ ColumnsDescription TimeSeriesInnerTablesCreator::getInnerTableColumnsDescription columns.add(time_series_columns.get(TimeSeriesColumnNames::MetricName)); /// Columns corresponding to specific tags specified in the "tags_to_columns" setting. - const Map & tags_to_columns = time_series_settings.tags_to_columns; + const Map & tags_to_columns = time_series_settings[TimeSeriesSetting::tags_to_columns]; for (const auto & tag_name_and_column_name : tags_to_columns) { const auto & tuple = tag_name_and_column_name.safeGet(); @@ -82,7 +90,7 @@ ColumnsDescription TimeSeriesInnerTablesCreator::getInnerTableColumnsDescription columns.add(time_series_columns.get(TimeSeriesColumnNames::Tags)); /// Column "all_tags". - if (time_series_settings.use_all_tags_column_to_generate_id) + if (time_series_settings[TimeSeriesSetting::use_all_tags_column_to_generate_id]) { ColumnDescription all_tags_column = time_series_columns.get(TimeSeriesColumnNames::AllTags); /// Column "all_tags" is here only to calculate the identifier of a time series for the "id" column, so it can be ephemeral. @@ -96,11 +104,11 @@ ColumnsDescription TimeSeriesInnerTablesCreator::getInnerTableColumnsDescription } /// Columns "min_time" and "max_time". - if (time_series_settings.store_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::store_min_time_and_max_time]) { auto min_time_column = time_series_columns.get(TimeSeriesColumnNames::MinTime); auto max_time_column = time_series_columns.get(TimeSeriesColumnNames::MaxTime); - if (time_series_settings.aggregate_min_time_and_max_time) + if (time_series_settings[TimeSeriesSetting::aggregate_min_time_and_max_time]) { AggregateFunctionProperties properties; auto min_function = AggregateFunctionFactory::instance().get("min", NullsAction::EMPTY, {min_time_column.type}, {}, properties); diff --git a/src/Storages/TimeSeries/TimeSeriesSettings.cpp b/src/Storages/TimeSeries/TimeSeriesSettings.cpp index 3a15be59191..c8b9715250d 100644 --- a/src/Storages/TimeSeries/TimeSeriesSettings.cpp +++ b/src/Storages/TimeSeries/TimeSeriesSettings.cpp @@ -1,7 +1,8 @@ -#include - +#include +#include #include #include +#include namespace DB @@ -12,15 +13,53 @@ namespace ErrorCodes extern const int UNKNOWN_SETTING; } +#define LIST_OF_TIME_SERIES_SETTINGS(M, ALIAS) \ + M(Map, tags_to_columns, Map{}, "Map specifying which tags should be put to separate columns of the 'tags' table. Syntax: {'tag1': 'column1', 'tag2' : column2, ...}", 0) \ + M(Bool, use_all_tags_column_to_generate_id, true, "When generating an expression to calculate an identifier of a time series, this flag enables using the 'all_tags' column in that calculation. 
The 'all_tags' is a virtual column containing all tags except the metric name", 0) \ + M(Bool, store_min_time_and_max_time, true, "If set to true then the table will store 'min_time' and 'max_time' for each time series", 0) \ + M(Bool, aggregate_min_time_and_max_time, true, "When creating an inner target 'tags' table, this flag enables using 'SimpleAggregateFunction(min, Nullable(DateTime64(3)))' instead of just 'Nullable(DateTime64(3))' as the type of the 'min_time' column, and the same for the 'max_time' column", 0) \ + M(Bool, filter_by_min_time_and_max_time, true, "If set to true then the table will use the 'min_time' and 'max_time' columns for filtering time series", 0) \ + +DECLARE_SETTINGS_TRAITS(TimeSeriesSettingsTraits, LIST_OF_TIME_SERIES_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(TimeSeriesSettingsTraits, LIST_OF_TIME_SERIES_SETTINGS) +struct TimeSeriesSettingsImpl : public BaseSettings +{ +}; + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) TimeSeriesSettings##TYPE NAME = &TimeSeriesSettingsImpl ::NAME; + +namespace TimeSeriesSetting +{ +LIST_OF_TIME_SERIES_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN + +TimeSeriesSettings::TimeSeriesSettings() : impl(std::make_unique()) +{ +} + +TimeSeriesSettings::TimeSeriesSettings(const TimeSeriesSettings & settings) : impl(std::make_unique(*settings.impl)) +{ +} + +TimeSeriesSettings::TimeSeriesSettings(TimeSeriesSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + +TimeSeriesSettings::~TimeSeriesSettings() = default; + +TIMESERIES_SETTINGS_SUPPORTED_TYPES(TimeSeriesSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) + void TimeSeriesSettings::loadFromQuery(ASTStorage & storage_def) { if (storage_def.settings) { try { - applyChanges(storage_def.settings->changes); + impl->applyChanges(storage_def.settings->changes); } catch (Exception & e) { diff --git a/src/Storages/TimeSeries/TimeSeriesSettings.h b/src/Storages/TimeSeries/TimeSeriesSettings.h index 4dc6a436cd0..7c65a18b8a9 100644 --- a/src/Storages/TimeSeries/TimeSeriesSettings.h +++ b/src/Storages/TimeSeries/TimeSeriesSettings.h @@ -1,29 +1,36 @@ #pragma once -#include +#include +#include namespace DB { class ASTStorage; +struct TimeSeriesSettingsImpl; -#define LIST_OF_TIME_SERIES_SETTINGS(M, ALIAS) \ - M(Map, tags_to_columns, Map{}, "Map specifying which tags should be put to separate columns of the 'tags' table. Syntax: {'tag1': 'column1', 'tag2' : column2, ...}", 0) \ - M(Bool, use_all_tags_column_to_generate_id, true, "When generating an expression to calculate an identifier of a time series, this flag enables using the 'all_tags' column in that calculation. 
The 'all_tags' is a virtual column containing all tags except the metric name", 0) \ - M(Bool, store_min_time_and_max_time, true, "If set to true then the table will store 'min_time' and 'max_time' for each time series", 0) \ - M(Bool, aggregate_min_time_and_max_time, true, "When creating an inner target 'tags' table, this flag enables using 'SimpleAggregateFunction(min, Nullable(DateTime64(3)))' instead of just 'Nullable(DateTime64(3))' as the type of the 'min_time' column, and the same for the 'max_time' column", 0) \ - M(Bool, filter_by_min_time_and_max_time, true, "If set to true then the table will use the 'min_time' and 'max_time' columns for filtering time series", 0) \ +/// List of available types supported in TimeSeriesSettings object +#define TIMESERIES_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, Map) -DECLARE_SETTINGS_TRAITS(TimeSeriesSettingsTraits, LIST_OF_TIME_SERIES_SETTINGS) +TIMESERIES_SETTINGS_SUPPORTED_TYPES(TimeSeriesSettings, DECLARE_SETTING_TRAIT) /// Settings for the TimeSeries table engine. /// Could be loaded from a CREATE TABLE query (SETTINGS clause). For example: /// CREATE TABLE mytable ENGINE = TimeSeries() SETTINGS tags_to_columns = {'job':'job', 'instance':'instance'} DATA ENGINE = ReplicatedMergeTree('zkpath', 'replica'), ... -struct TimeSeriesSettings : public BaseSettings +struct TimeSeriesSettings { + TimeSeriesSettings(); + TimeSeriesSettings(const TimeSeriesSettings & settings); + TimeSeriesSettings(TimeSeriesSettings && settings) noexcept; + ~TimeSeriesSettings(); + + TIMESERIES_SETTINGS_SUPPORTED_TYPES(TimeSeriesSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + void loadFromQuery(ASTStorage & storage_def); + +private: + std::unique_ptr impl; }; - -using TimeSeriesSettingsPtr = std::shared_ptr; - } diff --git a/tests/integration/test_merge_tree_s3/s3_mocks/unstable_proxy.py b/tests/integration/test_merge_tree_s3/s3_mocks/unstable_proxy.py index 6ccbf9b105c..21986c0d692 100644 --- a/tests/integration/test_merge_tree_s3/s3_mocks/unstable_proxy.py +++ b/tests/integration/test_merge_tree_s3/s3_mocks/unstable_proxy.py @@ -32,6 +32,7 @@ def request(command, url, headers={}, data=None): class RequestHandler(http.server.BaseHTTPRequestHandler): + # GetObject def do_GET(self): if self.path == "/": self.send_response(200) @@ -41,12 +42,18 @@ class RequestHandler(http.server.BaseHTTPRequestHandler): else: self.do_HEAD() + # PutObject def do_PUT(self): self.do_HEAD() + # DeleteObjects (/root?delete) def do_POST(self): self.do_HEAD() + # DeleteObject + def do_DELETE(self): + self.do_HEAD() + def do_HEAD(self): content_length = self.headers.get("Content-Length") data = self.rfile.read(int(content_length)) if content_length else None diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 253c140340d..aca815fef4b 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -6,6 +6,7 @@ import random import subprocess import threading import time +import uuid from random import randrange import pika @@ -18,6 +19,8 @@ from helpers.test_tools import TSV from . 
import rabbitmq_pb2 +DEFAULT_TIMEOUT_SEC = 60 + cluster = ClickHouseCluster(__file__) instance = cluster.add_instance( "instance", @@ -173,12 +176,18 @@ def test_rabbitmq_select(rabbitmq_cluster, secure): time.sleep(1) result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result += instance.query( "SELECT * FROM test.rabbitmq ORDER BY key", ignore_error=True ) if rabbitmq_check_result(result): break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) rabbitmq_check_result(result, True) @@ -244,13 +253,20 @@ def test_rabbitmq_json_without_delimiter(rabbitmq_cluster): time.sleep(1) result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result += instance.query( "SELECT * FROM test.rabbitmq ORDER BY key", ignore_error=True ) if rabbitmq_check_result(result): break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) + rabbitmq_check_result(result, True) @@ -287,13 +303,20 @@ def test_rabbitmq_csv_with_delimiter(rabbitmq_cluster): time.sleep(1) result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result += instance.query( "SELECT * FROM test.rabbitmq ORDER BY key", ignore_error=True ) if rabbitmq_check_result(result): break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) + rabbitmq_check_result(result, True) @@ -334,10 +357,16 @@ def test_rabbitmq_tsv_with_delimiter(rabbitmq_cluster): connection.close() result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT * FROM test.view ORDER BY key") if rabbitmq_check_result(result): break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) rabbitmq_check_result(result, True) @@ -372,12 +401,18 @@ def test_rabbitmq_macros(rabbitmq_cluster): time.sleep(1) result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result += instance.query( "SELECT * FROM test.rabbitmq ORDER BY key", ignore_error=True ) if rabbitmq_check_result(result): break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) rabbitmq_check_result(result, True) @@ -428,6 +463,11 @@ def test_rabbitmq_materialized_view(rabbitmq_cluster): result = instance.query("SELECT * FROM test.view ORDER BY key") if rabbitmq_check_result(result): break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {time_limit_sec} seconds reached. The result did not match the expected value." + ) rabbitmq_check_result(result, True) @@ -439,6 +479,10 @@ def test_rabbitmq_materialized_view(rabbitmq_cluster): if rabbitmq_check_result(result): break time.sleep(1) + else: + pytest.fail( + f"Time limit of {time_limit_sec} seconds reached. The result did not match the expected value." 
+ ) rabbitmq_check_result(result, True) connection.close() @@ -476,10 +520,17 @@ def test_rabbitmq_materialized_view_with_subquery(rabbitmq_cluster): for message in messages: channel.basic_publish(exchange="mvsq", routing_key="", body=message) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + + while time.monotonic() < deadline: result = instance.query("SELECT * FROM test.view ORDER BY key") if rabbitmq_check_result(result): break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) connection.close() rabbitmq_check_result(result, True) @@ -536,7 +587,7 @@ def test_rabbitmq_many_materialized_views(rabbitmq_cluster): channel.basic_publish(exchange="mmv", routing_key="", body=message) is_check_passed = False - deadline = time.monotonic() + 60 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC while time.monotonic() < deadline: result1 = instance.query("SELECT * FROM test.view1 ORDER BY key") result2 = instance.query("SELECT * FROM test.view2 ORDER BY key") @@ -610,10 +661,16 @@ def test_rabbitmq_big_message(rabbitmq_cluster): for message in messages: channel.basic_publish(exchange="big", routing_key="", body=message) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view") if int(result) == batch_messages * rabbitmq_messages: break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) connection.close() @@ -687,13 +744,18 @@ def test_rabbitmq_sharding_between_queues_publish(rabbitmq_cluster): thread.start() result1 = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + expected = messages_num * threads_num + while time.monotonic() < deadline: result1 = instance.query("SELECT count() FROM test.view") time.sleep(1) - expected = messages_num * threads_num if int(result1) == expected: break logging.debug(f"Result {result1} / {expected}") + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) result2 = instance.query("SELECT count(DISTINCT channel_id) FROM test.view") @@ -778,17 +840,24 @@ def test_rabbitmq_mv_combo(rabbitmq_cluster): time.sleep(random.uniform(0, 1)) thread.start() - while True: + # with threadsanitizer the speed of execution is about 8-13k rows per second. + # so consumption of 1 mln rows will require about 125 seconds + deadline = time.monotonic() + 180 + expected = messages_num * threads_num * NUM_MV + while time.monotonic() < deadline: result = 0 for mv_id in range(NUM_MV): result += int( instance.query("SELECT count() FROM test.combo_{0}".format(mv_id)) ) - expected = messages_num * threads_num * NUM_MV if int(result) == expected: break logging.debug(f"Result: {result} / {expected}") time.sleep(1) + else: + pytest.fail( + f"Time limit of 180 seconds reached. The result did not match the expected value." 
+ ) for thread in threads: thread.join() @@ -840,7 +909,8 @@ def test_rabbitmq_insert(rabbitmq_cluster): values.append("({i}, {i})".format(i=i)) values = ",".join(values) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: try: instance.query("INSERT INTO test.rabbitmq VALUES {}".format(values)) break @@ -849,6 +919,10 @@ def test_rabbitmq_insert(rabbitmq_cluster): continue else: raise + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The query could not be executed successfully." + ) insert_messages = [] @@ -903,7 +977,8 @@ def test_rabbitmq_insert_headers_exchange(rabbitmq_cluster): values.append("({i}, {i})".format(i=i)) values = ",".join(values) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: try: instance.query("INSERT INTO test.rabbitmq VALUES {}".format(values)) break @@ -912,6 +987,10 @@ def test_rabbitmq_insert_headers_exchange(rabbitmq_cluster): continue else: raise + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The query could not be executed successfully." + ) insert_messages = [] @@ -966,7 +1045,8 @@ def test_rabbitmq_many_inserts(rabbitmq_cluster): values = ",".join(values) def insert(): - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: try: instance.query( "INSERT INTO test.rabbitmq_many VALUES {}".format(values) @@ -977,6 +1057,10 @@ def test_rabbitmq_many_inserts(rabbitmq_cluster): continue else: raise + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The query could not be executed successfully." + ) threads = [] threads_num = 10 @@ -999,12 +1083,17 @@ def test_rabbitmq_many_inserts(rabbitmq_cluster): for thread in threads: thread.join() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view_many") logging.debug(result, messages_num * threads_num) if int(result) == messages_num * threads_num: break time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) instance.query( """ @@ -1064,7 +1153,8 @@ def test_rabbitmq_overloaded_insert(rabbitmq_cluster): values.append("({i}, {i})".format(i=i)) values = ",".join(values) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: try: instance.query( "INSERT INTO test.rabbitmq_overload VALUES {}".format(values) @@ -1075,6 +1165,10 @@ def test_rabbitmq_overloaded_insert(rabbitmq_cluster): continue else: raise + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The query could not be executed successfully." + ) threads = [] threads_num = 2 @@ -1087,13 +1181,18 @@ def test_rabbitmq_overloaded_insert(rabbitmq_cluster): for thread in threads: thread.join() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view_overload") expected = messages_num * threads_num if int(result) == expected: break logging.debug(f"Result: {result} / {expected}") time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." 
+ ) instance.query( """ @@ -1177,11 +1276,16 @@ def test_rabbitmq_direct_exchange(rabbitmq_cluster): connection.close() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.destination") time.sleep(1) if int(result) == messages_num * num_tables: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) for consumer_id in range(num_tables): instance.query( @@ -1265,11 +1369,16 @@ def test_rabbitmq_fanout_exchange(rabbitmq_cluster): connection.close() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.destination") time.sleep(1) if int(result) == messages_num * num_tables: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) for consumer_id in range(num_tables): instance.query( @@ -1389,11 +1498,16 @@ def test_rabbitmq_topic_exchange(rabbitmq_cluster): connection.close() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.destination") time.sleep(1) if int(result) == messages_num * num_tables + messages_num * num_tables: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) for consumer_id in range(num_tables * 2): instance.query( @@ -1486,11 +1600,16 @@ def test_rabbitmq_hash_exchange(rabbitmq_cluster): thread.start() result1 = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result1 = instance.query("SELECT count() FROM test.destination") time.sleep(1) if int(result1) == messages_num * threads_num: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) result2 = instance.query("SELECT count(DISTINCT channel_id) FROM test.destination") @@ -1586,11 +1705,16 @@ def test_rabbitmq_multiple_bindings(rabbitmq_cluster): time.sleep(random.uniform(0, 1)) thread.start() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.destination") time.sleep(1) if int(result) == messages_num * threads_num * 5: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) for thread in threads: thread.join() @@ -1699,11 +1823,16 @@ def test_rabbitmq_headers_exchange(rabbitmq_cluster): connection.close() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.destination") time.sleep(1) if int(result) == messages_num * num_tables_to_receive: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." 
+ ) for consumer_id in range(num_tables_to_receive + num_tables_to_ignore): instance.query( @@ -1758,11 +1887,16 @@ def test_rabbitmq_virtual_columns(rabbitmq_cluster): for message in messages: channel.basic_publish(exchange="virtuals", routing_key="", body=message) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view") time.sleep(1) if int(result) == message_num: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) connection.close() @@ -1832,11 +1966,16 @@ def test_rabbitmq_virtual_columns_with_materialized_view(rabbitmq_cluster): for message in messages: channel.basic_publish(exchange="virtuals_mv", routing_key="", body=message) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view") time.sleep(1) if int(result) == message_num: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) connection.close() @@ -1937,11 +2076,16 @@ def test_rabbitmq_many_consumers_to_each_queue(rabbitmq_cluster): thread.start() result1 = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result1 = instance.query("SELECT count() FROM test.destination") time.sleep(1) if int(result1) == messages_num * threads_num: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) result2 = instance.query("SELECT count(DISTINCT channel_id) FROM test.destination") @@ -2014,7 +2158,8 @@ def test_rabbitmq_restore_failed_connection_without_losses_1(rabbitmq_cluster): values.append("({i}, {i})".format(i=i)) values = ",".join(values) - while True: + deadline = time.monotonic() + 180 + while time.monotonic() < deadline: try: instance.query( "INSERT INTO test.producer_reconnect VALUES {}".format(values) @@ -2025,9 +2170,18 @@ def test_rabbitmq_restore_failed_connection_without_losses_1(rabbitmq_cluster): continue else: raise + else: + pytest.fail( + f"Time limit of 180 seconds reached. The query could not be executed successfully." + ) - while int(instance.query("SELECT count() FROM test.view")) == 0: + deadline = time.monotonic() + 180 + while time.monotonic() < deadline: + if int(instance.query("SELECT count() FROM test.view")) != 0: + break time.sleep(0.1) + else: + pytest.fail(f"Time limit of 180 seconds reached. The count is still 0.") kill_rabbitmq(rabbitmq_cluster.rabbitmq_docker_id) time.sleep(4) @@ -2035,11 +2189,16 @@ def test_rabbitmq_restore_failed_connection_without_losses_1(rabbitmq_cluster): rabbitmq_cluster.rabbitmq_docker_id, rabbitmq_cluster.rabbitmq_cookie ) - while True: + deadline = time.monotonic() + 180 + while time.monotonic() < deadline: result = instance.query("SELECT count(DISTINCT key) FROM test.view") time.sleep(1) if int(result) == messages_num: break + else: + pytest.fail( + f"Time limit of 180 seconds reached. The result did not match the expected value." 
+ ) instance.query( """ @@ -2102,9 +2261,14 @@ def test_rabbitmq_restore_failed_connection_without_losses_2(rabbitmq_cluster): """ ) - while int(instance.query("SELECT count() FROM test.view")) == 0: + deadline = time.monotonic() + 180 + while time.monotonic() < deadline: + if int(instance.query("SELECT count() FROM test.view")) != 0: + break logging.debug(3) time.sleep(0.1) + else: + pytest.fail(f"Time limit of 180 seconds reached. The count is still 0.") kill_rabbitmq(rabbitmq_cluster.rabbitmq_docker_id) time.sleep(8) @@ -2119,12 +2283,17 @@ def test_rabbitmq_restore_failed_connection_without_losses_2(rabbitmq_cluster): # time.sleep(2) # revive_rabbitmq() - while True: + deadline = time.monotonic() + 180 + while time.monotonic() < deadline: result = instance.query("SELECT count(DISTINCT key) FROM test.view").strip() if int(result) == messages_num: break logging.debug(f"Result: {result} / {messages_num}") time.sleep(1) + else: + pytest.fail( + f"Time limit of 180 seconds reached. The result did not match the expected value." + ) instance.query( """ @@ -2182,27 +2351,49 @@ def test_rabbitmq_commit_on_block_write(rabbitmq_cluster): rabbitmq_thread = threading.Thread(target=produce) rabbitmq_thread.start() - while int(instance.query("SELECT count() FROM test.view")) == 0: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: + if int(instance.query("SELECT count() FROM test.view")) != 0: + break time.sleep(1) + else: + cancel.set() + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The count is still 0." + ) cancel.set() instance.query("DETACH TABLE test.rabbitmq;") - while ( - int( - instance.query( - "SELECT count() FROM system.tables WHERE database='test' AND name='rabbitmq'" + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: + if ( + int( + instance.query( + "SELECT count() FROM system.tables WHERE database='test' AND name='rabbitmq'" + ) ) - ) - == 1 - ): + != 1 + ): + break time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The table 'rabbitmq' still exists." + ) instance.query("ATTACH TABLE test.rabbitmq;") - while int(instance.query("SELECT uniqExact(key) FROM test.view")) < i[0]: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: + if int(instance.query("SELECT uniqExact(key) FROM test.view")) >= i[0]: + break time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The uniqExact(key) is still less than {i[0]}." + ) result = int(instance.query("SELECT count() == uniqExact(key) FROM test.view")) @@ -2276,11 +2467,16 @@ def test_rabbitmq_no_connection_at_startup_2(rabbitmq_cluster): ) connection.close() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view") time.sleep(1) if int(result) == messages_num: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." 
+ ) instance.query( """ @@ -2324,10 +2520,16 @@ def test_rabbitmq_format_factory_settings(rabbitmq_cluster): channel.basic_publish(exchange="format_settings", routing_key="", body=message) result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT date FROM test.format_settings") if result == expected: break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) instance.query( """ @@ -2340,11 +2542,18 @@ def test_rabbitmq_format_factory_settings(rabbitmq_cluster): ) channel.basic_publish(exchange="format_settings", routing_key="", body=message) + result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT date FROM test.view") if result == expected: break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." + ) connection.close() instance.query( @@ -2380,12 +2589,18 @@ def test_rabbitmq_vhost(rabbitmq_cluster): exchange="vhost", routing_key="", body=json.dumps({"key": 1, "value": 2}) ) connection.close() - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query( "SELECT * FROM test.rabbitmq_vhost ORDER BY key", ignore_error=True ) if result == "1\t2\n": break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value '1\\t2\\n'." + ) def test_rabbitmq_drop_table_properly(rabbitmq_cluster): @@ -2411,12 +2626,18 @@ def test_rabbitmq_drop_table_properly(rabbitmq_cluster): channel.basic_publish( exchange="drop", routing_key="", body=json.dumps({"key": 1, "value": 2}) ) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query( "SELECT * FROM test.rabbitmq_drop ORDER BY key", ignore_error=True ) if result == "1\t2\n": break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value '1\\t2\\n'." + ) exists = channel.queue_declare(queue="rabbit_queue_drop", passive=True) assert exists @@ -2472,11 +2693,16 @@ def test_rabbitmq_queue_settings(rabbitmq_cluster): time.sleep(5) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view", ignore_error=True) if int(result) == 10: break time.sleep(0.5) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value of 10." + ) instance.query("DROP TABLE test.rabbitmq_settings") @@ -2530,11 +2756,16 @@ def test_rabbitmq_queue_consume(rabbitmq_cluster): ) result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view") if int(result) == messages_num * threads_num: break time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value." 
+ ) for thread in threads: thread.join() @@ -2662,11 +2893,17 @@ def test_rabbitmq_issue_30691(rabbitmq_cluster): ), ) result = "" - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT * FROM test.rabbitmq_drop", ignore_error=True) logging.debug(result) if result != "": break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result is still empty." + ) assert ( result.strip() == """{"event_type": "purge", "as_src": 1234, "as_dst": 0, "as_path": "", "local_pref": 100, "med": 0, "peer_as_dst": 0, "ip_src": "", "ip_dst": "", "port_src": 443, "port_dst": 41930, "ip_proto": "tcp", "tos": 0, "stamp_inserted": "2021-10-26 15:20:00", "stamp_updated": "2021-10-26 15:23:14", "packets": 2, "bytes": 1216, "writer_id": "default_amqp/449206"}""" @@ -2712,13 +2949,19 @@ def test_rabbitmq_drop_mv(rabbitmq_cluster): exchange="mv", routing_key="", body=json.dumps({"key": i, "value": i}) ) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: res = instance.query("SELECT COUNT(*) FROM test.view") logging.debug(f"Current count (1): {res}") if int(res) == 20: break else: logging.debug(f"Number of rows in test.view: {res}") + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The row count did not reach 20." + ) instance.query("DROP VIEW test.consumer SYNC") for i in range(20, 40): @@ -2737,12 +2980,17 @@ def test_rabbitmq_drop_mv(rabbitmq_cluster): exchange="mv", routing_key="", body=json.dumps({"key": i, "value": i}) ) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("SELECT count() FROM test.view") logging.debug(f"Current count (2): {result}") if int(result) == 50: break time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The row count did not reach 50." + ) result = instance.query("SELECT * FROM test.view ORDER BY key") rabbitmq_check_result(result, True) @@ -2756,11 +3004,14 @@ def test_rabbitmq_drop_mv(rabbitmq_cluster): connection.close() count = 0 - start = time.time() - while time.time() - start < 30: + deadline = time.monotonic() + 30 + while time.monotonic() < deadline: count = int(instance.query("SELECT count() FROM test.drop_mv")) if count: break + time.sleep(0.05) + else: + pytest.fail(f"Time limit of 30 seconds reached. The count is still 0.") instance.query("DROP TABLE test.drop_mv") assert count > 0 @@ -2850,22 +3101,36 @@ def test_rabbitmq_predefined_configuration(rabbitmq_cluster): channel.basic_publish( exchange="named", routing_key="", body=json.dumps({"key": 1, "value": 2}) ) - while True: + + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query( "SELECT * FROM test.rabbitmq ORDER BY key", ignore_error=True ) if result == "1\t2\n": break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value '1\\t2\\n'." 
+ ) + instance.restart_clickhouse() channel.basic_publish( exchange="named", routing_key="", body=json.dumps({"key": 1, "value": 2}) ) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query( "SELECT * FROM test.rabbitmq ORDER BY key", ignore_error=True ) if result == "1\t2\n": break + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match the expected value '1\\t2\\n'." + ) def test_rabbitmq_msgpack(rabbitmq_cluster): @@ -2895,16 +3160,17 @@ def test_rabbitmq_msgpack(rabbitmq_cluster): ) result = "" - try_no = 0 - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance.query("select * from rabbit_in;") if result.strip() == "kek": break - else: - try_no = try_no + 1 - if try_no == 20: - break time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match 'kek'." + ) + assert result.strip() == "kek" instance.query("drop table rabbit_in sync") @@ -2937,16 +3203,17 @@ def test_rabbitmq_address(rabbitmq_cluster): ) result = "" - try_no = 0 - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: result = instance2.query("select * from rabbit_in;") if result.strip() == "kek": break - else: - try_no = try_no + 1 - if try_no == 20: - break time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The result did not match 'kek'." + ) + assert result.strip() == "kek" instance2.query("drop table rabbit_in sync") @@ -3059,13 +3326,17 @@ def test_max_rows_per_message(rabbitmq_cluster): == "\n0\t0\n10\t100\n20\t200\n\n\n30\t300\n40\t400\n\n" ) - attempt = 0 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC rows = 0 - while attempt < 100: + while time.monotonic() < deadline: rows = int(instance.query("SELECT count() FROM test.view")) if rows == num_rows: break - attempt += 1 + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The number of rows did not match {num_rows}." + ) assert rows == num_rows @@ -3151,13 +3422,17 @@ def test_row_based_formats(rabbitmq_cluster): assert insert_messages == 2 - attempt = 0 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC rows = 0 - while attempt < 100: + while time.monotonic() < deadline: rows = int(instance.query("SELECT count() FROM test.view")) if rows == num_rows: break - attempt += 1 + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The number of rows did not match {num_rows}." + ) assert rows == num_rows @@ -3297,13 +3572,17 @@ def test_block_based_formats_2(rabbitmq_cluster): assert insert_messages == 9 - attempt = 0 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC rows = 0 - while attempt < 100: + while time.monotonic() < deadline: rows = int(instance.query("SELECT count() FROM test.view")) if rows == num_rows: break - attempt += 1 + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The number of rows did not match {num_rows}." 
+ ) assert rows == num_rows @@ -3366,12 +3645,22 @@ def test_rabbitmq_flush_by_block_size(rabbitmq_cluster): produce_thread = threading.Thread(target=produce) produce_thread.start() - while 0 == int( - instance.query( - "SELECT count() FROM system.parts WHERE database = 'test' AND table = 'view' AND name = 'all_1_1_0'" - ) - ): + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: + if ( + int( + instance.query( + "SELECT count() FROM system.parts WHERE database = 'test' AND table = 'view' AND name = 'all_1_1_0'" + ) + ) + != 0 + ): + break time.sleep(0.5) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The part 'all_1_1_0' is still missing." + ) cancel.set() produce_thread.join() @@ -3451,12 +3740,13 @@ def test_rabbitmq_flush_by_time(rabbitmq_cluster): """ ) - while True: + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC + while time.monotonic() < deadline: time.sleep(0.2) - count = instance.query( + total_count = instance.query( "SELECT count() FROM system.parts WHERE database = 'test' AND table = 'view'" ) - logging.debug(f"kssenii total count: {count}") + logging.debug(f"kssenii total count: {total_count}") count = int( instance.query( "SELECT count() FROM system.parts WHERE database = 'test' AND table = 'view' AND name = 'all_1_1_0'" @@ -3465,6 +3755,10 @@ def test_rabbitmq_flush_by_time(rabbitmq_cluster): logging.debug(f"kssenii count: {count}") if count > 0: break + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The part 'all_1_1_0' is still missing." + ) time.sleep(12) result = instance.query("SELECT uniqExact(ts) FROM test.view") @@ -3542,13 +3836,17 @@ def test_rabbitmq_handle_error_mode_stream(rabbitmq_cluster): # The order of messages in select * from test.rabbitmq is not guaranteed, so sleep to collect everything in one select time.sleep(1) - attempt = 0 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC rows = 0 - while attempt < 500: + while time.monotonic() < deadline: rows = int(instance.query("SELECT count() FROM test.data")) if rows == num_rows: break - attempt += 1 + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The number of rows did not match {num_rows}." + ) assert rows == num_rows @@ -3560,13 +3858,17 @@ def test_rabbitmq_handle_error_mode_stream(rabbitmq_cluster): assert result == expected - attempt = 0 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC errors_count = 0 - while attempt < 500: + while time.monotonic() < deadline: errors_count = int(instance.query("SELECT count() FROM test.errors")) - if errors_count == num_rows: + if errors_count == num_rows / 2: break - attempt += 1 + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The number of errors ({errors_count}) did not match the expected {num_rows}." 
+ ) assert errors_count == num_rows / 2 @@ -3584,7 +3886,7 @@ def test_rabbitmq_handle_error_mode_stream(rabbitmq_cluster): def test_attach_broken_table(rabbitmq_cluster): instance.query( - "ATTACH TABLE rabbit_queue UUID '2d1cdf1a-f060-4a61-a7c9-5b59e59992c6' (`payload` String) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'nonexisting:5671', rabbitmq_format = 'JSONEachRow', rabbitmq_username = 'test', rabbitmq_password = 'test'" + f"ATTACH TABLE rabbit_queue UUID '{uuid.uuid4()}' (`payload` String) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'nonexisting:5671', rabbitmq_format = 'JSONEachRow', rabbitmq_username = 'test', rabbitmq_password = 'test'" ) error = instance.query_and_get_error("SELECT * FROM rabbit_queue") @@ -3658,13 +3960,17 @@ def test_rabbitmq_nack_failed_insert(rabbitmq_cluster): channel.basic_consume(queue_name, on_consume) channel.start_consuming() - attempt = 0 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC count = 0 - while attempt < 100: + while time.monotonic() < deadline: count = int(instance3.query("SELECT count() FROM test.view")) if count == num_rows: break - attempt += 1 + time.sleep(0.05) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The count did not match {num_rows}." + ) assert count == num_rows @@ -3743,14 +4049,17 @@ def test_rabbitmq_reject_broken_messages(rabbitmq_cluster): time.sleep(1) - attempt = 0 + deadline = time.monotonic() + DEFAULT_TIMEOUT_SEC rows = 0 - while attempt < 500: + while time.monotonic() < deadline: rows = int(instance.query("SELECT count() FROM test.data")) if rows == num_rows: break - attempt += 1 time.sleep(1) + else: + pytest.fail( + f"Time limit of {DEFAULT_TIMEOUT_SEC} seconds reached. The number of rows did not match {num_rows}." + ) assert rows == num_rows diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index b9aa0a6f9bd..0d820334e70 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -2046,3 +2046,50 @@ def test_bad_settings(started_cluster): assert False except Exception as e: assert "Ordered mode in cloud without either" in str(e) + + +def test_processing_threads(started_cluster): + node = started_cluster.instances["node1"] + + table_name = f"test_processing_threads_{uuid.uuid4().hex[:8]}" + dst_table_name = f"{table_name}_dst" + # A unique path is necessary for repeatable tests + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 10 + + create_table( + started_cluster, + node, + table_name, + "ordered", + files_path, + additional_settings={ + "keeper_path": keeper_path, + }, + ) + + assert '"processing_threads_num":16' in node.query( + f"SELECT * FROM system.zookeeper WHERE path = '{keeper_path}'" + ) + + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, start_ind=0, row_num=1 + ) + + create_mv(node, table_name, dst_table_name) + + def get_count(): + return int(node.query(f"SELECT count() FROM {dst_table_name}")) + + expected_rows = 10 + for _ in range(20): + if expected_rows == get_count(): + break + time.sleep(1) + + assert expected_rows == get_count() + + assert node.contains_in_log( + f"StorageS3Queue (default.{table_name}): Using 16 processing threads" + ) diff --git a/tests/queries/0_stateless/02337_base58.reference b/tests/queries/0_stateless/02337_base58.reference index a51eeec7bd1..90d433b5ac7 100644 --- 
a/tests/queries/0_stateless/02337_base58.reference +++ b/tests/queries/0_stateless/02337_base58.reference @@ -40,3 +40,15 @@ foobar 1 1 +32YCBjgZhV4AdCWHaCDNu +foobar +111 +bG7y +bQZu +bQbp +\0\0\0 +f\0\0 +fo\0 +foo +1 +1 diff --git a/tests/queries/0_stateless/02337_base58.sql b/tests/queries/0_stateless/02337_base58.sql index bc1b2c301e5..554492e8254 100644 --- a/tests/queries/0_stateless/02337_base58.sql +++ b/tests/queries/0_stateless/02337_base58.sql @@ -1,15 +1,25 @@ -- Tags: no-fasttest SELECT base58Encode('Hold my beer...'); + SELECT base58Encode('Hold my beer...', 'Second arg'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT base58Decode('Hold my beer...'); -- { serverError BAD_ARGUMENTS } -SELECT base58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (select arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val)); -SELECT tryBase58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (select arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val)); +SELECT base58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (SELECT arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val)); +SELECT tryBase58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (SELECT arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val)); SELECT tryBase58Decode(val) FROM (SELECT arrayJoin(['Hold my beer', 'Hold another beer', '3csAg9', 'And a wine', 'And another wine', 'And a lemonade', 't1Zv2yaZ', 'And another wine']) val); -SELECT base58Encode(val) FROM (select arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar']) val); -SELECT base58Decode(val) FROM (select arrayJoin(['', '2m', '8o8', 'bQbp', '3csAg9', 'CZJRhmz', 't1Zv2yaZ', '']) val); +SELECT base58Encode(val) FROM (SELECT arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar']) val); +SELECT base58Decode(val) FROM (SELECT arrayJoin(['', '2m', '8o8', 'bQbp', '3csAg9', 'CZJRhmz', 't1Zv2yaZ', '']) val); SELECT base58Encode(base58Decode('1BWutmTvYPwDtmw9abTkS4Ssr8no61spGAvW1X6NDix')) == '1BWutmTvYPwDtmw9abTkS4Ssr8no61spGAvW1X6NDix'; select base58Encode('\x00\x0b\xe3\xe1\xeb\xa1\x7a\x47\x3f\x89\xb0\xf7\xe8\xe2\x49\x40\xf2\x0a\xeb\x8e\xbc\xa7\x1a\x88\xfd\xe9\x5d\x4b\x83\xb7\x1a\x09') == '1BWutmTvYPwDtmw9abTkS4Ssr8no61spGAvW1X6NDix'; + +SELECT base58Encode(toFixedString('Hold my beer...', 15)); +SELECT base58Decode(toFixedString('t1Zv2yaZ', 8)); + +SELECT base58Encode(val) FROM (SELECT arrayJoin([toFixedString('', 3), toFixedString('f', 3), toFixedString('fo', 3), toFixedString('foo', 3)]) val); +SELECT base58Decode(val) FROM (SELECT arrayJoin([toFixedString('111', 3), toFixedString('bG7y', 4), toFixedString('bQZu', 4), toFixedString('bQbp', 4)]) val); + +Select base58Encode(reinterpretAsFixedString(byteSwap(toUInt256('256')))) == '1111111111111111111111111111115R'; +Select base58Encode(reinterpretAsString(byteSwap(toUInt256('256')))) == '1111111111111111111111111111112'; -- { reinterpretAsString drops the last null byte hence, encoded value is different than the FixedString version above } diff --git a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql index c4cf12ed995..fcbe9ee42b9 100644 --- a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql +++ b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql @@ -16,6 +16,14 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx 
vec TYPE vector_similar -- deterministic randomness. INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64 +-- hnsw_candidate_list_size_for_search = 0 is illegal +WITH [0.5, 0.5] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec) +LIMIT 3 +SETTINGS hnsw_candidate_list_size_for_search = 0; -- { serverError INVALID_SETTING_VALUE } + DROP TABLE IF EXISTS results; CREATE TABLE results(id Int32) ENGINE = Memory; diff --git a/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.reference b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.reference index de65b4fa268..219149f209c 100644 --- a/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.reference +++ b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.reference @@ -134,15 +134,30 @@ 3 Hello World not_found x Hello World 4 Hello World [eo] x Hxllo World 5 Hello World . x xello World -Check that whether an exception is thrown if the needle is empty +- should not throw an exception if the needle is empty +- non-const needle, const replacement +Hexxo Worxd +Hello World +Hexlo World +Hello World +Hexxo Worxd +Hello World +Hexlo World +Hello World +- const needle, non-const replacement +Hello World +Hello World +Hello World +Hello World +Hello World +Hello World +Hello World +Hello World +- non-const needle, non-const replacement Hexxo Worxd Hello World Hexlo World Hello World -Hello World -Hello World -Hello World -Hello World Hexxo Worxd Hello World Hexlo World diff --git a/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.sql b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.sql index bccd9f3e609..7fd79e3b7ff 100644 --- a/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.sql +++ b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.sql @@ -69,8 +69,7 @@ SELECT id, haystack, needle, replacement, replaceRegexpOne('Hello World', needle DROP TABLE IF EXISTS test_tab; - -SELECT 'Check that whether an exception is thrown if the needle is empty'; +SELECT '- should not throw an exception if the needle is empty'; CREATE TABLE test_tab (id UInt32, haystack String, needle String, replacement String) @@ -79,22 +78,22 @@ CREATE TABLE test_tab INSERT INTO test_tab VALUES (1, 'Hello World', 'l', 'x') (2, 'Hello World', '', 'y'); --- needle: non-const, replacement: const +SELECT '- non-const needle, const replacement'; SELECT replaceAll(haystack, needle, 'x') FROM test_tab; SELECT replaceOne(haystack, needle, 'x') FROM test_tab; -SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } -SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab; +SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab; --- needle: const, replacement: non-const +SELECT '- const needle, non-const replacement'; SELECT replaceAll(haystack, '', replacement) FROM test_tab; SELECT replaceOne(haystack, '', replacement) FROM test_tab; -SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } -SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab; -- { serverError 
ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab; +SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab; --- needle: non-const, replacement: non-const +SELECT '- non-const needle, non-const replacement'; SELECT replaceAll(haystack, needle, replacement) FROM test_tab; SELECT replaceOne(haystack, needle, replacement) FROM test_tab; -SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } -SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab; +SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab; DROP TABLE IF EXISTS test_tab; diff --git a/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.reference b/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.reference index 352e02941c3..2d8eff88ef3 100644 --- a/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.reference +++ b/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.reference @@ -64,3 +64,4 @@ SELECT * FROM t1n as t1 JOIN t2n as t2 ON (t1.x == t2.x AND ((t2.x IS NOT NULL) 0 2 2 +1 diff --git a/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.sql b/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.sql index e1e095a6eae..9189b3f4ab9 100644 --- a/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.sql +++ b/tests/queries/0_stateless/02911_join_on_nullsafe_optimization.sql @@ -67,6 +67,10 @@ SELECT count() FROM ( EXPLAIN QUERY TREE SELECT * FROM t1 JOIN t2 ON t1.x <=> t2.x AND (t1.x = t1.y OR t1.x IS NULL AND t1.y IS NULL) ) WHERE explain like '%CONSTANT%' OR explain ilike '%is%null%'; +SELECT count() FROM ( EXPLAIN QUERY TREE + SELECT * FROM t1 JOIN t2 ON t1.x = t2.x AND NOT (t1.x = 1 OR t1.x IS NULL) +) WHERE explain ilike '%function_name: isNull%'; + DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; DROP TABLE IF EXISTS t1n; diff --git a/tests/queries/0_stateless/03168_inconsistent_ast_formatting.reference b/tests/queries/0_stateless/03168_inconsistent_ast_formatting.reference index e69de29bb2d..36eb3e954ec 100644 --- a/tests/queries/0_stateless/03168_inconsistent_ast_formatting.reference +++ b/tests/queries/0_stateless/03168_inconsistent_ast_formatting.reference @@ -0,0 +1,38 @@ +-- +0 +SELECT NOT 1 +SELECT NOT 1 +-- +SELECT not(1, 1, 1) +SELECT not(1, 1, 1) +-- +SELECT NOT tuple(1) +SELECT NOT tuple(1) +SELECT NOT tuple(1) +-- +SELECT NOT ((1, 1, 1)) +SELECT NOT ((1, 1, 1)) +SELECT NOT ((1, 1, 1)) +SELECT NOT ((1, 1, 1)) +-- +SELECT NOT [1] +SELECT NOT [1] +SELECT NOT [1] +SELECT NOT [1] +SELECT NOT [1] +-- +SELECT -[1] +SELECT -[1] +SELECT -[1] +SELECT -[1] +SELECT -[1] +-- +(-1,-1,-1) +SELECT -(1, 1, 1) +SELECT -(1, 1, 1) +SELECT -(1, 1, 1) +SELECT -(1, 1, 1) +-- +((-1,-1,-1)) +SELECT -tuple((1, 1, 1)) +SELECT -tuple((1, 1, 1)) diff --git a/tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql b/tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql index 5333ea29ce7..ace538a90b7 100644 --- a/tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql +++ b/tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql @@ -5,3 +5,57 @@ create table a (`value2` Enum8('Hello' = 1, equals(`Null`, 'World', 2), '!' 
= 3) create table a (x Int8) engine Memory; create table b empty as a; + +SELECT '--'; +SELECT NOT (1); +SELECT formatQuery('SELECT NOT 1'); +SELECT formatQuery('SELECT NOT (1)'); + +SELECT '--'; +SELECT NOT (1, 1, 1); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT formatQuery('SELECT NOT (1, 1, 1)'); +SELECT formatQuery('SELECT not(1, 1, 1)'); + +SELECT '--'; +SELECT NOT ((1,)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT NOT tuple(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT formatQuery('SELECT NOT ((1,))'); +SELECT formatQuery('SELECT NOT (tuple(1))'); +SELECT formatQuery('SELECT NOT tuple(1)'); + +SELECT '--'; +SELECT NOT ((1, 1, 1)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT formatQuery('SELECT NOT ((1, 1, 1))'); +SELECT formatQuery('SELECT not((1, 1, 1))'); +SELECT formatQuery('SELECT not tuple(1, 1, 1)'); +SELECT formatQuery('SELECT not (tuple(1, 1, 1))'); + +SELECT '--'; +SELECT NOT [1]; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT NOT [(1)]; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT formatQuery('SELECT NOT [1]'); +SELECT formatQuery('SELECT NOT array(1)'); +SELECT formatQuery('SELECT NOT (array(1))'); +SELECT formatQuery('SELECT NOT [(1)]'); +SELECT formatQuery('SELECT NOT ([1])'); + +SELECT '--'; +SELECT -[1]; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT -[(1)]; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT formatQuery('SELECT -[1]'); +SELECT formatQuery('SELECT -array(1)'); +SELECT formatQuery('SELECT -(array(1))'); +SELECT formatQuery('SELECT -[(1)]'); +SELECT formatQuery('SELECT -([1])'); + +SELECT '--'; +SELECT -(1, 1, 1); +SELECT formatQuery('SELECT -(1, 1, 1)'); +SELECT formatQuery('SELECT negate ((1, 1, 1))'); +SELECT formatQuery('SELECT -tuple(1, 1, 1)'); +SELECT formatQuery('SELECT -(tuple(1, 1, 1))'); + +SELECT '--'; +SELECT -tuple((1, 1, 1)); +SELECT formatQuery('SELECT -((1, 1, 1))'); +SELECT formatQuery('SELECT -tuple((1, 1, 1))'); diff --git a/tests/queries/0_stateless/03215_varian_as_common_type_tuple_map.sql b/tests/queries/0_stateless/03215_varian_as_common_type_tuple_map.sql index 4a9a788ab18..3981baf280f 100644 --- a/tests/queries/0_stateless/03215_varian_as_common_type_tuple_map.sql +++ b/tests/queries/0_stateless/03215_varian_as_common_type_tuple_map.sql @@ -1,5 +1,6 @@ set use_variant_as_common_type = 1; set allow_experimental_variant_type = 1; +SET enable_named_columns_in_function_tuple=1; SELECT if(number % 2, tuple(number), tuple(toString(number))) as res, toTypeName(res) FROM numbers(5); SELECT if(number % 2, map(number, number), map(toString(number), toString(number))) as res, toTypeName(res) FROM numbers(5); diff --git a/tests/queries/0_stateless/03224_tuple_element_identifier.sql b/tests/queries/0_stateless/03224_tuple_element_identifier.sql index 2a7fb9a97a3..6c47e69a77e 100644 --- a/tests/queries/0_stateless/03224_tuple_element_identifier.sql +++ b/tests/queries/0_stateless/03224_tuple_element_identifier.sql @@ -1,4 +1,5 @@ SET enable_analyzer = 1; +SET enable_named_columns_in_function_tuple=1; SELECT JSONExtract('{"hello":[{"world":"wtf"}]}', 'Tuple(hello Array(Tuple(world String)))') AS x, x.hello, x.hello[1].world; diff --git a/tests/queries/0_stateless/03247_pr_local_plan_non_constant_in_source.reference b/tests/queries/0_stateless/03247_pr_local_plan_non_constant_in_source.reference new file mode 100644 index 00000000000..a12f205d5b5 --- /dev/null +++ b/tests/queries/0_stateless/03247_pr_local_plan_non_constant_in_source.reference @@ -0,0 +1,5 @@ 
+4c36abda-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1 +4c408902-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1 +4c5bf20a-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1 +4c61623a-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1 +4c6efab2-8bd8-11eb-a952-005056aa8bf6 2021-03-24 01:04:27 1 diff --git a/tests/queries/0_stateless/03247_pr_local_plan_non_constant_in_source.sql b/tests/queries/0_stateless/03247_pr_local_plan_non_constant_in_source.sql new file mode 100644 index 00000000000..bf0f18cafa2 --- /dev/null +++ b/tests/queries/0_stateless/03247_pr_local_plan_non_constant_in_source.sql @@ -0,0 +1,18 @@ +DROP TABLE IF EXISTS table_3 SYNC; + +CREATE TABLE table_3 (uid UUID, date DateTime('Asia/Kamchatka')) ENGINE = ReplicatedMergeTree('/pr_local_plan/{database}/table_3', 'r1') ORDER BY date; + +INSERT INTO table_3 VALUES ('4c36abda-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c408902-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c5bf20a-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c61623a-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c6efab2-8bd8-11eb-a952-005056aa8bf6', '2021-03-24 01:04:27'); + +SELECT + uid, + date, + toDate(date) = toDate('2021-03-24') AS res +FROM table_3 +WHERE res = 1 +ORDER BY + uid ASC, + date ASC +SETTINGS enable_parallel_replicas = 1, max_parallel_replicas = 3, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_local_plan = 1; + +DROP TABLE table_3 SYNC; diff --git a/utils/check-style/check-settings-style b/utils/check-style/check-settings-style new file mode 100755 index 00000000000..460277b0277 --- /dev/null +++ b/utils/check-style/check-settings-style @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +# Fast check of all the setting struct usages +# The linker does not complain about incorrect extern usage, so we need to make sure the style checker handles + +LC_ALL="en_US.UTF-8" +ROOT_PATH=$(git rev-parse --show-toplevel) + +# Duplicated or incorrect setting declarations +SETTINGS_FILE=$(mktemp) +ALL_DECLARATION_FILES=" + $ROOT_PATH/src/Core/Settings.cpp + $ROOT_PATH/src/Core/ServerSettings.cpp + $ROOT_PATH/src/Storages/MergeTree/MergeTreeSettings.cpp + $ROOT_PATH/src/Coordination/CoordinationSettings.cpp + $ROOT_PATH/src/Databases/DatabaseReplicatedSettings.cpp + $ROOT_PATH/src/Storages/TimeSeries/TimeSeriesSettings.cpp + $ROOT_PATH/src/Storages/RocksDB/RocksDBSettings.cpp + $ROOT_PATH/src/Storages/RabbitMQ/RabbitMQSettings.cpp + $ROOT_PATH/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.cpp + $ROOT_PATH/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.cpp + $ROOT_PATH/src/Storages/MaterializedView/RefreshSettings.cpp + $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h" + +for settings_file in ${ALL_DECLARATION_FILES}; +do + if ! [ -f "${settings_file}" ]; then + echo "File '${settings_file}' does not exist." 
+ fi +done + +cat $ROOT_PATH/src/Core/Settings.cpp $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " Settings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq > ${SETTINGS_FILE} +cat $ROOT_PATH/src/Core/ServerSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " ServerSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/MergeTree/MergeTreeSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " MergeTreeSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Coordination/CoordinationSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " CoordinationSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Databases/DatabaseReplicatedSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " DatabaseReplicatedSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/TimeSeries/TimeSeriesSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " TimeSeriesSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/RocksDB/RocksDBSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " RocksDBSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/RabbitMQ/RabbitMQSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " RabbitMQSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " MaterializedPostgreSQLSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/ObjectStorageQueue/ObjectStorageQueueSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " ObjectStorageQueueSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/MaterializedView/RefreshSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " RefreshSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} + + +# Check that if there are duplicated settings (declared in different objects) they all have the same type (it's simpler to validate style with that assert) +for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | \ + sed -e 's/CoordinationSettings//g' \ + -e 's/DatabaseReplicatedSettings//g' \ + -e 's/TimeSeriesSettings//g' \ + -e 's/RabbitMQSettings//g' \ + -e 's/RocksDBSettings//g' \ + -e 's/MaterializedPostgreSQLSettings//g' \ + -e 's/ObjectStorageQueueSettings//g' \ + -e 's/RefreshSettings//g' \ + -e 's/MergeTreeSettings//g' \ + -e 's/ServerSettings//g' \ + -e 's/Settings//g' | \ + sort | uniq | awk '{ print $1 }' | uniq -d); +do + echo "# Found multiple definitions of setting ${setting} with different types: " + grep --line-number " ${setting}," ${ALL_DECLARATION_FILES} | awk '{print " > " $0 }' +done + +# We append all uses of extern found in implementation files to validate them in a single pass and avoid reading the same files 
over and over +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | \ + xargs grep -e "^\s*extern const Settings" \ + -e "^\s**extern const ServerSettings" \ + -e "^\s**extern const MergeTreeSettings" \ + -e "^\s**extern const RabbitMQSettings" \ + -e "^\s**extern const RocksDBSettings" \ + -e "^\s**extern const MaterializedPostgreSQLSettings" \ + -e "^\s**extern const ObjectStorageQueueSettings" \ + -e "^\s**extern const RefreshSettings" \ + -e "^\s**extern const TimeSeriesSettings" \ + -e "^\s**extern const DatabaseReplicatedSettings" \ + -e "^\s**extern const CoordinationSettings" -T | \ + awk '{print substr($5, 0, length($5) -1) " " $4 " " substr($1, 0, length($1) - 1)}' >> ${SETTINGS_FILE} + +# Duplicate extern declarations for settings +awk '{if (seen[$0]++) print $3 " -> " $1 ;}' ${SETTINGS_FILE} | while read line; +do + echo "# Found duplicated setting declaration in: $line" +done + +# Find missing declarations (obsolete settings being used) +# Note that SettingsDeclaration are first in the file +# Disabled for now pending fixing the code +#awk '{print $1 " " $3}' ${SETTINGS_FILE} | awk '{if (!seen[$1]++) print $0}' | grep -v SettingsDeclaration | while read setting; +#do +# echo "Could not find setting (maybe obsolete but used?) $setting" +#done + +# Look for settings declared with multiple types +for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | \ + sed -e 's/MergeTreeSettings//g' \ + -e 's/ServerSettings//g' \ + -e 's/CoordinationSettings//g' \ + -e 's/TimeSeriesSettings//g' \ + -e 's/RabbitMQSettings//g' \ + -e 's/RefreshSettings//g' \ + -e 's/RocksDBSettings//g' \ + -e 's/MaterializedPostgreSQLSettings//g' \ + -e 's/ObjectStorageQueueSettings//g' \ + -e 's/DatabaseReplicatedSettings//g' \ + -e 's/Settings//g' | \ + sort | uniq | awk '{ print $1 }' | sort | uniq -d); +do + expected=$(grep "^$setting " ${SETTINGS_FILE} | grep SettingsDeclaration | awk '{ print $2 }') + grep "^$setting " ${SETTINGS_FILE} | grep -v " $expected" | awk '{ print $3 " found setting " $1 " with type " $2 }' | while read line; + do + echo "# In $line but it should be ${expected/$'\n'/ }" + done +done + +rm ${SETTINGS_FILE} diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 52a1ad6ab75..9314efa6f90 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -53,69 +53,7 @@ find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/n find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symlinks found" # Duplicated or incorrect setting declarations -SETTINGS_FILE=$(mktemp) -ALL_DECLARATION_FILES=" - $ROOT_PATH/src/Core/Settings.cpp - $ROOT_PATH/src/Core/ServerSettings.cpp - $ROOT_PATH/src/Storages/MergeTree/MergeTreeSettings.cpp - $ROOT_PATH/src/Coordination/CoordinationSettings.cpp - $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h" - -cat $ROOT_PATH/src/Core/Settings.cpp $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " Settings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq > ${SETTINGS_FILE} -cat $ROOT_PATH/src/Core/ServerSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " ServerSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} -cat $ROOT_PATH/src/Storages/MergeTree/MergeTreeSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " MergeTreeSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' 
| sort | uniq >> ${SETTINGS_FILE} -cat $ROOT_PATH/src/Coordination/CoordinationSettings.cpp | grep " M(" | awk '{print substr($2, 0, length($2) - 1) " CoordinationSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} - -# Check that if there are duplicated settings (declared in different objects) they all have the same type (it's simpler to validate style with that assert) -for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | \ - sed -e 's/CoordinationSettings//g' \ - -e 's/MergeTreeSettings//g' \ - -e 's/ServerSettings//g' \ - -e 's/Settings//g' | \ - sort | uniq | awk '{ print $1 }' | uniq -d); -do - echo "# Found multiple definitions of setting ${setting} with different types: " - grep --line-number " ${setting}," ${ALL_DECLARATION_FILES} | awk '{print " > " $0 }' -done - -# We append all uses of extern found in implementation files to validate them in a single pass and avoid reading the same files over and over -find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | \ - xargs grep -e "^\s*extern const Settings" \ - -e "^\s**extern const ServerSettings" \ - -e "^\s**extern const MergeTreeSettings" \ - -e "^\s**extern const CoordinationSettings" -T | \ - awk '{print substr($5, 0, length($5) -1) " " $4 " " substr($1, 0, length($1) - 1)}' >> ${SETTINGS_FILE} - -# Duplicate extern declarations for settings -awk '{if (seen[$0]++) print $3 " -> " $1 ;}' ${SETTINGS_FILE} | while read line; -do - echo "# Found duplicated setting declaration in: $line" -done - -# Find missing declarations (obsolete settings being used) -# Note that SettingsDeclaration are first in the file -# Disabled for now pending fixing the code -#awk '{print $1 " " $3}' ${SETTINGS_FILE} | awk '{if (!seen[$1]++) print $0}' | grep -v SettingsDeclaration | while read setting; -#do -# echo "Could not find setting (maybe obsolete but used?) $setting" -#done - -# Look for settings declared with multiple types -for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | \ - sed -e 's/MergeTreeSettings//g' \ - -e 's/ServerSettings//g' \ - -e 's/CoordinationSettings//g' \ - -e 's/Settings//g' | \ - sort | uniq | awk '{ print $1 }' | sort | uniq -d); -do - expected=$(grep "^$setting " ${SETTINGS_FILE} | grep SettingsDeclaration | awk '{ print $2 }') - grep "^$setting " ${SETTINGS_FILE} | grep -v " $expected" | awk '{ print $3 " found setting " $1 " with type " $2 }' | while read line; - do - echo "# In $line but it should be ${expected/$'\n'/ }" - done -done - -rm ${SETTINGS_FILE} +bash $ROOT_PATH/utils/check-style/check-settings-style # Unused/Undefined/Duplicates ErrorCodes/ProfileEvents/CurrentMetrics declare -A EXTERN_TYPES