mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-15 12:14:18 +00:00
Merge pull request #70946 from rschu1ze/tpc-h-decor
Docs: Add notes about TPC-H query correctness + decorrelated queries
This commit is contained in:
commit
0c6bd36ef3
@ -155,10 +155,10 @@ The queries are generated by `./qgen -s <scaling_factor>`. Example queries for `
|
||||
|
||||
## Queries
|
||||
|
||||
::::warning
|
||||
TPC-H makes heavy use of correlated subqueries which are at the time of writing (October 2024) not supported by ClickHouse ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)).
|
||||
As a result, many of below benchmark queries will fail with errors.
|
||||
::::
|
||||
**Correctness**
|
||||
|
||||
The result of the queries agrees with the official results unless mentioned otherwise. To verify, generate a TPC-H database with scale
|
||||
factor = 1 (`dbgen`, see above) and compare with the [expected results in tpch-kit](https://github.com/gregrahn/tpch-kit/tree/master/dbgen/answers).
|
||||
|
||||
**Q1**
|
||||
|
||||
@ -177,7 +177,7 @@ SELECT
|
||||
FROM
|
||||
lineitem
|
||||
WHERE
|
||||
l_shipdate <= date '1998-12-01' - INTERVAL '90' DAY
|
||||
l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
|
||||
GROUP BY
|
||||
l_returnflag,
|
||||
l_linestatus
|
||||
@ -234,6 +234,62 @@ ORDER BY
|
||||
p_partkey;
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
|
||||
|
||||
This alternative formulation works and was verified to return the reference results.
|
||||
|
||||
```sql
|
||||
WITH MinSupplyCost AS (
|
||||
SELECT
|
||||
ps_partkey,
|
||||
MIN(ps_supplycost) AS min_supplycost
|
||||
FROM
|
||||
partsupp ps
|
||||
JOIN
|
||||
supplier s ON ps.ps_suppkey = s.s_suppkey
|
||||
JOIN
|
||||
nation n ON s.s_nationkey = n.n_nationkey
|
||||
JOIN
|
||||
region r ON n.n_regionkey = r.r_regionkey
|
||||
WHERE
|
||||
r.r_name = 'EUROPE'
|
||||
GROUP BY
|
||||
ps_partkey
|
||||
)
|
||||
SELECT
|
||||
s.s_acctbal,
|
||||
s.s_name,
|
||||
n.n_name,
|
||||
p.p_partkey,
|
||||
p.p_mfgr,
|
||||
s.s_address,
|
||||
s.s_phone,
|
||||
s.s_comment
|
||||
FROM
|
||||
part p
|
||||
JOIN
|
||||
partsupp ps ON p.p_partkey = ps.ps_partkey
|
||||
JOIN
|
||||
supplier s ON s.s_suppkey = ps.ps_suppkey
|
||||
JOIN
|
||||
nation n ON s.s_nationkey = n.n_nationkey
|
||||
JOIN
|
||||
region r ON n.n_regionkey = r.r_regionkey
|
||||
JOIN
|
||||
MinSupplyCost msc ON ps.ps_partkey = msc.ps_partkey AND ps.ps_supplycost = msc.min_supplycost
|
||||
WHERE
|
||||
p.p_size = 15
|
||||
AND p.p_type LIKE '%BRASS'
|
||||
AND r.r_name = 'EUROPE'
|
||||
ORDER BY
|
||||
s.s_acctbal DESC,
|
||||
n.n_name,
|
||||
s.s_name,
|
||||
p.p_partkey;
|
||||
```
|
||||
::::
|
||||
|
||||
**Q3**
|
||||
|
||||
```sql
|
||||
@ -250,8 +306,8 @@ WHERE
|
||||
c_mktsegment = 'BUILDING'
|
||||
AND c_custkey = o_custkey
|
||||
AND l_orderkey = o_orderkey
|
||||
AND o_orderdate < date '1995-03-15'
|
||||
AND l_shipdate > date '1995-03-15'
|
||||
AND o_orderdate < DATE '1995-03-15'
|
||||
AND l_shipdate > DATE '1995-03-15'
|
||||
GROUP BY
|
||||
l_orderkey,
|
||||
o_orderdate,
|
||||
@ -270,8 +326,8 @@ SELECT
|
||||
FROM
|
||||
orders
|
||||
WHERE
|
||||
o_orderdate >= date '1993-07-01'
|
||||
AND o_orderdate < date '1993-07-01' + INTERVAL '3' MONTH
|
||||
o_orderdate >= DATE '1993-07-01'
|
||||
AND o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH
|
||||
AND EXISTS (
|
||||
SELECT
|
||||
*
|
||||
@ -287,6 +343,39 @@ ORDER BY
|
||||
o_orderpriority;
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
|
||||
|
||||
This alternative formulation works and was verified to return the reference results.
|
||||
|
||||
```sql
|
||||
WITH ValidLineItems AS (
|
||||
SELECT
|
||||
l_orderkey
|
||||
FROM
|
||||
lineitem
|
||||
WHERE
|
||||
l_commitdate < l_receiptdate
|
||||
GROUP BY
|
||||
l_orderkey
|
||||
)
|
||||
SELECT
|
||||
o.o_orderpriority,
|
||||
COUNT(*) AS order_count
|
||||
FROM
|
||||
orders o
|
||||
JOIN
|
||||
ValidLineItems vli ON o.o_orderkey = vli.l_orderkey
|
||||
WHERE
|
||||
o.o_orderdate >= DATE '1993-07-01'
|
||||
AND o.o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH
|
||||
GROUP BY
|
||||
o.o_orderpriority
|
||||
ORDER BY
|
||||
o.o_orderpriority;
|
||||
```
|
||||
::::
|
||||
|
||||
**Q5**
|
||||
|
||||
```sql
|
||||
@ -308,8 +397,8 @@ WHERE
|
||||
AND s_nationkey = n_nationkey
|
||||
AND n_regionkey = r_regionkey
|
||||
AND r_name = 'ASIA'
|
||||
AND o_orderdate >= date '1994-01-01'
|
||||
AND o_orderdate < date '1994-01-01' + INTERVAL '1' year
|
||||
AND o_orderdate >= DATE '1994-01-01'
|
||||
AND o_orderdate < DATE '1994-01-01' + INTERVAL '1' year
|
||||
GROUP BY
|
||||
n_name
|
||||
ORDER BY
|
||||
@ -324,12 +413,30 @@ SELECT
|
||||
FROM
|
||||
lineitem
|
||||
WHERE
|
||||
l_shipdate >= date '1994-01-01'
|
||||
AND l_shipdate < date '1994-01-01' + INTERVAL '1' year
|
||||
l_shipdate >= DATE '1994-01-01'
|
||||
AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year
|
||||
AND l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01
|
||||
AND l_quantity < 24;
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the query does not work out-of-the box due to a bug with Decimal addition. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70136
|
||||
|
||||
This alternative formulation works and was verified to return the reference results.
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
sum(l_extendedprice * l_discount) AS revenue
|
||||
FROM
|
||||
lineitem
|
||||
WHERE
|
||||
l_shipdate >= DATE '1994-01-01'
|
||||
AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year
|
||||
AND l_discount BETWEEN 0.05 AND 0.07
|
||||
AND l_quantity < 24;
|
||||
```
|
||||
::::
|
||||
|
||||
**Q7**
|
||||
|
||||
```sql
|
||||
@ -361,7 +468,7 @@ FROM (
|
||||
(n1.n_name = 'FRANCE' AND n2.n_name = 'GERMANY')
|
||||
OR (n1.n_name = 'GERMANY' AND n2.n_name = 'FRANCE')
|
||||
)
|
||||
AND l_shipdate BETWEEN date '1995-01-01' AND date '1996-12-31'
|
||||
AND l_shipdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31'
|
||||
) AS shipping
|
||||
GROUP BY
|
||||
supp_nation,
|
||||
@ -406,7 +513,7 @@ FROM (
|
||||
AND n1.n_regionkey = r_regionkey
|
||||
AND r_name = 'AMERICA'
|
||||
AND s_nationkey = n2.n_nationkey
|
||||
AND o_orderdate BETWEEN date '1995-01-01' AND date '1996-12-31'
|
||||
AND o_orderdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31'
|
||||
AND p_type = 'ECONOMY ANODIZED STEEL'
|
||||
) AS all_nations
|
||||
GROUP BY
|
||||
@ -471,8 +578,8 @@ FROM
|
||||
WHERE
|
||||
c_custkey = o_custkey
|
||||
AND l_orderkey = o_orderkey
|
||||
AND o_orderdate >= date '1993-10-01'
|
||||
AND o_orderdate < date '1993-10-01' + INTERVAL '3' MONTH
|
||||
AND o_orderdate >= DATE '1993-10-01'
|
||||
AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
|
||||
AND l_returnflag = 'R'
|
||||
AND c_nationkey = n_nationkey
|
||||
GROUP BY
|
||||
@ -544,8 +651,8 @@ WHERE
|
||||
AND l_shipmode in ('MAIL', 'SHIP')
|
||||
AND l_commitdate < l_receiptdate
|
||||
AND l_shipdate < l_commitdate
|
||||
AND l_receiptdate >= date '1994-01-01'
|
||||
AND l_receiptdate < date '1994-01-01' + INTERVAL '1' year
|
||||
AND l_receiptdate >= DATE '1994-01-01'
|
||||
AND l_receiptdate < DATE '1994-01-01' + INTERVAL '1' year
|
||||
GROUP BY
|
||||
l_shipmode
|
||||
ORDER BY
|
||||
@ -576,6 +683,37 @@ ORDER BY
|
||||
c_count DESC;
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
|
||||
|
||||
This alternative formulation works and was verified to return the reference results.
|
||||
|
||||
```sql
|
||||
WITH CustomerOrderCounts AS (
|
||||
SELECT
|
||||
c.c_custkey,
|
||||
count(o.o_orderkey) AS order_count
|
||||
FROM
|
||||
customer c
|
||||
LEFT OUTER JOIN
|
||||
orders o ON c.c_custkey = o.o_custkey
|
||||
AND o.o_comment NOT LIKE '%special%requests%'
|
||||
GROUP BY
|
||||
c.c_custkey
|
||||
)
|
||||
SELECT
|
||||
order_count AS c_count,
|
||||
count(*) AS custdist
|
||||
FROM
|
||||
CustomerOrderCounts
|
||||
GROUP BY
|
||||
order_count
|
||||
ORDER BY
|
||||
custdist DESC,
|
||||
c_count DESC;
|
||||
```
|
||||
::::
|
||||
|
||||
**Q14**
|
||||
|
||||
```sql
|
||||
@ -590,8 +728,8 @@ FROM
|
||||
part
|
||||
WHERE
|
||||
l_partkey = p_partkey
|
||||
AND l_shipdate >= date '1995-09-01'
|
||||
AND l_shipdate < date '1995-09-01' + INTERVAL '1' MONTH;
|
||||
AND l_shipdate >= DATE '1995-09-01'
|
||||
AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
|
||||
```
|
||||
|
||||
**Q15**
|
||||
@ -604,8 +742,8 @@ CREATE VIEW revenue0 (supplier_no, total_revenue) AS
|
||||
FROM
|
||||
lineitem
|
||||
WHERE
|
||||
l_shipdate >= date '1996-01-01'
|
||||
AND l_shipdate < date '1996-01-01' + INTERVAL '3' MONTH
|
||||
l_shipdate >= DATE '1996-01-01'
|
||||
AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH
|
||||
GROUP BY
|
||||
l_suppkey;
|
||||
|
||||
@ -632,6 +770,26 @@ ORDER BY
|
||||
DROP VIEW revenue0;
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the view definition does not work out-of-the box. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70139
|
||||
|
||||
This alternative view definition does work:
|
||||
|
||||
```sql
|
||||
CREATE VIEW revenue0 AS
|
||||
SELECT
|
||||
l_suppkey AS supplier_no,
|
||||
sum(l_extendedprice * (1 - l_discount)) AS total_revenue
|
||||
FROM
|
||||
lineitem
|
||||
WHERE
|
||||
l_shipdate >= DATE '1996-01-01'
|
||||
AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH
|
||||
GROUP BY
|
||||
l_suppkey;
|
||||
```
|
||||
::::
|
||||
|
||||
**Q16**
|
||||
|
||||
```sql
|
||||
@ -689,6 +847,37 @@ WHERE
|
||||
);
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
|
||||
|
||||
This alternative formulation works and was verified to return the reference results.
|
||||
|
||||
```sql
|
||||
WITH AvgQuantity AS (
|
||||
SELECT
|
||||
l_partkey,
|
||||
AVG(l_quantity) * 0.2 AS avg_quantity
|
||||
FROM
|
||||
lineitem
|
||||
GROUP BY
|
||||
l_partkey
|
||||
)
|
||||
SELECT
|
||||
SUM(l.l_extendedprice) / 7.0 AS avg_yearly
|
||||
FROM
|
||||
lineitem l
|
||||
JOIN
|
||||
part p ON p.p_partkey = l.l_partkey
|
||||
JOIN
|
||||
AvgQuantity aq ON l.l_partkey = aq.l_partkey
|
||||
WHERE
|
||||
p.p_brand = 'Brand#23'
|
||||
AND p.p_container = 'MED BOX'
|
||||
AND l.l_quantity < aq.avg_quantity;
|
||||
|
||||
```
|
||||
::::
|
||||
|
||||
**Q18**
|
||||
|
||||
```sql
|
||||
@ -731,7 +920,7 @@ ORDER BY
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
sum(l_extendedprice* (1 - l_discount)) AS revenue
|
||||
sum(l_extendedprice * (1 - l_discount)) AS revenue
|
||||
FROM
|
||||
lineitem,
|
||||
part
|
||||
@ -767,6 +956,46 @@ WHERE
|
||||
);
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the query is extremely slow due to missing join predicate pushdown. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70802
|
||||
|
||||
This alternative formulation works and was verified to return the reference results.
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
sum(l_extendedprice * (1 - l_discount)) AS revenue
|
||||
FROM
|
||||
lineitem,
|
||||
part
|
||||
WHERE
|
||||
p_partkey = l_partkey
|
||||
AND l_shipinstruct = 'DELIVER IN PERSON'
|
||||
AND l_shipmode IN ('AIR', 'AIR REG')
|
||||
AND (
|
||||
(
|
||||
p_brand = 'Brand#12'
|
||||
AND p_container IN ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
|
||||
AND l_quantity >= 1 AND l_quantity <= 1 + 10
|
||||
AND p_size BETWEEN 1 AND 5
|
||||
)
|
||||
OR
|
||||
(
|
||||
p_brand = 'Brand#23'
|
||||
AND p_container IN ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
|
||||
AND l_quantity >= 10 AND l_quantity <= 10 + 10
|
||||
AND p_size BETWEEN 1 AND 10
|
||||
)
|
||||
OR
|
||||
(
|
||||
p_brand = 'Brand#34'
|
||||
AND p_container IN ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
|
||||
AND l_quantity >= 20 AND l_quantity <= 20 + 10
|
||||
AND p_size BETWEEN 1 AND 15
|
||||
)
|
||||
)
|
||||
```
|
||||
::::
|
||||
|
||||
**Q20**
|
||||
|
||||
```sql
|
||||
@ -789,7 +1018,7 @@ WHERE
|
||||
FROM
|
||||
part
|
||||
WHERE
|
||||
p_name LIKE 'forrest%'
|
||||
p_name LIKE 'forest%'
|
||||
)
|
||||
AND ps_availqty > (
|
||||
SELECT
|
||||
@ -799,8 +1028,8 @@ WHERE
|
||||
WHERE
|
||||
l_partkey = ps_partkey
|
||||
AND l_suppkey = ps_suppkey
|
||||
AND l_shipdate >= date '1994-01-01'
|
||||
AND l_shipdate < date '1994-01-01' + INTERVAL '1' year
|
||||
AND l_shipdate >= DATE '1994-01-01'
|
||||
AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year
|
||||
)
|
||||
)
|
||||
AND s_nationkey = n_nationkey
|
||||
@ -809,6 +1038,10 @@ ORDER BY
|
||||
s_name;
|
||||
```
|
||||
|
||||
::::note
|
||||
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
|
||||
::::
|
||||
|
||||
**Q21**
|
||||
|
||||
```sql
|
||||
@ -852,6 +1085,9 @@ ORDER BY
|
||||
numwait DESC,
|
||||
s_name;
|
||||
```
|
||||
::::note
|
||||
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
|
||||
::::
|
||||
|
||||
**Q22**
|
||||
|
||||
@ -860,34 +1096,33 @@ SELECT
|
||||
cntrycode,
|
||||
count(*) AS numcust,
|
||||
sum(c_acctbal) AS totacctbal
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
substring(c_phone FROM 1 for 2) AS cntrycode,
|
||||
c_acctbal
|
||||
FROM
|
||||
customer
|
||||
WHERE
|
||||
substring(c_phone FROM 1 for 2) in
|
||||
('13', '31', '23', '29', '30', '18', '17')
|
||||
AND c_acctbal > (
|
||||
SELECT
|
||||
avg(c_acctbal)
|
||||
FROM
|
||||
customer
|
||||
WHERE
|
||||
c_acctbal > 0.00
|
||||
AND substring(c_phone FROM 1 for 2) in
|
||||
('13', '31', '23', '29', '30', '18', '17')
|
||||
)
|
||||
AND NOT EXISTS (
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
orders
|
||||
WHERE
|
||||
o_custkey = c_custkey
|
||||
)
|
||||
FROM (
|
||||
SELECT
|
||||
substring(c_phone FROM 1 for 2) AS cntrycode,
|
||||
c_acctbal
|
||||
FROM
|
||||
customer
|
||||
WHERE
|
||||
substring(c_phone FROM 1 for 2) in
|
||||
('13', '31', '23', '29', '30', '18', '17')
|
||||
AND c_acctbal > (
|
||||
SELECT
|
||||
avg(c_acctbal)
|
||||
FROM
|
||||
customer
|
||||
WHERE
|
||||
c_acctbal > 0.00
|
||||
AND substring(c_phone FROM 1 for 2) in
|
||||
('13', '31', '23', '29', '30', '18', '17')
|
||||
)
|
||||
AND NOT EXISTS (
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
orders
|
||||
WHERE
|
||||
o_custkey = c_custkey
|
||||
)
|
||||
) AS custsale
|
||||
GROUP BY
|
||||
cntrycode
|
||||
|
Loading…
Reference in New Issue
Block a user