Merge pull request #70946 from rschu1ze/tpc-h-decor

Docs: Add notes about TPC-H query correctness + decorrelated queries
This commit is contained in:
Robert Schulze 2024-10-23 09:28:46 +00:00 committed by GitHub
commit 0c6bd36ef3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -155,10 +155,10 @@ The queries are generated by `./qgen -s <scaling_factor>`. Example queries for `
## Queries
::::warning
TPC-H makes heavy use of correlated subqueries which are at the time of writing (October 2024) not supported by ClickHouse ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)).
As a result, many of below benchmark queries will fail with errors.
::::
**Correctness**
The result of the queries agrees with the official results unless mentioned otherwise. To verify, generate a TPC-H database with scale
factor = 1 (`dbgen`, see above) and compare with the [expected results in tpch-kit](https://github.com/gregrahn/tpch-kit/tree/master/dbgen/answers).
**Q1**
@ -177,7 +177,7 @@ SELECT
FROM
lineitem
WHERE
l_shipdate <= date '1998-12-01' - INTERVAL '90' DAY
l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
l_returnflag,
l_linestatus
@ -234,6 +234,62 @@ ORDER BY
p_partkey;
```
::::note
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
This alternative formulation works and was verified to return the reference results.
```sql
WITH MinSupplyCost AS (
SELECT
ps_partkey,
MIN(ps_supplycost) AS min_supplycost
FROM
partsupp ps
JOIN
supplier s ON ps.ps_suppkey = s.s_suppkey
JOIN
nation n ON s.s_nationkey = n.n_nationkey
JOIN
region r ON n.n_regionkey = r.r_regionkey
WHERE
r.r_name = 'EUROPE'
GROUP BY
ps_partkey
)
SELECT
s.s_acctbal,
s.s_name,
n.n_name,
p.p_partkey,
p.p_mfgr,
s.s_address,
s.s_phone,
s.s_comment
FROM
part p
JOIN
partsupp ps ON p.p_partkey = ps.ps_partkey
JOIN
supplier s ON s.s_suppkey = ps.ps_suppkey
JOIN
nation n ON s.s_nationkey = n.n_nationkey
JOIN
region r ON n.n_regionkey = r.r_regionkey
JOIN
MinSupplyCost msc ON ps.ps_partkey = msc.ps_partkey AND ps.ps_supplycost = msc.min_supplycost
WHERE
p.p_size = 15
AND p.p_type LIKE '%BRASS'
AND r.r_name = 'EUROPE'
ORDER BY
s.s_acctbal DESC,
n.n_name,
s.s_name,
p.p_partkey;
```
::::
**Q3**
```sql
@ -250,8 +306,8 @@ WHERE
c_mktsegment = 'BUILDING'
AND c_custkey = o_custkey
AND l_orderkey = o_orderkey
AND o_orderdate < date '1995-03-15'
AND l_shipdate > date '1995-03-15'
AND o_orderdate < DATE '1995-03-15'
AND l_shipdate > DATE '1995-03-15'
GROUP BY
l_orderkey,
o_orderdate,
@ -270,8 +326,8 @@ SELECT
FROM
orders
WHERE
o_orderdate >= date '1993-07-01'
AND o_orderdate < date '1993-07-01' + INTERVAL '3' MONTH
o_orderdate >= DATE '1993-07-01'
AND o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH
AND EXISTS (
SELECT
*
@ -287,6 +343,39 @@ ORDER BY
o_orderpriority;
```
::::note
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
This alternative formulation works and was verified to return the reference results.
```sql
WITH ValidLineItems AS (
SELECT
l_orderkey
FROM
lineitem
WHERE
l_commitdate < l_receiptdate
GROUP BY
l_orderkey
)
SELECT
o.o_orderpriority,
COUNT(*) AS order_count
FROM
orders o
JOIN
ValidLineItems vli ON o.o_orderkey = vli.l_orderkey
WHERE
o.o_orderdate >= DATE '1993-07-01'
AND o.o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH
GROUP BY
o.o_orderpriority
ORDER BY
o.o_orderpriority;
```
::::
**Q5**
```sql
@ -308,8 +397,8 @@ WHERE
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'ASIA'
AND o_orderdate >= date '1994-01-01'
AND o_orderdate < date '1994-01-01' + INTERVAL '1' year
AND o_orderdate >= DATE '1994-01-01'
AND o_orderdate < DATE '1994-01-01' + INTERVAL '1' year
GROUP BY
n_name
ORDER BY
@ -324,12 +413,30 @@ SELECT
FROM
lineitem
WHERE
l_shipdate >= date '1994-01-01'
AND l_shipdate < date '1994-01-01' + INTERVAL '1' year
l_shipdate >= DATE '1994-01-01'
AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year
AND l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01
AND l_quantity < 24;
```
::::note
As of October 2024, the query does not work out-of-the box due to a bug with Decimal addition. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70136
This alternative formulation works and was verified to return the reference results.
```sql
SELECT
sum(l_extendedprice * l_discount) AS revenue
FROM
lineitem
WHERE
l_shipdate >= DATE '1994-01-01'
AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year
AND l_discount BETWEEN 0.05 AND 0.07
AND l_quantity < 24;
```
::::
**Q7**
```sql
@ -361,7 +468,7 @@ FROM (
(n1.n_name = 'FRANCE' AND n2.n_name = 'GERMANY')
OR (n1.n_name = 'GERMANY' AND n2.n_name = 'FRANCE')
)
AND l_shipdate BETWEEN date '1995-01-01' AND date '1996-12-31'
AND l_shipdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31'
) AS shipping
GROUP BY
supp_nation,
@ -406,7 +513,7 @@ FROM (
AND n1.n_regionkey = r_regionkey
AND r_name = 'AMERICA'
AND s_nationkey = n2.n_nationkey
AND o_orderdate BETWEEN date '1995-01-01' AND date '1996-12-31'
AND o_orderdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31'
AND p_type = 'ECONOMY ANODIZED STEEL'
) AS all_nations
GROUP BY
@ -471,8 +578,8 @@ FROM
WHERE
c_custkey = o_custkey
AND l_orderkey = o_orderkey
AND o_orderdate >= date '1993-10-01'
AND o_orderdate < date '1993-10-01' + INTERVAL '3' MONTH
AND o_orderdate >= DATE '1993-10-01'
AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
AND l_returnflag = 'R'
AND c_nationkey = n_nationkey
GROUP BY
@ -544,8 +651,8 @@ WHERE
AND l_shipmode in ('MAIL', 'SHIP')
AND l_commitdate < l_receiptdate
AND l_shipdate < l_commitdate
AND l_receiptdate >= date '1994-01-01'
AND l_receiptdate < date '1994-01-01' + INTERVAL '1' year
AND l_receiptdate >= DATE '1994-01-01'
AND l_receiptdate < DATE '1994-01-01' + INTERVAL '1' year
GROUP BY
l_shipmode
ORDER BY
@ -576,6 +683,37 @@ ORDER BY
c_count DESC;
```
::::note
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
This alternative formulation works and was verified to return the reference results.
```sql
WITH CustomerOrderCounts AS (
SELECT
c.c_custkey,
count(o.o_orderkey) AS order_count
FROM
customer c
LEFT OUTER JOIN
orders o ON c.c_custkey = o.o_custkey
AND o.o_comment NOT LIKE '%special%requests%'
GROUP BY
c.c_custkey
)
SELECT
order_count AS c_count,
count(*) AS custdist
FROM
CustomerOrderCounts
GROUP BY
order_count
ORDER BY
custdist DESC,
c_count DESC;
```
::::
**Q14**
```sql
@ -590,8 +728,8 @@ FROM
part
WHERE
l_partkey = p_partkey
AND l_shipdate >= date '1995-09-01'
AND l_shipdate < date '1995-09-01' + INTERVAL '1' MONTH;
AND l_shipdate >= DATE '1995-09-01'
AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
```
**Q15**
@ -604,8 +742,8 @@ CREATE VIEW revenue0 (supplier_no, total_revenue) AS
FROM
lineitem
WHERE
l_shipdate >= date '1996-01-01'
AND l_shipdate < date '1996-01-01' + INTERVAL '3' MONTH
l_shipdate >= DATE '1996-01-01'
AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH
GROUP BY
l_suppkey;
@ -632,6 +770,26 @@ ORDER BY
DROP VIEW revenue0;
```
::::note
As of October 2024, the view definition does not work out-of-the box. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70139
This alternative view definition does work:
```sql
CREATE VIEW revenue0 AS
SELECT
l_suppkey AS supplier_no,
sum(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM
lineitem
WHERE
l_shipdate >= DATE '1996-01-01'
AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH
GROUP BY
l_suppkey;
```
::::
**Q16**
```sql
@ -689,6 +847,37 @@ WHERE
);
```
::::note
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
This alternative formulation works and was verified to return the reference results.
```sql
WITH AvgQuantity AS (
SELECT
l_partkey,
AVG(l_quantity) * 0.2 AS avg_quantity
FROM
lineitem
GROUP BY
l_partkey
)
SELECT
SUM(l.l_extendedprice) / 7.0 AS avg_yearly
FROM
lineitem l
JOIN
part p ON p.p_partkey = l.l_partkey
JOIN
AvgQuantity aq ON l.l_partkey = aq.l_partkey
WHERE
p.p_brand = 'Brand#23'
AND p.p_container = 'MED BOX'
AND l.l_quantity < aq.avg_quantity;
```
::::
**Q18**
```sql
@ -731,7 +920,7 @@ ORDER BY
```sql
SELECT
sum(l_extendedprice* (1 - l_discount)) AS revenue
sum(l_extendedprice * (1 - l_discount)) AS revenue
FROM
lineitem,
part
@ -767,6 +956,46 @@ WHERE
);
```
::::note
As of October 2024, the query is extremely slow due to missing join predicate pushdown. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70802
This alternative formulation works and was verified to return the reference results.
```sql
SELECT
sum(l_extendedprice * (1 - l_discount)) AS revenue
FROM
lineitem,
part
WHERE
p_partkey = l_partkey
AND l_shipinstruct = 'DELIVER IN PERSON'
AND l_shipmode IN ('AIR', 'AIR REG')
AND (
(
p_brand = 'Brand#12'
AND p_container IN ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
AND l_quantity >= 1 AND l_quantity <= 1 + 10
AND p_size BETWEEN 1 AND 5
)
OR
(
p_brand = 'Brand#23'
AND p_container IN ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
AND l_quantity >= 10 AND l_quantity <= 10 + 10
AND p_size BETWEEN 1 AND 10
)
OR
(
p_brand = 'Brand#34'
AND p_container IN ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
AND l_quantity >= 20 AND l_quantity <= 20 + 10
AND p_size BETWEEN 1 AND 15
)
)
```
::::
**Q20**
```sql
@ -789,7 +1018,7 @@ WHERE
FROM
part
WHERE
p_name LIKE 'forrest%'
p_name LIKE 'forest%'
)
AND ps_availqty > (
SELECT
@ -799,8 +1028,8 @@ WHERE
WHERE
l_partkey = ps_partkey
AND l_suppkey = ps_suppkey
AND l_shipdate >= date '1994-01-01'
AND l_shipdate < date '1994-01-01' + INTERVAL '1' year
AND l_shipdate >= DATE '1994-01-01'
AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year
)
)
AND s_nationkey = n_nationkey
@ -809,6 +1038,10 @@ ORDER BY
s_name;
```
::::note
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
::::
**Q21**
```sql
@ -852,6 +1085,9 @@ ORDER BY
numwait DESC,
s_name;
```
::::note
As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697
::::
**Q22**
@ -860,34 +1096,33 @@ SELECT
cntrycode,
count(*) AS numcust,
sum(c_acctbal) AS totacctbal
FROM
(
SELECT
substring(c_phone FROM 1 for 2) AS cntrycode,
c_acctbal
FROM
customer
WHERE
substring(c_phone FROM 1 for 2) in
('13', '31', '23', '29', '30', '18', '17')
AND c_acctbal > (
SELECT
avg(c_acctbal)
FROM
customer
WHERE
c_acctbal > 0.00
AND substring(c_phone FROM 1 for 2) in
('13', '31', '23', '29', '30', '18', '17')
)
AND NOT EXISTS (
SELECT
*
FROM
orders
WHERE
o_custkey = c_custkey
)
FROM (
SELECT
substring(c_phone FROM 1 for 2) AS cntrycode,
c_acctbal
FROM
customer
WHERE
substring(c_phone FROM 1 for 2) in
('13', '31', '23', '29', '30', '18', '17')
AND c_acctbal > (
SELECT
avg(c_acctbal)
FROM
customer
WHERE
c_acctbal > 0.00
AND substring(c_phone FROM 1 for 2) in
('13', '31', '23', '29', '30', '18', '17')
)
AND NOT EXISTS (
SELECT
*
FROM
orders
WHERE
o_custkey = c_custkey
)
) AS custsale
GROUP BY
cntrycode