Update TPC-H docs

This commit is contained in:
Robert Schulze 2024-10-21 19:46:12 +00:00
parent 3ceb3b92bd
commit 036f475755
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A

View File

@ -10,12 +10,15 @@ A popular benchmark which models the internal data warehouse of a wholesale supp
The data is stored into a 3rd normal form representation, requiring lots of joins at query runtime.
Despite its age and its unrealistic assumption that the data is uniformly and independently distributed, TPC-H remains the most popular OLAP benchmark to date.
References
**References**
- [TPC-H](https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp)
- [New TPC Benchmarks for Decision Support and Web Commerce](https://doi.org/10.1145/369275.369291) (Poess et. al., 2000)
- [TPC-H Analyzed: Hidden Messages and Lessons Learned from an Influential Benchmark](https://doi.org/10.1007/978-3-319-04936-6_5) (Boncz et. al.), 2013
- [Quantifying TPC-H Choke Points and Their Optimizations](https://doi.org/10.14778/3389133.3389138) (Dresseler et. al.), 2020
## Data Generation and Import
First, checkout the TPC-H repository and compile the data generator:
``` bash
@ -30,7 +33,8 @@ Then, generate the data. Parameter `-s` specifies the scale factor. For example,
./dbgen -s 100
```
Now create tables in ClickHouse:
Now create tables in ClickHouse. Note that we stick to the rules of the TPC-H benchmark and create primary key constraints only for the
columns mentioned in section 1.4.2.2 of the the TPC-H specification.
```sql
CREATE TABLE nation (
@ -38,13 +42,13 @@ CREATE TABLE nation (
n_name String,
n_regionkey Int32,
n_comment String)
ORDER BY (n_regionkey, n_name);
ORDER BY (n_nationkey);
CREATE TABLE region (
r_regionkey Int32,
r_name String,
r_comment String)
ORDER BY (r_name);
ORDER BY (r_regionkey);
CREATE TABLE part (
p_partkey Int32,
@ -56,7 +60,7 @@ CREATE TABLE part (
p_container String,
p_retailprice Decimal(15,2),
p_comment String)
ORDER BY (p_mfgr, p_brand, p_type, p_name);
ORDER BY (p_partkey);
CREATE TABLE supplier (
s_suppkey Int32,
@ -66,7 +70,7 @@ CREATE TABLE supplier (
s_phone String,
s_acctbal Decimal(15,2),
s_comment String)
ORDER BY (s_nationkey, s_address, s_name);
ORDER BY (s_suppkey);
CREATE TABLE partsupp (
ps_partkey Int32,
@ -74,7 +78,7 @@ CREATE TABLE partsupp (
ps_availqty Int32,
ps_supplycost Decimal(15,2),
ps_comment String)
ORDER BY (ps_suppkey, ps_availqty, ps_supplycost, ps_partkey);
ORDER BY (ps_partkey, ps_suppkey);
CREATE TABLE customer (
c_custkey Int32,
@ -85,7 +89,7 @@ CREATE TABLE customer (
c_acctbal Decimal(15,2),
c_mktsegment String,
c_comment String)
ORDER BY (c_nationkey, c_mktsegment, c_address, c_name, c_custkey);
ORDER BY (c_custkey);
CREATE TABLE orders (
o_orderkey Int32,
@ -97,7 +101,10 @@ CREATE TABLE orders (
o_clerk String,
o_shippriority Int32,
o_comment String)
ORDER BY (o_orderdate, o_orderstatus, o_custkey);
ORDER BY (o_orderkey);
-- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 4.5 in
-- "Quantifying TPC-H Choke Points and Their Optimizations":
-- ORDER BY (o_orderdate, o_orderkey);
CREATE TABLE lineitem (
l_orderkey Int32,
@ -116,7 +123,10 @@ CREATE TABLE lineitem (
l_shipinstruct String,
l_shipmode String,
l_comment String)
ORDER BY (l_suppkey, l_partkey, l_shipdate, l_commitdate, l_receiptdate);
ORDER BY (l_orderkey, l_linenumber);
-- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 4.5 in
-- "Quantifying TPC-H Choke Points and Their Optimizations":
-- ORDER BY (l_shipdate, l_orderkey, l_linenumber);
```
The data can be imported as follows:
@ -127,19 +137,21 @@ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO region FORMAT
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO part FORMAT CSV" < part.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO partsupp FORMAT CSV" < partsupp.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customers FORMAT CSV" < customers.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer FORMAT CSV" < customer.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO orders FORMAT CSV" < orders.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO lineitem FORMAT CSV" < lineitem.tbl
```
The queries are generated by `./qgen -s <scaling_factor>`. Example queries for `s = 100`:
## Queries
::::warning
TPC-H makes heavy use of correlated subqueries which are at the time of writing (September 2024) not supported by ClickHouse ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)).
TPC-H makes heavy use of correlated subqueries which are at the time of writing (October 2024) not supported by ClickHouse ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)).
As a result, many of below benchmark queries will fail with errors.
::::
Q1
**Q1**
```sql
SELECT
@ -156,7 +168,7 @@ SELECT
FROM
lineitem
WHERE
l_shipdate <= date '1998-12-01' - interval '100' day
l_shipdate <= date '1998-12-01' - INTERVAL '90' DAY
GROUP BY
l_returnflag,
l_linestatus
@ -165,7 +177,7 @@ ORDER BY
l_linestatus;
```
Q2
**Q2**
```sql
SELECT
@ -186,11 +198,11 @@ FROM
WHERE
p_partkey = ps_partkey
AND s_suppkey = ps_suppkey
AND p_size = 21
AND p_type LIKE '%COPPER'
AND p_size = 15
AND p_type LIKE '%BRASS'
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'AMERICA'
AND r_name = 'EUROPE'
AND ps_supplycost = (
SELECT
min(ps_supplycost)
@ -204,17 +216,16 @@ WHERE
AND s_suppkey = ps_suppkey
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'AMERICA'
AND r_name = 'EUROPE'
)
ORDER BY
s_acctbal desc,
s_acctbal DESC,
n_name,
s_name,
p_partkey
LIMIT 100;
p_partkey;
```
Q3
**Q3**
```sql
SELECT
@ -230,19 +241,18 @@ WHERE
c_mktsegment = 'BUILDING'
AND c_custkey = o_custkey
AND l_orderkey = o_orderkey
AND o_orderdate < date '1995-03-10'
AND l_shipdate > date '1995-03-10'
AND o_orderdate < date '1995-03-15'
AND l_shipdate > date '1995-03-15'
GROUP BY
l_orderkey,
o_orderdate,
o_shippriority
ORDER BY
revenue desc,
o_orderdate
LIMIT 10;
revenue DESC,
o_orderdate;
```
Q4
**Q4**
```sql
SELECT
@ -251,8 +261,8 @@ SELECT
FROM
orders
WHERE
o_orderdate >= date '1994-07-01'
AND o_orderdate < date '1994-07-01' + interval '3' month
o_orderdate >= date '1993-07-01'
AND o_orderdate < date '1993-07-01' + INTERVAL '3' MONTH
AND EXISTS (
SELECT
*
@ -268,7 +278,7 @@ ORDER BY
o_orderpriority;
```
Q5
**Q5**
```sql
SELECT
@ -288,16 +298,16 @@ WHERE
AND c_nationkey = s_nationkey
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'MIDDLE EAST'
AND r_name = 'ASIA'
AND o_orderdate >= date '1994-01-01'
AND o_orderdate < date '1994-01-01' + interval '1' year
AND o_orderdate < date '1994-01-01' + INTERVAL '1' year
GROUP BY
n_name
ORDER BY
revenue desc;
revenue DESC;
```
Q6
**Q6**
```sql
SELECT
@ -306,12 +316,12 @@ FROM
lineitem
WHERE
l_shipdate >= date '1994-01-01'
AND l_shipdate < date '1994-01-01' + interval '1' year
AND l_discount between 0.09 - 0.01 AND 0.09 + 0.01
AND l_shipdate < date '1994-01-01' + INTERVAL '1' year
AND l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01
AND l_quantity < 24;
```
Q7
**Q7**
```sql
SELECT
@ -319,31 +329,30 @@ SELECT
cust_nation,
l_year,
sum(volume) AS revenue
FROM
(
SELECT
n1.n_name AS supp_nation,
n2.n_name AS cust_nation,
extract(year FROM l_shipdate) AS l_year,
l_extendedprice * (1 - l_discount) AS volume
FROM
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2
WHERE
s_suppkey = l_suppkey
AND o_orderkey = l_orderkey
AND c_custkey = o_custkey
AND s_nationkey = n1.n_nationkey
AND c_nationkey = n2.n_nationkey
AND (
(n1.n_name = 'UNITED KINGDOM' AND n2.n_name = 'ETHIOPIA')
OR (n1.n_name = 'ETHIOPIA' AND n2.n_name = 'UNITED KINGDOM')
)
AND l_shipdate between date '1995-01-01' AND date '1996-12-31'
FROM (
SELECT
n1.n_name AS supp_nation,
n2.n_name AS cust_nation,
extract(year FROM l_shipdate) AS l_year,
l_extendedprice * (1 - l_discount) AS volume
FROM
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2
WHERE
s_suppkey = l_suppkey
AND o_orderkey = l_orderkey
AND c_custkey = o_custkey
AND s_nationkey = n1.n_nationkey
AND c_nationkey = n2.n_nationkey
AND (
(n1.n_name = 'FRANCE' AND n2.n_name = 'GERMANY')
OR (n1.n_name = 'GERMANY' AND n2.n_name = 'FRANCE')
)
AND l_shipdate BETWEEN date '1995-01-01' AND date '1996-12-31'
) AS shipping
GROUP BY
supp_nation,
@ -355,41 +364,41 @@ ORDER BY
l_year;
```
Q8
**Q8**
```sql
SELECT
o_year,
sum(CASE
WHEN nation = 'ETHIOPIA' THEN volume
ELSE 0
END) / sum(volume) AS mkt_share
FROM
(
SELECT
extract(year FROM o_orderdate) AS o_year,
l_extendedprice * (1 - l_discount) AS volume,
n2.n_name AS nation
FROM
part,
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2,
region
WHERE
p_partkey = l_partkey
AND s_suppkey = l_suppkey
AND l_orderkey = o_orderkey
AND o_custkey = c_custkey
AND c_nationkey = n1.n_nationkey
AND n1.n_regionkey = r_regionkey
AND r_name = 'AFRICA'
AND s_nationkey = n2.n_nationkey
AND o_orderdate between date '1995-01-01' AND date '1996-12-31'
AND p_type = 'SMALL POLISHED TIN'
WHEN nation = 'BRAZIL'
THEN volume
ELSE 0
END) / sum(volume) AS mkt_share
FROM (
SELECT
extract(year FROM o_orderdate) AS o_year,
l_extendedprice * (1 - l_discount) AS volume,
n2.n_name AS nation
FROM
part,
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2,
region
WHERE
p_partkey = l_partkey
AND s_suppkey = l_suppkey
AND l_orderkey = o_orderkey
AND o_custkey = c_custkey
AND c_nationkey = n1.n_nationkey
AND n1.n_regionkey = r_regionkey
AND r_name = 'AMERICA'
AND s_nationkey = n2.n_nationkey
AND o_orderdate BETWEEN date '1995-01-01' AND date '1996-12-31'
AND p_type = 'ECONOMY ANODIZED STEEL'
) AS all_nations
GROUP BY
o_year
@ -397,44 +406,43 @@ ORDER BY
o_year;
```
Q9
**Q9**
```sql
SELECT
nation,
o_year,
sum(amount) AS sum_profit
FROM
(
SELECT
n_name AS nation,
extract(year FROM o_orderdate) AS o_year,
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity AS amount
FROM
part,
supplier,
lineitem,
partsupp,
orders,
nation
WHERE
s_suppkey = l_suppkey
AND ps_suppkey = l_suppkey
AND ps_partkey = l_partkey
AND p_partkey = l_partkey
AND o_orderkey = l_orderkey
AND s_nationkey = n_nationkey
AND p_name LIKE '%drab%'
FROM (
SELECT
n_name AS nation,
extract(year FROM o_orderdate) AS o_year,
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity AS amount
FROM
part,
supplier,
lineitem,
partsupp,
orders,
nation
WHERE
s_suppkey = l_suppkey
AND ps_suppkey = l_suppkey
AND ps_partkey = l_partkey
AND p_partkey = l_partkey
AND o_orderkey = l_orderkey
AND s_nationkey = n_nationkey
AND p_name LIKE '%green%'
) AS profit
GROUP BY
nation,
o_year
ORDER BY
nation,
o_year desc;
o_year DESC;
```
Q10
**Q10**
```sql
SELECT
@ -454,8 +462,8 @@ FROM
WHERE
c_custkey = o_custkey
AND l_orderkey = o_orderkey
AND o_orderdate >= date '1993-06-01'
AND o_orderdate < date '1993-06-01' + interval '3' month
AND o_orderdate >= date '1993-10-01'
AND o_orderdate < date '1993-10-01' + INTERVAL '3' MONTH
AND l_returnflag = 'R'
AND c_nationkey = n_nationkey
GROUP BY
@ -467,11 +475,10 @@ GROUP BY
c_address,
c_comment
ORDER BY
revenue desc
LIMIT 20;
revenue DESC;
```
Q11
**Q11**
```sql
SELECT
@ -484,12 +491,12 @@ FROM
WHERE
ps_suppkey = s_suppkey
AND s_nationkey = n_nationkey
AND n_name = 'MOZAMBIQUE'
AND n_name = 'GERMANY'
GROUP BY
ps_partkey having
ps_partkey HAVING
sum(ps_supplycost * ps_availqty) > (
SELECT
sum(ps_supplycost * ps_availqty) * 0.0000010000
sum(ps_supplycost * ps_availqty) * 0.0001
FROM
partsupp,
supplier,
@ -497,89 +504,88 @@ GROUP BY
WHERE
ps_suppkey = s_suppkey
AND s_nationkey = n_nationkey
AND n_name = 'MOZAMBIQUE'
AND n_name = 'GERMANY'
)
ORDER BY
value desc;
value DESC;
```
Q12
**Q12**
```sql
SELECT
l_shipmode,
sum(CASE
WHEN o_orderpriority = '1-URGENT'
OR o_orderpriority = '2-HIGH'
WHEN o_orderpriority = '1-URGENT'
OR o_orderpriority = '2-HIGH'
THEN 1
ELSE 0
END) AS high_line_count,
ELSE 0
END) AS high_line_count,
sum(CASE
WHEN o_orderpriority <> '1-URGENT'
AND o_orderpriority <> '2-HIGH'
AND o_orderpriority <> '2-HIGH'
THEN 1
ELSE 0
END) AS low_line_count
END) AS low_line_count
FROM
orders,
lineitem
WHERE
o_orderkey = l_orderkey
AND l_shipmode in ('MAIL', 'AIR')
AND l_shipmode in ('MAIL', 'SHIP')
AND l_commitdate < l_receiptdate
AND l_shipdate < l_commitdate
AND l_receiptdate >= date '1996-01-01'
AND l_receiptdate < date '1996-01-01' + interval '1' year
AND l_receiptdate >= date '1994-01-01'
AND l_receiptdate < date '1994-01-01' + INTERVAL '1' year
GROUP BY
l_shipmode
ORDER BY
l_shipmode;
```
Q13
**Q13**
```sql
SELECT
c_count,
count(*) AS custdist
FROM
(
SELECT
c_custkey,
count(o_orderkey)
FROM
customer LEFT OUTER JOIN orders ON
c_custkey = o_custkey
AND o_comment NOT LIKE '%special%deposits%'
GROUP BY
c_custkey
FROM (
SELECT
c_custkey,
count(o_orderkey)
FROM
customer LEFT OUTER JOIN orders ON
c_custkey = o_custkey
AND o_comment NOT LIKE '%special%requests%'
GROUP BY
c_custkey
) AS c_orders
GROUP BY
c_count
ORDER BY
custdist desc,
c_count desc;
custdist DESC,
c_count DESC;
```
Q14
**Q14**
```sql
SELECT
100.00 * sum(CASE
WHEN p_type LIKE 'PROMO%'
THEN l_extendedprice * (1 - l_discount)
ELSE 0
END) / sum(l_extendedprice * (1 - l_discount)) AS promo_revenue
WHEN p_type LIKE 'PROMO%'
THEN l_extendedprice * (1 - l_discount)
ELSE 0
END) / sum(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
lineitem,
part
WHERE
l_partkey = p_partkey
AND l_shipdate >= date '1996-10-01'
AND l_shipdate < date '1996-10-01' + interval '1' month;
AND l_shipdate >= date '1995-09-01'
AND l_shipdate < date '1995-09-01' + INTERVAL '1' MONTH;
```
Q15
**Q15**
```sql
CREATE VIEW revenue0 (supplier_no, total_revenue) AS
@ -589,8 +595,8 @@ CREATE VIEW revenue0 (supplier_no, total_revenue) AS
FROM
lineitem
WHERE
l_shipdate >= date '1997-06-01'
AND l_shipdate < date '1997-06-01' + interval '3' month
l_shipdate >= date '1996-01-01'
AND l_shipdate < date '1996-01-01' + INTERVAL '3' MONTH
GROUP BY
l_suppkey;
@ -617,7 +623,7 @@ ORDER BY
DROP VIEW revenue0;
```
Q16
**Q16**
```sql
SELECT
@ -630,9 +636,9 @@ FROM
part
WHERE
p_partkey = ps_partkey
AND p_brand <> 'Brand#15'
AND p_type NOT LIKE 'SMALL POLISHED%'
AND p_size in (21, 9, 46, 34, 50, 33, 17, 36)
AND p_brand <> 'Brand#45'
AND p_type NOT LIKE 'MEDIUM POLISHED%'
AND p_size in (49, 14, 23, 45, 19, 3, 36, 9)
AND ps_suppkey NOT in (
SELECT
s_suppkey
@ -646,13 +652,13 @@ GROUP BY
p_type,
p_size
ORDER BY
supplier_cnt desc,
supplier_cnt DESC,
p_brand,
p_type,
p_size;
```
Q17
**Q17**
```sql
SELECT
@ -662,8 +668,8 @@ FROM
part
WHERE
p_partkey = l_partkey
AND p_brand = 'Brand#52'
AND p_container = 'MED CASE'
AND p_brand = 'Brand#23'
AND p_container = 'MED BOX'
AND l_quantity < (
SELECT
0.2 * avg(l_quantity)
@ -674,7 +680,7 @@ WHERE
);
```
Q18
**Q18**
```sql
SELECT
@ -695,8 +701,9 @@ WHERE
FROM
lineitem
GROUP BY
l_orderkey having
sum(l_quantity) > 313
l_orderkey
HAVING
sum(l_quantity) > 300
)
AND c_custkey = o_custkey
AND o_orderkey = l_orderkey
@ -707,12 +714,11 @@ GROUP BY
o_orderdate,
o_totalprice
ORDER BY
o_totalprice desc,
o_orderdate
LIMIT 100;
o_totalprice DESC,
o_orderdate;
```
Q19
**Q19**
```sql
SELECT
@ -723,36 +729,36 @@ FROM
WHERE
(
p_partkey = l_partkey
AND p_brand = 'Brand#31'
AND p_brand = 'Brand#12'
AND p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
AND l_quantity >= 3 AND l_quantity <= 3 + 10
AND p_size between 1 AND 5
AND l_quantity >= 1 AND l_quantity <= 1 + 10
AND p_size BETWEEN 1 AND 5
AND l_shipmode in ('AIR', 'AIR REG')
AND l_shipinstruct = 'DELIVER IN PERSON'
)
OR
(
p_partkey = l_partkey
AND p_brand = 'Brand#54'
AND p_brand = 'Brand#23'
AND p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
AND l_quantity >= 17 AND l_quantity <= 17 + 10
AND p_size between 1 AND 10
AND l_quantity >= 10 AND l_quantity <= 10 + 10
AND p_size BETWEEN 1 AND 10
AND l_shipmode in ('AIR', 'AIR REG')
AND l_shipinstruct = 'DELIVER IN PERSON'
)
OR
(
p_partkey = l_partkey
AND p_brand = 'Brand#54'
AND p_brand = 'Brand#34'
AND p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
AND l_quantity >= 26 AND l_quantity <= 26 + 10
AND p_size between 1 AND 15
AND l_quantity >= 20 AND l_quantity <= 20 + 10
AND p_size BETWEEN 1 AND 15
AND l_shipmode in ('AIR', 'AIR REG')
AND l_shipinstruct = 'DELIVER IN PERSON'
);
```
Q20
**Q20**
```sql
SELECT
@ -774,7 +780,7 @@ WHERE
FROM
part
WHERE
p_name LIKE 'chiffon%'
p_name LIKE 'forrest%'
)
AND ps_availqty > (
SELECT
@ -784,17 +790,17 @@ WHERE
WHERE
l_partkey = ps_partkey
AND l_suppkey = ps_suppkey
AND l_shipdate >= date '1997-01-01'
AND l_shipdate < date '1997-01-01' + interval '1' year
AND l_shipdate >= date '1994-01-01'
AND l_shipdate < date '1994-01-01' + INTERVAL '1' year
)
)
AND s_nationkey = n_nationkey
AND n_name = 'MOZAMBIQUE'
AND n_name = 'CANADA'
ORDER BY
s_name;
```
Q21
**Q21**
```sql
SELECT
@ -830,16 +836,15 @@ WHERE
AND l3.l_receiptdate > l3.l_commitdate
)
AND s_nationkey = n_nationkey
AND n_name = 'RUSSIA'
AND n_name = 'SAUDI ARABIA'
GROUP BY
s_name
ORDER BY
numwait desc,
s_name
LIMIT 100;
numwait DESC,
s_name;
```
Q22
**Q22**
```sql
SELECT
@ -855,7 +860,7 @@ FROM
customer
WHERE
substring(c_phone FROM 1 for 2) in
('26', '34', '10', '18', '27', '12', '11')
('13', '31', '23', '29', '30', '18', '17')
AND c_acctbal > (
SELECT
avg(c_acctbal)
@ -864,7 +869,7 @@ FROM
WHERE
c_acctbal > 0.00
AND substring(c_phone FROM 1 for 2) in
('26', '34', '10', '18', '27', '12', '11')
('13', '31', '23', '29', '30', '18', '17')
)
AND NOT EXISTS (
SELECT