mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-11 08:52:06 +00:00
461 lines
12 KiB
Plaintext
461 lines
12 KiB
Plaintext
---
|
||
slug: /zh/getting-started/example-datasets/brown-benchmark
|
||
sidebar_label: 布朗大学基准
|
||
description: 机器生成日志数据的新分析基准
|
||
title: "布朗大学基准"
|
||
---
|
||
|
||
`MgBench` 是机器生成的日志数据的新分析基准,[Andrew Crotty](http://cs.brown.edu/people/acrotty/)。
|
||
|
||
下载数据:
|
||
|
||
```bash
|
||
wget https://datasets.clickhouse.com/mgbench{1..3}.csv.xz
|
||
```
|
||
|
||
解压数据:
|
||
|
||
```bash
|
||
xz -v -d mgbench{1..3}.csv.xz
|
||
```
|
||
|
||
创建数据库和表:
|
||
|
||
```sql
|
||
CREATE DATABASE mgbench;
|
||
```
|
||
|
||
```sql
|
||
USE mgbench;
|
||
```
|
||
|
||
```sql
|
||
CREATE TABLE mgbench.logs1 (
|
||
log_time DateTime,
|
||
machine_name LowCardinality(String),
|
||
machine_group LowCardinality(String),
|
||
cpu_idle Nullable(Float32),
|
||
cpu_nice Nullable(Float32),
|
||
cpu_system Nullable(Float32),
|
||
cpu_user Nullable(Float32),
|
||
cpu_wio Nullable(Float32),
|
||
disk_free Nullable(Float32),
|
||
disk_total Nullable(Float32),
|
||
part_max_used Nullable(Float32),
|
||
load_fifteen Nullable(Float32),
|
||
load_five Nullable(Float32),
|
||
load_one Nullable(Float32),
|
||
mem_buffers Nullable(Float32),
|
||
mem_cached Nullable(Float32),
|
||
mem_free Nullable(Float32),
|
||
mem_shared Nullable(Float32),
|
||
swap_free Nullable(Float32),
|
||
bytes_in Nullable(Float32),
|
||
bytes_out Nullable(Float32)
|
||
)
|
||
ENGINE = MergeTree()
|
||
ORDER BY (machine_group, machine_name, log_time);
|
||
```
|
||
|
||
|
||
```sql
|
||
CREATE TABLE mgbench.logs2 (
|
||
log_time DateTime,
|
||
client_ip IPv4,
|
||
request String,
|
||
status_code UInt16,
|
||
object_size UInt64
|
||
)
|
||
ENGINE = MergeTree()
|
||
ORDER BY log_time;
|
||
```
|
||
|
||
|
||
```sql
|
||
CREATE TABLE mgbench.logs3 (
|
||
log_time DateTime64,
|
||
device_id FixedString(15),
|
||
device_name LowCardinality(String),
|
||
device_type LowCardinality(String),
|
||
device_floor UInt8,
|
||
event_type LowCardinality(String),
|
||
event_unit FixedString(1),
|
||
event_value Nullable(Float32)
|
||
)
|
||
ENGINE = MergeTree()
|
||
ORDER BY (event_type, log_time);
|
||
```
|
||
|
||
插入数据:
|
||
|
||
```
|
||
clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv
|
||
clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbench2.csv
|
||
clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv
|
||
```
|
||
|
||
## 运行基准查询:
|
||
|
||
```sql
|
||
USE mgbench;
|
||
```
|
||
|
||
```sql
|
||
-- Q1.1: 自午夜以来每个 Web 服务器的 CPU/网络利用率是多少?
|
||
|
||
SELECT machine_name,
|
||
MIN(cpu) AS cpu_min,
|
||
MAX(cpu) AS cpu_max,
|
||
AVG(cpu) AS cpu_avg,
|
||
MIN(net_in) AS net_in_min,
|
||
MAX(net_in) AS net_in_max,
|
||
AVG(net_in) AS net_in_avg,
|
||
MIN(net_out) AS net_out_min,
|
||
MAX(net_out) AS net_out_max,
|
||
AVG(net_out) AS net_out_avg
|
||
FROM (
|
||
SELECT machine_name,
|
||
COALESCE(cpu_user, 0.0) AS cpu,
|
||
COALESCE(bytes_in, 0.0) AS net_in,
|
||
COALESCE(bytes_out, 0.0) AS net_out
|
||
FROM logs1
|
||
WHERE machine_name IN ('anansi','aragog','urd')
|
||
AND log_time >= TIMESTAMP '2017-01-11 00:00:00'
|
||
) AS r
|
||
GROUP BY machine_name;
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q1.2:最近一天有哪些机房的机器离线?
|
||
|
||
SELECT machine_name,
|
||
log_time
|
||
FROM logs1
|
||
WHERE (machine_name LIKE 'cslab%' OR
|
||
machine_name LIKE 'mslab%')
|
||
AND load_one IS NULL
|
||
AND log_time >= TIMESTAMP '2017-01-10 00:00:00'
|
||
ORDER BY machine_name,
|
||
log_time;
|
||
```
|
||
|
||
```sql
|
||
-- Q1.3:特定工作站过去 10 天的每小时的平均指标是多少?
|
||
|
||
SELECT dt,
|
||
hr,
|
||
AVG(load_fifteen) AS load_fifteen_avg,
|
||
AVG(load_five) AS load_five_avg,
|
||
AVG(load_one) AS load_one_avg,
|
||
AVG(mem_free) AS mem_free_avg,
|
||
AVG(swap_free) AS swap_free_avg
|
||
FROM (
|
||
SELECT CAST(log_time AS DATE) AS dt,
|
||
EXTRACT(HOUR FROM log_time) AS hr,
|
||
load_fifteen,
|
||
load_five,
|
||
load_one,
|
||
mem_free,
|
||
swap_free
|
||
FROM logs1
|
||
WHERE machine_name = 'babbage'
|
||
AND load_fifteen IS NOT NULL
|
||
AND load_five IS NOT NULL
|
||
AND load_one IS NOT NULL
|
||
AND mem_free IS NOT NULL
|
||
AND swap_free IS NOT NULL
|
||
AND log_time >= TIMESTAMP '2017-01-01 00:00:00'
|
||
) AS r
|
||
GROUP BY dt,
|
||
hr
|
||
ORDER BY dt,
|
||
hr;
|
||
```
|
||
|
||
```sql
|
||
-- Q1.4: 1 个月内,每台服务器的磁盘 I/O 阻塞的频率是多少?
|
||
|
||
SELECT machine_name,
|
||
COUNT(*) AS spikes
|
||
FROM logs1
|
||
WHERE machine_group = 'Servers'
|
||
AND cpu_wio > 0.99
|
||
AND log_time >= TIMESTAMP '2016-12-01 00:00:00'
|
||
AND log_time < TIMESTAMP '2017-01-01 00:00:00'
|
||
GROUP BY machine_name
|
||
ORDER BY spikes DESC
|
||
LIMIT 10;
|
||
```
|
||
|
||
```sql
|
||
-- Q1.5:哪些外部可访问的虚拟机的运行内存不足?
|
||
|
||
SELECT machine_name,
|
||
dt,
|
||
MIN(mem_free) AS mem_free_min
|
||
FROM (
|
||
SELECT machine_name,
|
||
CAST(log_time AS DATE) AS dt,
|
||
mem_free
|
||
FROM logs1
|
||
WHERE machine_group = 'DMZ'
|
||
AND mem_free IS NOT NULL
|
||
) AS r
|
||
GROUP BY machine_name,
|
||
dt
|
||
HAVING MIN(mem_free) < 10000
|
||
ORDER BY machine_name,
|
||
dt;
|
||
```
|
||
|
||
```sql
|
||
-- Q1.6: 每小时所有文件服务器的总网络流量是多少?
|
||
|
||
SELECT dt,
|
||
hr,
|
||
SUM(net_in) AS net_in_sum,
|
||
SUM(net_out) AS net_out_sum,
|
||
SUM(net_in) + SUM(net_out) AS both_sum
|
||
FROM (
|
||
SELECT CAST(log_time AS DATE) AS dt,
|
||
EXTRACT(HOUR FROM log_time) AS hr,
|
||
COALESCE(bytes_in, 0.0) / 1000000000.0 AS net_in,
|
||
COALESCE(bytes_out, 0.0) / 1000000000.0 AS net_out
|
||
FROM logs1
|
||
WHERE machine_name IN ('allsorts','andes','bigred','blackjack','bonbon',
|
||
'cadbury','chiclets','cotton','crows','dove','fireball','hearts','huey',
|
||
'lindt','milkduds','milkyway','mnm','necco','nerds','orbit','peeps',
|
||
'poprocks','razzles','runts','smarties','smuggler','spree','stride',
|
||
'tootsie','trident','wrigley','york')
|
||
) AS r
|
||
GROUP BY dt,
|
||
hr
|
||
ORDER BY both_sum DESC
|
||
LIMIT 10;
|
||
```
|
||
|
||
```sql
|
||
-- Q2.1:过去 2 周内哪些请求导致了服务器错误?
|
||
|
||
SELECT *
|
||
FROM logs2
|
||
WHERE status_code >= 500
|
||
AND log_time >= TIMESTAMP '2012-12-18 00:00:00'
|
||
ORDER BY log_time;
|
||
```
|
||
|
||
```sql
|
||
-- Q2.2:在特定的某 2 周内,用户密码文件是否被泄露了?
|
||
|
||
SELECT *
|
||
FROM logs2
|
||
WHERE status_code >= 200
|
||
AND status_code < 300
|
||
AND request LIKE '%/etc/passwd%'
|
||
AND log_time >= TIMESTAMP '2012-05-06 00:00:00'
|
||
AND log_time < TIMESTAMP '2012-05-20 00:00:00';
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q2.3:过去一个月顶级请求的平均路径深度是多少?
|
||
|
||
SELECT top_level,
|
||
AVG(LENGTH(request) - LENGTH(REPLACE(request, '/', ''))) AS depth_avg
|
||
FROM (
|
||
SELECT SUBSTRING(request FROM 1 FOR len) AS top_level,
|
||
request
|
||
FROM (
|
||
SELECT POSITION(SUBSTRING(request FROM 2), '/') AS len,
|
||
request
|
||
FROM logs2
|
||
WHERE status_code >= 200
|
||
AND status_code < 300
|
||
AND log_time >= TIMESTAMP '2012-12-01 00:00:00'
|
||
) AS r
|
||
WHERE len > 0
|
||
) AS s
|
||
WHERE top_level IN ('/about','/courses','/degrees','/events',
|
||
'/grad','/industry','/news','/people',
|
||
'/publications','/research','/teaching','/ugrad')
|
||
GROUP BY top_level
|
||
ORDER BY top_level;
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q2.4:在过去的 3 个月里,哪些客户端发出了过多的请求?
|
||
|
||
SELECT client_ip,
|
||
COUNT(*) AS num_requests
|
||
FROM logs2
|
||
WHERE log_time >= TIMESTAMP '2012-10-01 00:00:00'
|
||
GROUP BY client_ip
|
||
HAVING COUNT(*) >= 100000
|
||
ORDER BY num_requests DESC;
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q2.5:每天的独立访问者数量是多少?
|
||
|
||
SELECT dt,
|
||
COUNT(DISTINCT client_ip)
|
||
FROM (
|
||
SELECT CAST(log_time AS DATE) AS dt,
|
||
client_ip
|
||
FROM logs2
|
||
) AS r
|
||
GROUP BY dt
|
||
ORDER BY dt;
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q2.6:平均和最大数据传输速率(Gbps)是多少?
|
||
|
||
SELECT AVG(transfer) / 125000000.0 AS transfer_avg,
|
||
MAX(transfer) / 125000000.0 AS transfer_max
|
||
FROM (
|
||
SELECT log_time,
|
||
SUM(object_size) AS transfer
|
||
FROM logs2
|
||
GROUP BY log_time
|
||
) AS r;
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q3.1:自 2019/11/29 17:00 以来,室温是否达到过冰点?
|
||
|
||
SELECT *
|
||
FROM logs3
|
||
WHERE event_type = 'temperature'
|
||
AND event_value <= 32.0
|
||
AND log_time >= '2019-11-29 17:00:00.000';
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q3.4:在过去的 6 个月里,每扇门打开的频率是多少?
|
||
|
||
SELECT device_name,
|
||
device_floor,
|
||
COUNT(*) AS ct
|
||
FROM logs3
|
||
WHERE event_type = 'door_open'
|
||
AND log_time >= '2019-06-01 00:00:00.000'
|
||
GROUP BY device_name,
|
||
device_floor
|
||
ORDER BY ct DESC;
|
||
```
|
||
|
||
下面的查询 3.5 使用了 UNION 关键词。设置该模式以便组合 SELECT 的查询结果。该设置仅在未明确指定 UNION ALL 或 UNION DISTINCT 但使用了 UNION 进行共享时使用。
|
||
|
||
```sql
|
||
SET union_default_mode = 'DISTINCT'
|
||
```
|
||
|
||
```sql
|
||
-- Q3.5: 在冬季和夏季,建筑物内哪些地方会出现较大的温度变化?
|
||
|
||
WITH temperature AS (
|
||
SELECT dt,
|
||
device_name,
|
||
device_type,
|
||
device_floor
|
||
FROM (
|
||
SELECT dt,
|
||
hr,
|
||
device_name,
|
||
device_type,
|
||
device_floor,
|
||
AVG(event_value) AS temperature_hourly_avg
|
||
FROM (
|
||
SELECT CAST(log_time AS DATE) AS dt,
|
||
EXTRACT(HOUR FROM log_time) AS hr,
|
||
device_name,
|
||
device_type,
|
||
device_floor,
|
||
event_value
|
||
FROM logs3
|
||
WHERE event_type = 'temperature'
|
||
) AS r
|
||
GROUP BY dt,
|
||
hr,
|
||
device_name,
|
||
device_type,
|
||
device_floor
|
||
) AS s
|
||
GROUP BY dt,
|
||
device_name,
|
||
device_type,
|
||
device_floor
|
||
HAVING MAX(temperature_hourly_avg) - MIN(temperature_hourly_avg) >= 25.0
|
||
)
|
||
SELECT DISTINCT device_name,
|
||
device_type,
|
||
device_floor,
|
||
'WINTER'
|
||
FROM temperature
|
||
WHERE dt >= DATE '2018-12-01'
|
||
AND dt < DATE '2019-03-01'
|
||
UNION
|
||
SELECT DISTINCT device_name,
|
||
device_type,
|
||
device_floor,
|
||
'SUMMER'
|
||
FROM temperature
|
||
WHERE dt >= DATE '2019-06-01'
|
||
AND dt < DATE '2019-09-01';
|
||
```
|
||
|
||
|
||
```sql
|
||
-- Q3.6:对于每种类别的设备,每月的功耗指标是什么?
|
||
|
||
SELECT yr,
|
||
mo,
|
||
SUM(coffee_hourly_avg) AS coffee_monthly_sum,
|
||
AVG(coffee_hourly_avg) AS coffee_monthly_avg,
|
||
SUM(printer_hourly_avg) AS printer_monthly_sum,
|
||
AVG(printer_hourly_avg) AS printer_monthly_avg,
|
||
SUM(projector_hourly_avg) AS projector_monthly_sum,
|
||
AVG(projector_hourly_avg) AS projector_monthly_avg,
|
||
SUM(vending_hourly_avg) AS vending_monthly_sum,
|
||
AVG(vending_hourly_avg) AS vending_monthly_avg
|
||
FROM (
|
||
SELECT dt,
|
||
yr,
|
||
mo,
|
||
hr,
|
||
AVG(coffee) AS coffee_hourly_avg,
|
||
AVG(printer) AS printer_hourly_avg,
|
||
AVG(projector) AS projector_hourly_avg,
|
||
AVG(vending) AS vending_hourly_avg
|
||
FROM (
|
||
SELECT CAST(log_time AS DATE) AS dt,
|
||
EXTRACT(YEAR FROM log_time) AS yr,
|
||
EXTRACT(MONTH FROM log_time) AS mo,
|
||
EXTRACT(HOUR FROM log_time) AS hr,
|
||
CASE WHEN device_name LIKE 'coffee%' THEN event_value END AS coffee,
|
||
CASE WHEN device_name LIKE 'printer%' THEN event_value END AS printer,
|
||
CASE WHEN device_name LIKE 'projector%' THEN event_value END AS projector,
|
||
CASE WHEN device_name LIKE 'vending%' THEN event_value END AS vending
|
||
FROM logs3
|
||
WHERE device_type = 'meter'
|
||
) AS r
|
||
GROUP BY dt,
|
||
yr,
|
||
mo,
|
||
hr
|
||
) AS s
|
||
GROUP BY yr,
|
||
mo
|
||
ORDER BY yr,
|
||
mo;
|
||
```
|
||
|
||
此数据集可在 [Playground](https://play.clickhouse.com/play?user=play) 中进行交互式的请求, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
|