Commit 04d8544151 (parent a4f8a5390e): Add description for Brown Benchmark

docs/en/getting-started/example-datasets/brown-benchmark.md (new file, 412 lines)
@@ -0,0 +1,412 @@
---
toc_priority: 20
toc_title: Brown University Benchmark
---

# Brown University Benchmark

MgBench is a new analytical benchmark for machine-generated log data, by [Andrew Crotty](http://cs.brown.edu/people/acrotty/).

Download the data:
```
wget https://datasets.clickhouse.tech/mgbench{1..3}.csv.xz
```

Unpack the data:
```
xz -v -d mgbench{1..3}.csv.xz
```

Create tables:
```
CREATE DATABASE mgbench;

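-- logs1: per-machine system metrics (used by the Q1.* queries below)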
CREATE TABLE mgbench.logs1 (
    log_time      DateTime,
    machine_name  LowCardinality(String),
    machine_group LowCardinality(String),
    cpu_idle      Nullable(Float32),
    cpu_nice      Nullable(Float32),
    cpu_system    Nullable(Float32),
    cpu_user      Nullable(Float32),
    cpu_wio       Nullable(Float32),
    disk_free     Nullable(Float32),
    disk_total    Nullable(Float32),
    part_max_used Nullable(Float32),
    load_fifteen  Nullable(Float32),
    load_five     Nullable(Float32),
    load_one      Nullable(Float32),
    mem_buffers   Nullable(Float32),
    mem_cached    Nullable(Float32),
    mem_free      Nullable(Float32),
    mem_shared    Nullable(Float32),
    swap_free     Nullable(Float32),
    bytes_in      Nullable(Float32),
    bytes_out     Nullable(Float32)
)
ENGINE = MergeTree()
ORDER BY (machine_group, machine_name, log_time);

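-- logs2: web server access log (used by the Q2.* queries below)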
CREATE TABLE mgbench.logs2 (
    log_time    DateTime,
    client_ip   IPv4,
    request     String,
    status_code UInt16,
    object_size UInt64
)
ENGINE = MergeTree()
ORDER BY log_time;

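-- logs3: event readings from building devices (used by the Q3.* queries below)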
CREATE TABLE mgbench.logs3 (
    log_time     DateTime64,
    device_id    FixedString(15),
    device_name  LowCardinality(String),
    device_type  LowCardinality(String),
    device_floor UInt8,
    event_type   LowCardinality(String),
    event_unit   FixedString(1),
    event_value  Nullable(Float32)
)
ENGINE = MergeTree()
ORDER BY (event_type, log_time);
```
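
The `ORDER BY` keys mirror the filters the benchmark queries use: machine and time for `logs1`, time for `logs2`, event type and time for `logs3`. As an optional sanity check (not part of the benchmark), the resulting schemas can be inspected:

```
SHOW CREATE TABLE mgbench.logs1;
SHOW CREATE TABLE mgbench.logs2;
SHOW CREATE TABLE mgbench.logs3;
```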

Insert data:

```
clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv
clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbench2.csv
clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv
```
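
To verify the load, count the rows in each table (reference counts are not listed here; compare against the line counts of the unpacked CSV files, minus one header line each):

```
SELECT count() FROM mgbench.logs1;
SELECT count() FROM mgbench.logs2;
SELECT count() FROM mgbench.logs3;
```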

Run benchmark queries (the queries reference the tables without a database prefix, so switch to the `mgbench` database first):
```
USE mgbench;

-- Q1.1: What is the CPU/network utilization for each web server since midnight?

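-- (COALESCE replaces missing samples with 0.0, so they still count toward MIN/MAX/AVG)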
SELECT machine_name,
       MIN(cpu) AS cpu_min,
       MAX(cpu) AS cpu_max,
       AVG(cpu) AS cpu_avg,
       MIN(net_in) AS net_in_min,
       MAX(net_in) AS net_in_max,
       AVG(net_in) AS net_in_avg,
       MIN(net_out) AS net_out_min,
       MAX(net_out) AS net_out_max,
       AVG(net_out) AS net_out_avg
FROM (
  SELECT machine_name,
         COALESCE(cpu_user, 0.0) AS cpu,
         COALESCE(bytes_in, 0.0) AS net_in,
         COALESCE(bytes_out, 0.0) AS net_out
  FROM logs1
  WHERE machine_name IN ('anansi','aragog','urd')
    AND log_time >= TIMESTAMP '2017-01-11 00:00:00'
) AS r
GROUP BY machine_name;

-- Q1.2: Which computer lab machines have been offline in the past day?

SELECT machine_name,
       log_time
FROM logs1
WHERE (machine_name LIKE 'cslab%' OR
       machine_name LIKE 'mslab%')
  AND load_one IS NULL
  AND log_time >= TIMESTAMP '2017-01-10 00:00:00'
ORDER BY machine_name,
         log_time;


-- Q1.3: What are the hourly average metrics during the past 10 days for a specific workstation?

SELECT dt,
       hr,
       AVG(load_fifteen) AS load_fifteen_avg,
       AVG(load_five) AS load_five_avg,
       AVG(load_one) AS load_one_avg,
       AVG(mem_free) AS mem_free_avg,
       AVG(swap_free) AS swap_free_avg
FROM (
  SELECT CAST(log_time AS DATE) AS dt,
         EXTRACT(HOUR FROM log_time) AS hr,
         load_fifteen,
         load_five,
         load_one,
         mem_free,
         swap_free
  FROM logs1
  WHERE machine_name = 'babbage'
    AND load_fifteen IS NOT NULL
    AND load_five IS NOT NULL
    AND load_one IS NOT NULL
    AND mem_free IS NOT NULL
    AND swap_free IS NOT NULL
    AND log_time >= TIMESTAMP '2017-01-01 00:00:00'
) AS r
GROUP BY dt,
         hr
ORDER BY dt,
         hr;


-- Q1.4: Over a 1-month period, how often was each server blocked on disk I/O?

SELECT machine_name,
       COUNT(*) AS spikes
FROM logs1
WHERE machine_group = 'Servers'
  AND cpu_wio > 0.99
  AND log_time >= TIMESTAMP '2016-12-01 00:00:00'
  AND log_time < TIMESTAMP '2017-01-01 00:00:00'
GROUP BY machine_name
ORDER BY spikes DESC
LIMIT 10;


-- Q1.5: Which externally reachable VMs have run low on memory?

SELECT machine_name,
       dt,
       MIN(mem_free) AS mem_free_min
FROM (
  SELECT machine_name,
         CAST(log_time AS DATE) AS dt,
         mem_free
  FROM logs1
  WHERE machine_group = 'DMZ'
    AND mem_free IS NOT NULL
) AS r
GROUP BY machine_name,
         dt
HAVING MIN(mem_free) < 10000
ORDER BY machine_name,
         dt;


-- Q1.6: What is the total hourly network traffic across all file servers?

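-- (byte counters are divided by 1000000000.0 to report traffic in GB, with missing samples counted as zero)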
SELECT dt,
       hr,
       SUM(net_in) AS net_in_sum,
       SUM(net_out) AS net_out_sum,
       SUM(net_in) + SUM(net_out) AS both_sum
FROM (
  SELECT CAST(log_time AS DATE) AS dt,
         EXTRACT(HOUR FROM log_time) AS hr,
         COALESCE(bytes_in, 0.0) / 1000000000.0 AS net_in,
         COALESCE(bytes_out, 0.0) / 1000000000.0 AS net_out
  FROM logs1
  WHERE machine_name IN ('allsorts','andes','bigred','blackjack','bonbon',
        'cadbury','chiclets','cotton','crows','dove','fireball','hearts','huey',
        'lindt','milkduds','milkyway','mnm','necco','nerds','orbit','peeps',
        'poprocks','razzles','runts','smarties','smuggler','spree','stride',
        'tootsie','trident','wrigley','york')
) AS r
GROUP BY dt,
         hr
ORDER BY both_sum DESC
LIMIT 10;

-- Q2.1: Which requests have caused server errors within the past 2 weeks?

SELECT *
FROM logs2
WHERE status_code >= 500
  AND log_time >= TIMESTAMP '2012-12-18 00:00:00'
ORDER BY log_time;


-- Q2.2: During a specific 2-week period, was the user password file leaked?

SELECT *
FROM logs2
WHERE status_code >= 200
  AND status_code < 300
  AND request LIKE '%/etc/passwd%'
  AND log_time >= TIMESTAMP '2012-05-06 00:00:00'
  AND log_time < TIMESTAMP '2012-05-20 00:00:00';


-- Q2.3: What was the average path depth for top-level requests in the past month?

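-- (LENGTH(request) - LENGTH(REPLACE(request, '/', '')) counts the '/' characters, i.e. the path depth)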
SELECT top_level,
       AVG(LENGTH(request) - LENGTH(REPLACE(request, '/', ''))) AS depth_avg
FROM (
  SELECT SUBSTRING(request FROM 1 FOR len) AS top_level,
         request
  FROM (
    SELECT POSITION(SUBSTRING(request FROM 2), '/') AS len,
           request
    FROM logs2
    WHERE status_code >= 200
      AND status_code < 300
      AND log_time >= TIMESTAMP '2012-12-01 00:00:00'
  ) AS r
  WHERE len > 0
) AS s
WHERE top_level IN ('/about','/courses','/degrees','/events',
                    '/grad','/industry','/news','/people',
                    '/publications','/research','/teaching','/ugrad')
GROUP BY top_level
ORDER BY top_level;


-- Q2.4: During the last 3 months, which clients have made an excessive number of requests?

SELECT client_ip,
       COUNT(*) AS num_requests
FROM logs2
WHERE log_time >= TIMESTAMP '2012-10-01 00:00:00'
GROUP BY client_ip
HAVING COUNT(*) >= 100000
ORDER BY num_requests DESC;


-- Q2.5: What are the daily unique visitors?

SELECT dt,
       COUNT(DISTINCT client_ip)
FROM (
  SELECT CAST(log_time AS DATE) AS dt,
         client_ip
  FROM logs2
) AS r
GROUP BY dt
ORDER BY dt;


-- Q2.6: What are the average and maximum data transfer rates (Gbps)?

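-- (1 Gbps = 10^9 bits/s = 125000000 bytes/s, hence the divisor on the per-second byte totals)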
SELECT AVG(transfer) / 125000000.0 AS transfer_avg,
       MAX(transfer) / 125000000.0 AS transfer_max
FROM (
  SELECT log_time,
         SUM(object_size) AS transfer
  FROM logs2
  GROUP BY log_time
) AS r;


-- Q3.1: Did the indoor temperature reach freezing over the weekend?

SELECT *
FROM logs3
WHERE event_type = 'temperature'
  AND event_value <= 32.0
  AND log_time >= '2019-11-29 17:00:00.000';


-- Q3.4: Over the past 6 months, how frequently was each door opened?

SELECT device_name,
       device_floor,
       COUNT(*) AS ct
FROM logs3
WHERE event_type = 'door_open'
  AND log_time >= '2019-06-01 00:00:00.000'
GROUP BY device_name,
         device_floor
ORDER BY ct DESC;


-- Q3.5: Where in the building do large temperature variations occur in winter and summer?

WITH temperature AS (
  SELECT dt,
         device_name,
         device_type,
         device_floor
  FROM (
    SELECT dt,
           hr,
           device_name,
           device_type,
           device_floor,
           AVG(event_value) AS temperature_hourly_avg
    FROM (
      SELECT CAST(log_time AS DATE) AS dt,
             EXTRACT(HOUR FROM log_time) AS hr,
             device_name,
             device_type,
             device_floor,
             event_value
      FROM logs3
      WHERE event_type = 'temperature'
    ) AS r
    GROUP BY dt,
             hr,
             device_name,
             device_type,
             device_floor
  ) AS s
  GROUP BY dt,
           device_name,
           device_type,
           device_floor
  HAVING MAX(temperature_hourly_avg) - MIN(temperature_hourly_avg) >= 25.0
)
SELECT DISTINCT device_name,
       device_type,
       device_floor,
       'WINTER'
FROM temperature
WHERE dt >= DATE '2018-12-01'
  AND dt < DATE '2019-03-01'
UNION ALL
SELECT DISTINCT device_name,
       device_type,
       device_floor,
       'SUMMER'
FROM temperature
WHERE dt >= DATE '2019-06-01'
  AND dt < DATE '2019-09-01';


-- Q3.6: For each device category, what are the monthly power consumption metrics?

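-- (each CASE pivots one device category into its own column; rows for other devices become NULL and are ignored by AVG)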
SELECT yr,
       mo,
       SUM(coffee_hourly_avg) AS coffee_monthly_sum,
       AVG(coffee_hourly_avg) AS coffee_monthly_avg,
       SUM(printer_hourly_avg) AS printer_monthly_sum,
       AVG(printer_hourly_avg) AS printer_monthly_avg,
       SUM(projector_hourly_avg) AS projector_monthly_sum,
       AVG(projector_hourly_avg) AS projector_monthly_avg,
       SUM(vending_hourly_avg) AS vending_monthly_sum,
       AVG(vending_hourly_avg) AS vending_monthly_avg
FROM (
  SELECT dt,
         yr,
         mo,
         hr,
         AVG(coffee) AS coffee_hourly_avg,
         AVG(printer) AS printer_hourly_avg,
         AVG(projector) AS projector_hourly_avg,
         AVG(vending) AS vending_hourly_avg
  FROM (
    SELECT CAST(log_time AS DATE) AS dt,
           EXTRACT(YEAR FROM log_time) AS yr,
           EXTRACT(MONTH FROM log_time) AS mo,
           EXTRACT(HOUR FROM log_time) AS hr,
           CASE WHEN device_name LIKE 'coffee%' THEN event_value END AS coffee,
           CASE WHEN device_name LIKE 'printer%' THEN event_value END AS printer,
           CASE WHEN device_name LIKE 'projector%' THEN event_value END AS projector,
           CASE WHEN device_name LIKE 'vending%' THEN event_value END AS vending
    FROM logs3
    WHERE device_type = 'meter'
  ) AS r
  GROUP BY dt,
           yr,
           mo,
           hr
) AS s
GROUP BY yr,
         mo
ORDER BY yr,
         mo;
```
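
To see how long each query took, one option is the query log (a sketch, assuming query logging is enabled via the `log_queries` setting, which is the default):

```
SELECT event_time,
       query_duration_ms,
       substring(query, 1, 60) AS query_head
FROM system.query_log
WHERE type = 'QueryFinish'
ORDER BY event_time DESC
LIMIT 10;
```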

docs/en/getting-started/example-datasets/index.md:
@@ -13,11 +13,12 @@ The list of documented datasets:
 - [GitHub Events](../../getting-started/example-datasets/github-events.md)
 - [Anonymized Yandex.Metrica Dataset](../../getting-started/example-datasets/metrica.md)
 - [Recipes](../../getting-started/example-datasets/recipes.md)
+- [OnTime](../../getting-started/example-datasets/ontime.md)
+- [New York Taxi Data](../../getting-started/example-datasets/nyc-taxi.md)
 - [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md)
 - [WikiStat](../../getting-started/example-datasets/wikistat.md)
 - [Terabyte of Click Logs from Criteo](../../getting-started/example-datasets/criteo.md)
 - [AMPLab Big Data Benchmark](../../getting-started/example-datasets/amplab-benchmark.md)
-- [New York Taxi Data](../../getting-started/example-datasets/nyc-taxi.md)
-- [OnTime](../../getting-started/example-datasets/ontime.md)
+- [Brown University Benchmark](../../getting-started/example-datasets/brown-benchmark.md)
 
 [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets) <!--hide-->
|
Loading…
Reference in New Issue
Block a user