mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Add Chinese Getting started docs (#3399)
This commit is contained in:
parent
5466ca27b8
commit
96435f6880
@ -3,19 +3,19 @@ nav:
|
||||
- '介绍':
|
||||
- '概貌': 'index.md'
|
||||
- 'ClickHouse的独特功能': 'introduction/distinctive_features.md'
|
||||
- 'ClickHouse功能可被视为缺点': 'introduction/features_considered_disadvantages.md'
|
||||
- 'ClickHouse不擅长做什么': 'introduction/features_considered_disadvantages.md'
|
||||
- '性能': 'introduction/performance.md'
|
||||
- 'Yandex.Metrica使用案例': 'introduction/ya_metrika_task.md'
|
||||
|
||||
- '起步':
|
||||
- '入门指南':
|
||||
- '部署运行': 'getting_started/index.md'
|
||||
- '示例数据集':
|
||||
- 'OnTime': 'getting_started/example_datasets/ontime.md'
|
||||
- 'New York Taxi data': 'getting_started/example_datasets/nyc_taxi.md'
|
||||
- 'AMPLab Big Data Benchmark': 'getting_started/example_datasets/amplab_benchmark.md'
|
||||
- 'WikiStat': 'getting_started/example_datasets/wikistat.md'
|
||||
- 'Terabyte click logs from Criteo': 'getting_started/example_datasets/criteo.md'
|
||||
- 'Star Schema Benchmark': 'getting_started/example_datasets/star_schema.md'
|
||||
- '航班飞行数据': 'getting_started/example_datasets/ontime.md'
|
||||
- '纽约市出租车数据': 'getting_started/example_datasets/nyc_taxi.md'
|
||||
- 'AMPLab大数据基准测试': 'getting_started/example_datasets/amplab_benchmark.md'
|
||||
- '维基访问数据': 'getting_started/example_datasets/wikistat.md'
|
||||
- 'Criteo TB级别点击日志': 'getting_started/example_datasets/criteo.md'
|
||||
- 'Star Schema基准测试': 'getting_started/example_datasets/star_schema.md'
|
||||
|
||||
- '客户端':
|
||||
- '介绍': 'interfaces/index.md'
|
||||
|
@ -1 +0,0 @@
|
||||
../../../en/getting_started/example_datasets/amplab_benchmark.md
|
123
docs/zh/getting_started/example_datasets/amplab_benchmark.md
Normal file
123
docs/zh/getting_started/example_datasets/amplab_benchmark.md
Normal file
@ -0,0 +1,123 @@
|
||||
# AMPLab 大数据基准测试
|
||||
|
||||
参考 <https://amplab.cs.berkeley.edu/benchmark/>
|
||||
|
||||
需要您在<https://aws.amazon.com>注册一个免费的账号。注册时需要您提供信用卡、邮箱、电话等信息。之后可以在<https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential>获取新的访问密钥。
|
||||
|
||||
在控制台运行以下命令:
|
||||
|
||||
```bash
|
||||
sudo apt-get install s3cmd
|
||||
mkdir tiny; cd tiny;
|
||||
s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
|
||||
cd ..
|
||||
mkdir 1node; cd 1node;
|
||||
s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
|
||||
cd ..
|
||||
mkdir 5nodes; cd 5nodes;
|
||||
s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
|
||||
cd ..
|
||||
```
|
||||
|
||||
在ClickHouse运行如下查询:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE rankings_tiny
|
||||
(
|
||||
pageURL String,
|
||||
pageRank UInt32,
|
||||
avgDuration UInt32
|
||||
) ENGINE = Log;
|
||||
|
||||
CREATE TABLE uservisits_tiny
|
||||
(
|
||||
sourceIP String,
|
||||
destinationURL String,
|
||||
visitDate Date,
|
||||
adRevenue Float32,
|
||||
UserAgent String,
|
||||
cCode FixedString(3),
|
||||
lCode FixedString(6),
|
||||
searchWord String,
|
||||
duration UInt32
|
||||
) ENGINE = MergeTree(visitDate, visitDate, 8192);
|
||||
|
||||
CREATE TABLE rankings_1node
|
||||
(
|
||||
pageURL String,
|
||||
pageRank UInt32,
|
||||
avgDuration UInt32
|
||||
) ENGINE = Log;
|
||||
|
||||
CREATE TABLE uservisits_1node
|
||||
(
|
||||
sourceIP String,
|
||||
destinationURL String,
|
||||
visitDate Date,
|
||||
adRevenue Float32,
|
||||
UserAgent String,
|
||||
cCode FixedString(3),
|
||||
lCode FixedString(6),
|
||||
searchWord String,
|
||||
duration UInt32
|
||||
) ENGINE = MergeTree(visitDate, visitDate, 8192);
|
||||
|
||||
CREATE TABLE rankings_5nodes_on_single
|
||||
(
|
||||
pageURL String,
|
||||
pageRank UInt32,
|
||||
avgDuration UInt32
|
||||
) ENGINE = Log;
|
||||
|
||||
CREATE TABLE uservisits_5nodes_on_single
|
||||
(
|
||||
sourceIP String,
|
||||
destinationURL String,
|
||||
visitDate Date,
|
||||
adRevenue Float32,
|
||||
UserAgent String,
|
||||
cCode FixedString(3),
|
||||
lCode FixedString(6),
|
||||
searchWord String,
|
||||
duration UInt32
|
||||
) ENGINE = MergeTree(visitDate, visitDate, 8192);
|
||||
```
|
||||
|
||||
回到控制台运行如下命令:
|
||||
|
||||
```bash
|
||||
for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done
|
||||
for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done
|
||||
for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done
|
||||
for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done
|
||||
for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done
|
||||
for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done
|
||||
```
|
||||
|
||||
简单的查询示例:
|
||||
|
||||
``` sql
|
||||
SELECT pageURL, pageRank FROM rankings_1node WHERE pageRank > 1000
|
||||
|
||||
SELECT substring(sourceIP, 1, 8), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 8)
|
||||
|
||||
SELECT
|
||||
sourceIP,
|
||||
sum(adRevenue) AS totalRevenue,
|
||||
avg(pageRank) AS pageRank
|
||||
FROM rankings_1node ALL INNER JOIN
|
||||
(
|
||||
SELECT
|
||||
sourceIP,
|
||||
destinationURL AS pageURL,
|
||||
adRevenue
|
||||
FROM uservisits_1node
|
||||
WHERE (visitDate > '1980-01-01') AND (visitDate < '1980-04-01')
|
||||
) USING pageURL
|
||||
GROUP BY sourceIP
|
||||
ORDER BY totalRevenue DESC
|
||||
LIMIT 1
|
||||
```
|
||||
|
||||
|
||||
[Original article](https://clickhouse.yandex/docs/en/getting_started/example_datasets/amplab_benchmark/) <!--hide-->
|
@ -1 +0,0 @@
|
||||
../../../en/getting_started/example_datasets/criteo.md
|
75
docs/zh/getting_started/example_datasets/criteo.md
Normal file
75
docs/zh/getting_started/example_datasets/criteo.md
Normal file
@ -0,0 +1,75 @@
|
||||
# Criteo TB级别点击日志
|
||||
|
||||
可以从<http://labs.criteo.com/downloads/download-terabyte-click-logs/>上下载数据
|
||||
|
||||
创建原始数据对应的表结构:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log
|
||||
```
|
||||
|
||||
下载数据:
|
||||
|
||||
```bash
|
||||
for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done
|
||||
```
|
||||
|
||||
创建转换后的数据对应的表结构:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE criteo
|
||||
(
|
||||
date Date,
|
||||
clicked UInt8,
|
||||
int1 Int32,
|
||||
int2 Int32,
|
||||
int3 Int32,
|
||||
int4 Int32,
|
||||
int5 Int32,
|
||||
int6 Int32,
|
||||
int7 Int32,
|
||||
int8 Int32,
|
||||
int9 Int32,
|
||||
int10 Int32,
|
||||
int11 Int32,
|
||||
int12 Int32,
|
||||
int13 Int32,
|
||||
icat1 UInt32,
|
||||
icat2 UInt32,
|
||||
icat3 UInt32,
|
||||
icat4 UInt32,
|
||||
icat5 UInt32,
|
||||
icat6 UInt32,
|
||||
icat7 UInt32,
|
||||
icat8 UInt32,
|
||||
icat9 UInt32,
|
||||
icat10 UInt32,
|
||||
icat11 UInt32,
|
||||
icat12 UInt32,
|
||||
icat13 UInt32,
|
||||
icat14 UInt32,
|
||||
icat15 UInt32,
|
||||
icat16 UInt32,
|
||||
icat17 UInt32,
|
||||
icat18 UInt32,
|
||||
icat19 UInt32,
|
||||
icat20 UInt32,
|
||||
icat21 UInt32,
|
||||
icat22 UInt32,
|
||||
icat23 UInt32,
|
||||
icat24 UInt32,
|
||||
icat25 UInt32,
|
||||
icat26 UInt32
|
||||
) ENGINE = MergeTree(date, intHash32(icat1), (date, intHash32(icat1)), 8192)
|
||||
```
|
||||
|
||||
将第一张表中的原始数据转化写入到第二张表中去:
|
||||
|
||||
``` sql
|
||||
INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log;
|
||||
|
||||
DROP TABLE criteo_log;
|
||||
```
|
||||
|
||||
|
||||
[Original article](https://clickhouse.yandex/docs/en/getting_started/example_datasets/criteo/) <!--hide-->
|
@ -1 +0,0 @@
|
||||
../../../en/getting_started/example_datasets/nyc_taxi.md
|
368
docs/zh/getting_started/example_datasets/nyc_taxi.md
Normal file
368
docs/zh/getting_started/example_datasets/nyc_taxi.md
Normal file
File diff suppressed because one or more lines are too long
@ -1 +0,0 @@
|
||||
../../../en/getting_started/example_datasets/ontime.md
|
318
docs/zh/getting_started/example_datasets/ontime.md
Normal file
318
docs/zh/getting_started/example_datasets/ontime.md
Normal file
@ -0,0 +1,318 @@
|
||||
<a name="example_datasets-ontime"></a>
|
||||
|
||||
# 航班飞行数据
|
||||
|
||||
下载数据:
|
||||
|
||||
```bash
|
||||
for s in `seq 1987 2017`
|
||||
do
|
||||
for m in `seq 1 12`
|
||||
do
|
||||
wget http://transtats.bts.gov/PREZIP/On_Time_On_Time_Performance_${s}_${m}.zip
|
||||
done
|
||||
done
|
||||
```
|
||||
|
||||
(引用 <https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh> )
|
||||
|
||||
创建表结构:
|
||||
|
||||
```sql
|
||||
CREATE TABLE `ontime` (
|
||||
`Year` UInt16,
|
||||
`Quarter` UInt8,
|
||||
`Month` UInt8,
|
||||
`DayofMonth` UInt8,
|
||||
`DayOfWeek` UInt8,
|
||||
`FlightDate` Date,
|
||||
`UniqueCarrier` FixedString(7),
|
||||
`AirlineID` Int32,
|
||||
`Carrier` FixedString(2),
|
||||
`TailNum` String,
|
||||
`FlightNum` String,
|
||||
`OriginAirportID` Int32,
|
||||
`OriginAirportSeqID` Int32,
|
||||
`OriginCityMarketID` Int32,
|
||||
`Origin` FixedString(5),
|
||||
`OriginCityName` String,
|
||||
`OriginState` FixedString(2),
|
||||
`OriginStateFips` String,
|
||||
`OriginStateName` String,
|
||||
`OriginWac` Int32,
|
||||
`DestAirportID` Int32,
|
||||
`DestAirportSeqID` Int32,
|
||||
`DestCityMarketID` Int32,
|
||||
`Dest` FixedString(5),
|
||||
`DestCityName` String,
|
||||
`DestState` FixedString(2),
|
||||
`DestStateFips` String,
|
||||
`DestStateName` String,
|
||||
`DestWac` Int32,
|
||||
`CRSDepTime` Int32,
|
||||
`DepTime` Int32,
|
||||
`DepDelay` Int32,
|
||||
`DepDelayMinutes` Int32,
|
||||
`DepDel15` Int32,
|
||||
`DepartureDelayGroups` String,
|
||||
`DepTimeBlk` String,
|
||||
`TaxiOut` Int32,
|
||||
`WheelsOff` Int32,
|
||||
`WheelsOn` Int32,
|
||||
`TaxiIn` Int32,
|
||||
`CRSArrTime` Int32,
|
||||
`ArrTime` Int32,
|
||||
`ArrDelay` Int32,
|
||||
`ArrDelayMinutes` Int32,
|
||||
`ArrDel15` Int32,
|
||||
`ArrivalDelayGroups` Int32,
|
||||
`ArrTimeBlk` String,
|
||||
`Cancelled` UInt8,
|
||||
`CancellationCode` FixedString(1),
|
||||
`Diverted` UInt8,
|
||||
`CRSElapsedTime` Int32,
|
||||
`ActualElapsedTime` Int32,
|
||||
`AirTime` Int32,
|
||||
`Flights` Int32,
|
||||
`Distance` Int32,
|
||||
`DistanceGroup` UInt8,
|
||||
`CarrierDelay` Int32,
|
||||
`WeatherDelay` Int32,
|
||||
`NASDelay` Int32,
|
||||
`SecurityDelay` Int32,
|
||||
`LateAircraftDelay` Int32,
|
||||
`FirstDepTime` String,
|
||||
`TotalAddGTime` String,
|
||||
`LongestAddGTime` String,
|
||||
`DivAirportLandings` String,
|
||||
`DivReachedDest` String,
|
||||
`DivActualElapsedTime` String,
|
||||
`DivArrDelay` String,
|
||||
`DivDistance` String,
|
||||
`Div1Airport` String,
|
||||
`Div1AirportID` Int32,
|
||||
`Div1AirportSeqID` Int32,
|
||||
`Div1WheelsOn` String,
|
||||
`Div1TotalGTime` String,
|
||||
`Div1LongestGTime` String,
|
||||
`Div1WheelsOff` String,
|
||||
`Div1TailNum` String,
|
||||
`Div2Airport` String,
|
||||
`Div2AirportID` Int32,
|
||||
`Div2AirportSeqID` Int32,
|
||||
`Div2WheelsOn` String,
|
||||
`Div2TotalGTime` String,
|
||||
`Div2LongestGTime` String,
|
||||
`Div2WheelsOff` String,
|
||||
`Div2TailNum` String,
|
||||
`Div3Airport` String,
|
||||
`Div3AirportID` Int32,
|
||||
`Div3AirportSeqID` Int32,
|
||||
`Div3WheelsOn` String,
|
||||
`Div3TotalGTime` String,
|
||||
`Div3LongestGTime` String,
|
||||
`Div3WheelsOff` String,
|
||||
`Div3TailNum` String,
|
||||
`Div4Airport` String,
|
||||
`Div4AirportID` Int32,
|
||||
`Div4AirportSeqID` Int32,
|
||||
`Div4WheelsOn` String,
|
||||
`Div4TotalGTime` String,
|
||||
`Div4LongestGTime` String,
|
||||
`Div4WheelsOff` String,
|
||||
`Div4TailNum` String,
|
||||
`Div5Airport` String,
|
||||
`Div5AirportID` Int32,
|
||||
`Div5AirportSeqID` Int32,
|
||||
`Div5WheelsOn` String,
|
||||
`Div5TotalGTime` String,
|
||||
`Div5LongestGTime` String,
|
||||
`Div5WheelsOff` String,
|
||||
`Div5TailNum` String
|
||||
) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192)
|
||||
```
|
||||
|
||||
加载数据:
|
||||
|
||||
```bash
|
||||
for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
|
||||
```
|
||||
|
||||
查询:
|
||||
|
||||
Q0.
|
||||
|
||||
```sql
|
||||
select avg(c1) from (select Year, Month, count(*) as c1 from ontime group by Year, Month);
|
||||
```
|
||||
|
||||
Q1. 查询从2000年到2008年每周各天的航班数
|
||||
|
||||
```sql
|
||||
SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC;
|
||||
```
|
||||
|
||||
Q2. 查询从2000年到2008年每周各天延误超过10分钟的航班数。
|
||||
|
||||
```sql
|
||||
SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC
|
||||
```
|
||||
|
||||
Q3. 查询2000年到2008年每个机场延误超过10分钟的次数
|
||||
|
||||
```sql
|
||||
SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY Origin ORDER BY c DESC LIMIT 10
|
||||
```
|
||||
|
||||
Q4. 查询2007年各航空公司延误超过10分钟的次数
|
||||
|
||||
```sql
|
||||
SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year = 2007 GROUP BY Carrier ORDER BY count(*) DESC
|
||||
```
|
||||
|
||||
Q5. 查询2007年各航空公司延误超过10分钟的航班百分比
|
||||
|
||||
```sql
|
||||
SELECT Carrier, c, c2, c*1000/c2 as c3
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
Carrier,
|
||||
count(*) AS c
|
||||
FROM ontime
|
||||
WHERE DepDelay>10
|
||||
AND Year=2007
|
||||
GROUP BY Carrier
|
||||
)
|
||||
ANY INNER JOIN
|
||||
(
|
||||
SELECT
|
||||
Carrier,
|
||||
count(*) AS c2
|
||||
FROM ontime
|
||||
WHERE Year=2007
|
||||
GROUP BY Carrier
|
||||
) USING Carrier
|
||||
ORDER BY c3 DESC;
|
||||
```
|
||||
|
||||
更好的查询版本:
|
||||
|
||||
```sql
|
||||
SELECT Carrier, avg(DepDelay > 10) * 1000 AS c3 FROM ontime WHERE Year = 2007 GROUP BY Carrier ORDER BY Carrier
|
||||
```
|
||||
|
||||
Q6. 与上一个查询类似,只是查询范围扩大到2000年到2008年
|
||||
|
||||
```sql
|
||||
SELECT Carrier, c, c2, c*1000/c2 as c3
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
Carrier,
|
||||
count(*) AS c
|
||||
FROM ontime
|
||||
WHERE DepDelay>10
|
||||
AND Year >= 2000 AND Year <= 2008
|
||||
GROUP BY Carrier
|
||||
)
|
||||
ANY INNER JOIN
|
||||
(
|
||||
SELECT
|
||||
Carrier,
|
||||
count(*) AS c2
|
||||
FROM ontime
|
||||
WHERE Year >= 2000 AND Year <= 2008
|
||||
GROUP BY Carrier
|
||||
) USING Carrier
|
||||
ORDER BY c3 DESC;
|
||||
```
|
||||
|
||||
更好的查询版本:
|
||||
|
||||
```sql
|
||||
SELECT Carrier, avg(DepDelay > 10) * 1000 AS c3 FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY Carrier ORDER BY Carrier
|
||||
```
|
||||
|
||||
Q7. 每年航班延误超过10分钟的百分比
|
||||
|
||||
```sql
|
||||
SELECT Year, c1/c2
|
||||
FROM
|
||||
(
|
||||
select
|
||||
Year,
|
||||
count(*)*1000 as c1
|
||||
from ontime
|
||||
WHERE DepDelay>10
|
||||
GROUP BY Year
|
||||
)
|
||||
ANY INNER JOIN
|
||||
(
|
||||
select
|
||||
Year,
|
||||
count(*) as c2
|
||||
from ontime
|
||||
GROUP BY Year
|
||||
) USING (Year)
|
||||
ORDER BY Year
|
||||
```
|
||||
|
||||
更好的查询版本:
|
||||
|
||||
```sql
|
||||
SELECT Year, avg(DepDelay > 10) FROM ontime GROUP BY Year ORDER BY Year
|
||||
```
|
||||
|
||||
Q8. 每年最受人们喜爱的目的地
|
||||
|
||||
```sql
|
||||
SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
|
||||
```
|
||||
|
||||
Q9.
|
||||
|
||||
```sql
|
||||
select Year, count(*) as c1 from ontime group by Year;
|
||||
```
|
||||
|
||||
Q10.
|
||||
|
||||
```sql
|
||||
select
|
||||
min(Year), max(Year), Carrier, count(*) as cnt,
|
||||
sum(ArrDelayMinutes>30) as flights_delayed,
|
||||
round(sum(ArrDelayMinutes>30)/count(*),2) as rate
|
||||
FROM ontime
|
||||
WHERE
|
||||
DayOfWeek not in (6,7) and OriginState not in ('AK', 'HI', 'PR', 'VI')
|
||||
and DestState not in ('AK', 'HI', 'PR', 'VI')
|
||||
and FlightDate < '2010-01-01'
|
||||
GROUP by Carrier
|
||||
HAVING cnt > 100000 and max(Year) > 1990
|
||||
ORDER by rate DESC
|
||||
LIMIT 1000;
|
||||
```
|
||||
|
||||
Bonus:
|
||||
|
||||
```sql
|
||||
SELECT avg(cnt) FROM (SELECT Year,Month,count(*) AS cnt FROM ontime WHERE DepDel15=1 GROUP BY Year,Month)
|
||||
|
||||
select avg(c1) from (select Year,Month,count(*) as c1 from ontime group by Year,Month)
|
||||
|
||||
SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
|
||||
|
||||
SELECT OriginCityName, DestCityName, count() AS c FROM ontime GROUP BY OriginCityName, DestCityName ORDER BY c DESC LIMIT 10;
|
||||
|
||||
SELECT OriginCityName, count() AS c FROM ontime GROUP BY OriginCityName ORDER BY c DESC LIMIT 10;
|
||||
```
|
||||
|
||||
这个性能测试由Vadim Tkachenko提供。参考:
|
||||
|
||||
- <https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/>
|
||||
- <https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/>
|
||||
- <https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/>
|
||||
- <https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/>
|
||||
- <https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/>
|
||||
- <http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html>
|
@ -1 +0,0 @@
|
||||
../../../en/getting_started/example_datasets/star_schema.md
|
87
docs/zh/getting_started/example_datasets/star_schema.md
Normal file
87
docs/zh/getting_started/example_datasets/star_schema.md
Normal file
@ -0,0 +1,87 @@
|
||||
# Star Schema 基准测试
|
||||
|
||||
编译 dbgen: <https://github.com/vadimtk/ssb-dbgen>
|
||||
|
||||
```bash
|
||||
git clone git@github.com:vadimtk/ssb-dbgen.git
|
||||
cd ssb-dbgen
|
||||
make
|
||||
```
|
||||
|
||||
在编译过程中可能会有一些警告,这是正常的。
|
||||
|
||||
将`dbgen`和`dists.dss`放在一个可用容量大于800GB的磁盘中。
|
||||
|
||||
开始生成数据:
|
||||
|
||||
```bash
|
||||
./dbgen -s 1000 -T c
|
||||
./dbgen -s 1000 -T l
|
||||
```
|
||||
|
||||
在ClickHouse中创建表结构:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE lineorder (
|
||||
LO_ORDERKEY UInt32,
|
||||
LO_LINENUMBER UInt8,
|
||||
LO_CUSTKEY UInt32,
|
||||
LO_PARTKEY UInt32,
|
||||
LO_SUPPKEY UInt32,
|
||||
LO_ORDERDATE Date,
|
||||
LO_ORDERPRIORITY String,
|
||||
LO_SHIPPRIORITY UInt8,
|
||||
LO_QUANTITY UInt8,
|
||||
LO_EXTENDEDPRICE UInt32,
|
||||
LO_ORDTOTALPRICE UInt32,
|
||||
LO_DISCOUNT UInt8,
|
||||
LO_REVENUE UInt32,
|
||||
LO_SUPPLYCOST UInt32,
|
||||
LO_TAX UInt8,
|
||||
LO_COMMITDATE Date,
|
||||
LO_SHIPMODE String
|
||||
)Engine=MergeTree(LO_ORDERDATE,(LO_ORDERKEY,LO_LINENUMBER,LO_ORDERDATE),8192);
|
||||
|
||||
CREATE TABLE customer (
|
||||
C_CUSTKEY UInt32,
|
||||
C_NAME String,
|
||||
C_ADDRESS String,
|
||||
C_CITY String,
|
||||
C_NATION String,
|
||||
C_REGION String,
|
||||
C_PHONE String,
|
||||
C_MKTSEGMENT String,
|
||||
C_FAKEDATE Date
|
||||
)Engine=MergeTree(C_FAKEDATE,(C_CUSTKEY,C_FAKEDATE),8192);
|
||||
|
||||
CREATE TABLE part (
|
||||
P_PARTKEY UInt32,
|
||||
P_NAME String,
|
||||
P_MFGR String,
|
||||
P_CATEGORY String,
|
||||
P_BRAND String,
|
||||
P_COLOR String,
|
||||
P_TYPE String,
|
||||
P_SIZE UInt8,
|
||||
P_CONTAINER String,
|
||||
P_FAKEDATE Date
|
||||
)Engine=MergeTree(P_FAKEDATE,(P_PARTKEY,P_FAKEDATE),8192);
|
||||
|
||||
CREATE TABLE lineorderd AS lineorder ENGINE = Distributed(perftest_3shards_1replicas, default, lineorder, rand());
|
||||
CREATE TABLE customerd AS customer ENGINE = Distributed(perftest_3shards_1replicas, default, customer, rand());
|
||||
CREATE TABLE partd AS part ENGINE = Distributed(perftest_3shards_1replicas, default, part, rand());
|
||||
```
|
||||
|
||||
如果是在单节点中进行的测试,那么只需要创建对应的MergeTree表。
|
||||
如果是在多节点中进行的测试,您需要在配置文件中配置`perftest_3shards_1replicas`集群的信息。
|
||||
然后在每个节点中同时创建MergeTree表和Distributed表。
|
||||
|
||||
加载数据(如果您是分布式测试的话将'customer'更改为'customerd'):
|
||||
|
||||
```bash
|
||||
cat customer.tbl | sed 's/$/2000-01-01/' | clickhouse-client --query "INSERT INTO customer FORMAT CSV"
|
||||
cat lineorder.tbl | clickhouse-client --query "INSERT INTO lineorder FORMAT CSV"
|
||||
```
|
||||
|
||||
|
||||
[Original article](https://clickhouse.yandex/docs/en/getting_started/example_datasets/star_schema/) <!--hide-->
|
@ -1 +0,0 @@
|
||||
../../../en/getting_started/example_datasets/wikistat.md
|
29
docs/zh/getting_started/example_datasets/wikistat.md
Normal file
29
docs/zh/getting_started/example_datasets/wikistat.md
Normal file
@ -0,0 +1,29 @@
|
||||
# 维基访问数据
|
||||
|
||||
参考: <http://dumps.wikimedia.org/other/pagecounts-raw/>
|
||||
|
||||
创建表结构:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE wikistat
|
||||
(
|
||||
date Date,
|
||||
time DateTime,
|
||||
project String,
|
||||
subproject String,
|
||||
path String,
|
||||
hits UInt64,
|
||||
size UInt64
|
||||
) ENGINE = MergeTree(date, (path, time), 8192);
|
||||
```
|
||||
|
||||
加载数据:
|
||||
|
||||
```bash
|
||||
for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
|
||||
cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
|
||||
ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
|
||||
```
|
||||
|
||||
|
||||
[Original article](https://clickhouse.yandex/docs/en/getting_started/example_datasets/wikistat/) <!--hide-->
|
@ -1 +0,0 @@
|
||||
../../en/getting_started/index.md
|
141
docs/zh/getting_started/index.md
Normal file
141
docs/zh/getting_started/index.md
Normal file
@ -0,0 +1,141 @@
|
||||
# 入门指南
|
||||
|
||||
## 系统要求
|
||||
|
||||
如果从官方仓库安装,需要确保您使用的是x86_64处理器构架的Linux并且支持SSE 4.2指令集
|
||||
|
||||
检查是否支持SSE 4.2:
|
||||
|
||||
```bash
|
||||
grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"
|
||||
```
|
||||
|
||||
我们推荐使用Ubuntu或者Debian。终端必须使用UTF-8编码。
|
||||
|
||||
基于rpm的系统,你可以使用第三方的安装包:https://packagecloud.io/altinity/clickhouse 或者直接安装debian安装包。
|
||||
|
||||
ClickHouse还可以在FreeBSD与Mac OS X上工作。同时它可以在不支持SSE 4.2的x86_64构架和AArch64 CPUs上编译。
|
||||
|
||||
## 安装
|
||||
|
||||
为了测试和开发,系统可以安装在单个服务器或普通PC机上。
|
||||
|
||||
### 为Debian/Ubuntu安装
|
||||
|
||||
在`/etc/apt/sources.list` (或创建`/etc/apt/sources.list.d/clickhouse.list`文件)中添加仓库:
|
||||
|
||||
```text
|
||||
deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
|
||||
```
|
||||
|
||||
如果你想使用最新的测试版本,请使用'testing'替换'stable'。
|
||||
|
||||
然后运行:
|
||||
|
||||
```bash
|
||||
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 # optional
|
||||
sudo apt-get update
|
||||
sudo apt-get install clickhouse-client clickhouse-server
|
||||
```
|
||||
|
||||
你也可以从这里手动下载安装包:<https://repo.yandex.ru/clickhouse/deb/stable/main/>。
|
||||
|
||||
ClickHouse包含访问控制配置,它们位于`users.xml`文件中(与'config.xml'同目录)。
|
||||
默认情况下,允许从任何地方使用默认的‘default’用户无密码的访问ClickHouse。参考‘user/default/networks’。
|
||||
有关更多信息,请参考"Configuration files"部分。
|
||||
|
||||
### 使用源码安装
|
||||
|
||||
具体编译方式可以参考build.md。
|
||||
|
||||
你可以编译并安装它们。
|
||||
你也可以直接使用而不进行安装。
|
||||
|
||||
```text
|
||||
Client: dbms/programs/clickhouse-client
|
||||
Server: dbms/programs/clickhouse-server
|
||||
```
|
||||
|
||||
在服务器中为数据创建如下目录:
|
||||
|
||||
```text
|
||||
/opt/clickhouse/data/default/
|
||||
/opt/clickhouse/metadata/default/
|
||||
```
|
||||
|
||||
(它们可以在server config中配置。)
|
||||
为需要的用户运行‘chown’
|
||||
|
||||
日志的路径可以在server config (src/dbms/programs/server/config.xml)中配置。
|
||||
|
||||
### 其他的安装方法
|
||||
|
||||
Docker image:<https://hub.docker.com/r/yandex/clickhouse-server/>
|
||||
|
||||
CentOS或RHEL安装包:<https://github.com/Altinity/clickhouse-rpm-install>
|
||||
|
||||
Gentoo:`emerge clickhouse`
|
||||
|
||||
## 启动
|
||||
|
||||
可以运行如下命令在后台启动服务:
|
||||
|
||||
```bash
|
||||
sudo service clickhouse-server start
|
||||
```
|
||||
|
||||
可以在`/var/log/clickhouse-server/`目录中查看日志。
|
||||
|
||||
如果服务没有启动,请检查配置文件 `/etc/clickhouse-server/config.xml`。
|
||||
|
||||
你也可以在控制台中直接启动服务:
|
||||
|
||||
```bash
|
||||
clickhouse-server --config-file=/etc/clickhouse-server/config.xml
|
||||
```
|
||||
|
||||
在这种情况下,日志将被打印到控制台中,这在开发过程中很方便。
|
||||
如果配置文件在当前目录中,你可以不指定‘--config-file’参数。它默认使用‘./config.xml’。
|
||||
|
||||
你可以使用命令行客户端连接到服务:
|
||||
|
||||
```bash
|
||||
clickhouse-client
|
||||
```
|
||||
|
||||
默认情况下它使用‘default’用户无密码的与localhost:9000服务建立连接。
|
||||
客户端也可以用于连接远程服务,例如:
|
||||
|
||||
```bash
|
||||
clickhouse-client --host=example.com
|
||||
```
|
||||
|
||||
有关更多信息,请参考"Command-line client"部分。
|
||||
|
||||
检查系统是否工作:
|
||||
|
||||
```bash
|
||||
milovidov@hostname:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
|
||||
ClickHouse client version 0.0.18749.
|
||||
Connecting to localhost:9000.
|
||||
Connected to ClickHouse server version 0.0.18749.
|
||||
|
||||
:) SELECT 1
|
||||
|
||||
SELECT 1
|
||||
|
||||
┌─1─┐
|
||||
│ 1 │
|
||||
└───┘
|
||||
|
||||
1 rows in set. Elapsed: 0.003 sec.
|
||||
|
||||
:)
|
||||
```
|
||||
|
||||
**恭喜,系统已经工作了!**
|
||||
|
||||
为了继续进行实验,你可以尝试下载测试数据集。
|
||||
|
||||
|
||||
[Original article](https://clickhouse.yandex/docs/en/getting_started/) <!--hide-->
|
Loading…
Reference in New Issue
Block a user