Merge pull request #21105 from eaxdev/fix_doc

Actualize OnTime example
This commit is contained in:
Maksim Kita 2021-05-02 14:23:04 +03:00 committed by GitHub
commit 6f08f945e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 517 additions and 513 deletions

View File

@ -21,18 +21,19 @@ echo https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performa
Creating a table:
``` sql
CREATE TABLE `ontime` (
CREATE TABLE `ontime`
(
`Year` UInt16,
`Quarter` UInt8,
`Month` UInt8,
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`UniqueCarrier` FixedString(7),
`AirlineID` Int32,
`Carrier` FixedString(2),
`TailNum` String,
`FlightNum` String,
`Reporting_Airline` String,
`DOT_ID_Reporting_Airline` Int32,
`IATA_CODE_Reporting_Airline` String,
`Tail_Number` Int32,
`Flight_Number_Reporting_Airline` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
@ -74,7 +75,7 @@ CREATE TABLE `ontime` (
`Diverted` UInt8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Int32,
`AirTime` Nullable(Int32),
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
@ -132,9 +133,9 @@ CREATE TABLE `ontime` (
`Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree
PARTITION BY Year
ORDER BY (Carrier, FlightDate)
SETTINGS index_granularity = 8192;
PARTITION BY Year
ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192;
```
Loading data with multiple threads:
@ -206,7 +207,7 @@ LIMIT 10;
Q4. The number of delays by carrier for 2007
``` sql
SELECT Carrier, count(*)
SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime
WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier
@ -220,29 +221,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year=2007
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year=2007
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
Better version of the same query:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year=2007
GROUP BY Carrier
@ -256,29 +257,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year>=2000 AND Year<=2008
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
Better version of the same query:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
@ -297,7 +298,7 @@ FROM
from ontime
WHERE DepDelay>10
GROUP BY Year
)
) q
JOIN
(
select
@ -305,7 +306,7 @@ JOIN
count(*) as c2
from ontime
GROUP BY Year
) USING (Year)
) qq USING (Year)
ORDER BY Year;
```
@ -340,7 +341,7 @@ Q10.
``` sql
SELECT
min(Year), max(Year), Carrier, count(*) AS cnt,
min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime

View File

@ -29,18 +29,19 @@ done
テーブルの作成:
``` sql
CREATE TABLE `ontime` (
CREATE TABLE `ontime`
(
`Year` UInt16,
`Quarter` UInt8,
`Month` UInt8,
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`UniqueCarrier` FixedString(7),
`AirlineID` Int32,
`Carrier` FixedString(2),
`TailNum` String,
`FlightNum` String,
`Reporting_Airline` String,
`DOT_ID_Reporting_Airline` Int32,
`IATA_CODE_Reporting_Airline` String,
`Tail_Number` Int32,
`Flight_Number_Reporting_Airline` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
@ -82,7 +83,7 @@ CREATE TABLE `ontime` (
`Diverted` UInt8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Int32,
`AirTime` Nullable(Int32),
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
@ -140,15 +141,15 @@ CREATE TABLE `ontime` (
`Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree
PARTITION BY Year
ORDER BY (Carrier, FlightDate)
SETTINGS index_granularity = 8192;
PARTITION BY Year
ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192;
```
データのロード:
``` bash
$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
```
## パーティション済みデータのダウンロード {#download-of-prepared-partitions}
@ -212,10 +213,10 @@ LIMIT 10;
Q4. 2007年のキャリア別の遅延の数
``` sql
SELECT Carrier, count(*)
SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime
WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier
GROUP BY IATA_CODE_Reporting_Airline
ORDER BY count(*) DESC;
```
@ -226,32 +227,32 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year=2007
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year=2007
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
同じクエリのより良いバージョン:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year=2007
GROUP BY Carrier
GROUP BY IATA_CODE_Reporting_Airline
ORDER BY c3 DESC
```
@ -262,29 +263,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year>=2000 AND Year<=2008
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
同じクエリのより良いバージョン:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
@ -303,7 +304,7 @@ FROM
from ontime
WHERE DepDelay>10
GROUP BY Year
)
) q
JOIN
(
select
@ -311,7 +312,7 @@ JOIN
count(*) as c2
from ontime
GROUP BY Year
) USING (Year)
) qq USING (Year)
ORDER BY Year;
```
@ -346,7 +347,7 @@ Q10.
``` sql
SELECT
min(Year), max(Year), Carrier, count(*) AS cnt,
min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime

View File

@ -27,18 +27,19 @@ done
Создание таблицы:
``` sql
CREATE TABLE `ontime` (
CREATE TABLE `ontime`
(
`Year` UInt16,
`Quarter` UInt8,
`Month` UInt8,
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`UniqueCarrier` FixedString(7),
`AirlineID` Int32,
`Carrier` FixedString(2),
`TailNum` String,
`FlightNum` String,
`Reporting_Airline` String,
`DOT_ID_Reporting_Airline` Int32,
`IATA_CODE_Reporting_Airline` String,
`Tail_Number` Int32,
`Flight_Number_Reporting_Airline` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
@ -80,7 +81,7 @@ CREATE TABLE `ontime` (
`Diverted` UInt8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Int32,
`AirTime` Nullable(Int32),
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
@ -138,15 +139,15 @@ CREATE TABLE `ontime` (
`Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree
PARTITION BY Year
ORDER BY (Carrier, FlightDate)
SETTINGS index_granularity = 8192;
PARTITION BY Year
ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192;
```
Загрузка данных:
``` bash
$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
```
## Скачивание готовых партиций {#skachivanie-gotovykh-partitsii}
@ -211,7 +212,7 @@ LIMIT 10;
Q4. Количество задержек по перевозчикам за 2007 год
``` sql
SELECT Carrier, count(*)
SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime
WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier
@ -225,29 +226,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year=2007
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year=2007
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
Более оптимальная версия того же запроса:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year=2007
GROUP BY Carrier
@ -261,29 +262,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year>=2000 AND Year<=2008
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
Более оптимальная версия того же запроса:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
@ -302,7 +303,7 @@ FROM
from ontime
WHERE DepDelay>10
GROUP BY Year
)
) q
JOIN
(
select
@ -310,7 +311,7 @@ JOIN
count(*) as c2
from ontime
GROUP BY Year
) USING (Year)
) qq USING (Year)
ORDER BY Year;
```
@ -346,7 +347,7 @@ Q10.
``` sql
SELECT
min(Year), max(Year), Carrier, count(*) AS cnt,
min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime

View File

@ -29,18 +29,19 @@ done
创建表结构:
``` sql
CREATE TABLE `ontime` (
CREATE TABLE `ontime`
(
`Year` UInt16,
`Quarter` UInt8,
`Month` UInt8,
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`UniqueCarrier` FixedString(7),
`AirlineID` Int32,
`Carrier` FixedString(2),
`TailNum` String,
`FlightNum` String,
`Reporting_Airline` String,
`DOT_ID_Reporting_Airline` Int32,
`IATA_CODE_Reporting_Airline` String,
`Tail_Number` Int32,
`Flight_Number_Reporting_Airline` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
@ -82,7 +83,7 @@ CREATE TABLE `ontime` (
`Diverted` UInt8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Int32,
`AirTime` Nullable(Int32),
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
@ -140,15 +141,15 @@ CREATE TABLE `ontime` (
`Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree
PARTITION BY Year
ORDER BY (Carrier, FlightDate)
SETTINGS index_granularity = 8192;
PARTITION BY Year
ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192;
```
加载数据:
``` bash
$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
```
## 下载预处理好的分区数据 {#xia-zai-yu-chu-li-hao-de-fen-qu-shu-ju}
@ -212,7 +213,7 @@ LIMIT 10;
Q4. 查询2007年各航空公司延误超过10分钟以上的次数
``` sql
SELECT Carrier, count(*)
SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime
WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier
@ -226,29 +227,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year=2007
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year=2007
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
更好的查询版本:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year=2007
GROUP BY Carrier
@ -262,29 +263,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year>=2000 AND Year<=2008
GROUP BY Carrier
)
) q
JOIN
(
SELECT
Carrier,
IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
) USING Carrier
) qq USING Carrier
ORDER BY c3 DESC;
```
更好的查询版本:
``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
@ -303,7 +304,7 @@ FROM
from ontime
WHERE DepDelay>10
GROUP BY Year
)
) q
JOIN
(
select
@ -311,7 +312,7 @@ JOIN
count(*) as c2
from ontime
GROUP BY Year
) USING (Year)
) qq USING (Year)
ORDER BY Year;
```
@ -346,7 +347,7 @@ Q10.
``` sql
SELECT
min(Year), max(Year), Carrier, count(*) AS cnt,
min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime