Merge pull request #21105 from eaxdev/fix_doc

Actualize OnTime example
This commit is contained in:
Maksim Kita 2021-05-02 14:23:04 +03:00 committed by GitHub
commit 6f08f945e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 517 additions and 513 deletions

View File

@ -21,120 +21,121 @@ echo https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performa
Creating a table: Creating a table:
``` sql ``` sql
CREATE TABLE `ontime` ( CREATE TABLE `ontime`
`Year` UInt16, (
`Quarter` UInt8, `Year` UInt16,
`Month` UInt8, `Quarter` UInt8,
`DayofMonth` UInt8, `Month` UInt8,
`DayOfWeek` UInt8, `DayofMonth` UInt8,
`FlightDate` Date, `DayOfWeek` UInt8,
`UniqueCarrier` FixedString(7), `FlightDate` Date,
`AirlineID` Int32, `Reporting_Airline` String,
`Carrier` FixedString(2), `DOT_ID_Reporting_Airline` Int32,
`TailNum` String, `IATA_CODE_Reporting_Airline` String,
`FlightNum` String, `Tail_Number` String,
`OriginAirportID` Int32, `Flight_Number_Reporting_Airline` String,
`OriginAirportSeqID` Int32, `OriginAirportID` Int32,
`OriginCityMarketID` Int32, `OriginAirportSeqID` Int32,
`Origin` FixedString(5), `OriginCityMarketID` Int32,
`OriginCityName` String, `Origin` FixedString(5),
`OriginState` FixedString(2), `OriginCityName` String,
`OriginStateFips` String, `OriginState` FixedString(2),
`OriginStateName` String, `OriginStateFips` String,
`OriginWac` Int32, `OriginStateName` String,
`DestAirportID` Int32, `OriginWac` Int32,
`DestAirportSeqID` Int32, `DestAirportID` Int32,
`DestCityMarketID` Int32, `DestAirportSeqID` Int32,
`Dest` FixedString(5), `DestCityMarketID` Int32,
`DestCityName` String, `Dest` FixedString(5),
`DestState` FixedString(2), `DestCityName` String,
`DestStateFips` String, `DestState` FixedString(2),
`DestStateName` String, `DestStateFips` String,
`DestWac` Int32, `DestStateName` String,
`CRSDepTime` Int32, `DestWac` Int32,
`DepTime` Int32, `CRSDepTime` Int32,
`DepDelay` Int32, `DepTime` Int32,
`DepDelayMinutes` Int32, `DepDelay` Int32,
`DepDel15` Int32, `DepDelayMinutes` Int32,
`DepartureDelayGroups` String, `DepDel15` Int32,
`DepTimeBlk` String, `DepartureDelayGroups` String,
`TaxiOut` Int32, `DepTimeBlk` String,
`WheelsOff` Int32, `TaxiOut` Int32,
`WheelsOn` Int32, `WheelsOff` Int32,
`TaxiIn` Int32, `WheelsOn` Int32,
`CRSArrTime` Int32, `TaxiIn` Int32,
`ArrTime` Int32, `CRSArrTime` Int32,
`ArrDelay` Int32, `ArrTime` Int32,
`ArrDelayMinutes` Int32, `ArrDelay` Int32,
`ArrDel15` Int32, `ArrDelayMinutes` Int32,
`ArrivalDelayGroups` Int32, `ArrDel15` Int32,
`ArrTimeBlk` String, `ArrivalDelayGroups` Int32,
`Cancelled` UInt8, `ArrTimeBlk` String,
`CancellationCode` FixedString(1), `Cancelled` UInt8,
`Diverted` UInt8, `CancellationCode` FixedString(1),
`CRSElapsedTime` Int32, `Diverted` UInt8,
`ActualElapsedTime` Int32, `CRSElapsedTime` Int32,
`AirTime` Int32, `ActualElapsedTime` Int32,
`Flights` Int32, `AirTime` Nullable(Int32),
`Distance` Int32, `Flights` Int32,
`DistanceGroup` UInt8, `Distance` Int32,
`CarrierDelay` Int32, `DistanceGroup` UInt8,
`WeatherDelay` Int32, `CarrierDelay` Int32,
`NASDelay` Int32, `WeatherDelay` Int32,
`SecurityDelay` Int32, `NASDelay` Int32,
`LateAircraftDelay` Int32, `SecurityDelay` Int32,
`FirstDepTime` String, `LateAircraftDelay` Int32,
`TotalAddGTime` String, `FirstDepTime` String,
`LongestAddGTime` String, `TotalAddGTime` String,
`DivAirportLandings` String, `LongestAddGTime` String,
`DivReachedDest` String, `DivAirportLandings` String,
`DivActualElapsedTime` String, `DivReachedDest` String,
`DivArrDelay` String, `DivActualElapsedTime` String,
`DivDistance` String, `DivArrDelay` String,
`Div1Airport` String, `DivDistance` String,
`Div1AirportID` Int32, `Div1Airport` String,
`Div1AirportSeqID` Int32, `Div1AirportID` Int32,
`Div1WheelsOn` String, `Div1AirportSeqID` Int32,
`Div1TotalGTime` String, `Div1WheelsOn` String,
`Div1LongestGTime` String, `Div1TotalGTime` String,
`Div1WheelsOff` String, `Div1LongestGTime` String,
`Div1TailNum` String, `Div1WheelsOff` String,
`Div2Airport` String, `Div1TailNum` String,
`Div2AirportID` Int32, `Div2Airport` String,
`Div2AirportSeqID` Int32, `Div2AirportID` Int32,
`Div2WheelsOn` String, `Div2AirportSeqID` Int32,
`Div2TotalGTime` String, `Div2WheelsOn` String,
`Div2LongestGTime` String, `Div2TotalGTime` String,
`Div2WheelsOff` String, `Div2LongestGTime` String,
`Div2TailNum` String, `Div2WheelsOff` String,
`Div3Airport` String, `Div2TailNum` String,
`Div3AirportID` Int32, `Div3Airport` String,
`Div3AirportSeqID` Int32, `Div3AirportID` Int32,
`Div3WheelsOn` String, `Div3AirportSeqID` Int32,
`Div3TotalGTime` String, `Div3WheelsOn` String,
`Div3LongestGTime` String, `Div3TotalGTime` String,
`Div3WheelsOff` String, `Div3LongestGTime` String,
`Div3TailNum` String, `Div3WheelsOff` String,
`Div4Airport` String, `Div3TailNum` String,
`Div4AirportID` Int32, `Div4Airport` String,
`Div4AirportSeqID` Int32, `Div4AirportID` Int32,
`Div4WheelsOn` String, `Div4AirportSeqID` Int32,
`Div4TotalGTime` String, `Div4WheelsOn` String,
`Div4LongestGTime` String, `Div4TotalGTime` String,
`Div4WheelsOff` String, `Div4LongestGTime` String,
`Div4TailNum` String, `Div4WheelsOff` String,
`Div5Airport` String, `Div4TailNum` String,
`Div5AirportID` Int32, `Div5Airport` String,
`Div5AirportSeqID` Int32, `Div5AirportID` Int32,
`Div5WheelsOn` String, `Div5AirportSeqID` Int32,
`Div5TotalGTime` String, `Div5WheelsOn` String,
`Div5LongestGTime` String, `Div5TotalGTime` String,
`Div5WheelsOff` String, `Div5LongestGTime` String,
`Div5TailNum` String `Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree ) ENGINE = MergeTree
PARTITION BY Year PARTITION BY Year
ORDER BY (Carrier, FlightDate) ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192; SETTINGS index_granularity = 8192;
``` ```
Loading data with multiple threads: Loading data with multiple threads:
@ -206,7 +207,7 @@ LIMIT 10;
Q4. The number of delays by carrier for 2007 Q4. The number of delays by carrier for 2007
``` sql ``` sql
SELECT Carrier, count(*) SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime FROM ontime
WHERE DepDelay>10 AND Year=2007 WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier GROUP BY Carrier
@ -220,29 +221,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year=2007 AND Year=2007
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
Better version of the same query: Better version of the same query:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY Carrier
@ -256,29 +257,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year>=2000 AND Year<=2008 AND Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
Better version of the same query: Better version of the same query:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
@ -297,7 +298,7 @@ FROM
from ontime from ontime
WHERE DepDelay>10 WHERE DepDelay>10
GROUP BY Year GROUP BY Year
) ) q
JOIN JOIN
( (
select select
@ -305,7 +306,7 @@ JOIN
count(*) as c2 count(*) as c2
from ontime from ontime
GROUP BY Year GROUP BY Year
) USING (Year) ) qq USING (Year)
ORDER BY Year; ORDER BY Year;
``` ```
@ -340,7 +341,7 @@ Q10.
``` sql ``` sql
SELECT SELECT
min(Year), max(Year), Carrier, count(*) AS cnt, min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed, sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime FROM ontime

View File

@ -29,126 +29,127 @@ done
テーブルの作成: テーブルの作成:
``` sql ``` sql
CREATE TABLE `ontime` ( CREATE TABLE `ontime`
`Year` UInt16, (
`Quarter` UInt8, `Year` UInt16,
`Month` UInt8, `Quarter` UInt8,
`DayofMonth` UInt8, `Month` UInt8,
`DayOfWeek` UInt8, `DayofMonth` UInt8,
`FlightDate` Date, `DayOfWeek` UInt8,
`UniqueCarrier` FixedString(7), `FlightDate` Date,
`AirlineID` Int32, `Reporting_Airline` String,
`Carrier` FixedString(2), `DOT_ID_Reporting_Airline` Int32,
`TailNum` String, `IATA_CODE_Reporting_Airline` String,
`FlightNum` String, `Tail_Number` String,
`OriginAirportID` Int32, `Flight_Number_Reporting_Airline` String,
`OriginAirportSeqID` Int32, `OriginAirportID` Int32,
`OriginCityMarketID` Int32, `OriginAirportSeqID` Int32,
`Origin` FixedString(5), `OriginCityMarketID` Int32,
`OriginCityName` String, `Origin` FixedString(5),
`OriginState` FixedString(2), `OriginCityName` String,
`OriginStateFips` String, `OriginState` FixedString(2),
`OriginStateName` String, `OriginStateFips` String,
`OriginWac` Int32, `OriginStateName` String,
`DestAirportID` Int32, `OriginWac` Int32,
`DestAirportSeqID` Int32, `DestAirportID` Int32,
`DestCityMarketID` Int32, `DestAirportSeqID` Int32,
`Dest` FixedString(5), `DestCityMarketID` Int32,
`DestCityName` String, `Dest` FixedString(5),
`DestState` FixedString(2), `DestCityName` String,
`DestStateFips` String, `DestState` FixedString(2),
`DestStateName` String, `DestStateFips` String,
`DestWac` Int32, `DestStateName` String,
`CRSDepTime` Int32, `DestWac` Int32,
`DepTime` Int32, `CRSDepTime` Int32,
`DepDelay` Int32, `DepTime` Int32,
`DepDelayMinutes` Int32, `DepDelay` Int32,
`DepDel15` Int32, `DepDelayMinutes` Int32,
`DepartureDelayGroups` String, `DepDel15` Int32,
`DepTimeBlk` String, `DepartureDelayGroups` String,
`TaxiOut` Int32, `DepTimeBlk` String,
`WheelsOff` Int32, `TaxiOut` Int32,
`WheelsOn` Int32, `WheelsOff` Int32,
`TaxiIn` Int32, `WheelsOn` Int32,
`CRSArrTime` Int32, `TaxiIn` Int32,
`ArrTime` Int32, `CRSArrTime` Int32,
`ArrDelay` Int32, `ArrTime` Int32,
`ArrDelayMinutes` Int32, `ArrDelay` Int32,
`ArrDel15` Int32, `ArrDelayMinutes` Int32,
`ArrivalDelayGroups` Int32, `ArrDel15` Int32,
`ArrTimeBlk` String, `ArrivalDelayGroups` Int32,
`Cancelled` UInt8, `ArrTimeBlk` String,
`CancellationCode` FixedString(1), `Cancelled` UInt8,
`Diverted` UInt8, `CancellationCode` FixedString(1),
`CRSElapsedTime` Int32, `Diverted` UInt8,
`ActualElapsedTime` Int32, `CRSElapsedTime` Int32,
`AirTime` Int32, `ActualElapsedTime` Int32,
`Flights` Int32, `AirTime` Nullable(Int32),
`Distance` Int32, `Flights` Int32,
`DistanceGroup` UInt8, `Distance` Int32,
`CarrierDelay` Int32, `DistanceGroup` UInt8,
`WeatherDelay` Int32, `CarrierDelay` Int32,
`NASDelay` Int32, `WeatherDelay` Int32,
`SecurityDelay` Int32, `NASDelay` Int32,
`LateAircraftDelay` Int32, `SecurityDelay` Int32,
`FirstDepTime` String, `LateAircraftDelay` Int32,
`TotalAddGTime` String, `FirstDepTime` String,
`LongestAddGTime` String, `TotalAddGTime` String,
`DivAirportLandings` String, `LongestAddGTime` String,
`DivReachedDest` String, `DivAirportLandings` String,
`DivActualElapsedTime` String, `DivReachedDest` String,
`DivArrDelay` String, `DivActualElapsedTime` String,
`DivDistance` String, `DivArrDelay` String,
`Div1Airport` String, `DivDistance` String,
`Div1AirportID` Int32, `Div1Airport` String,
`Div1AirportSeqID` Int32, `Div1AirportID` Int32,
`Div1WheelsOn` String, `Div1AirportSeqID` Int32,
`Div1TotalGTime` String, `Div1WheelsOn` String,
`Div1LongestGTime` String, `Div1TotalGTime` String,
`Div1WheelsOff` String, `Div1LongestGTime` String,
`Div1TailNum` String, `Div1WheelsOff` String,
`Div2Airport` String, `Div1TailNum` String,
`Div2AirportID` Int32, `Div2Airport` String,
`Div2AirportSeqID` Int32, `Div2AirportID` Int32,
`Div2WheelsOn` String, `Div2AirportSeqID` Int32,
`Div2TotalGTime` String, `Div2WheelsOn` String,
`Div2LongestGTime` String, `Div2TotalGTime` String,
`Div2WheelsOff` String, `Div2LongestGTime` String,
`Div2TailNum` String, `Div2WheelsOff` String,
`Div3Airport` String, `Div2TailNum` String,
`Div3AirportID` Int32, `Div3Airport` String,
`Div3AirportSeqID` Int32, `Div3AirportID` Int32,
`Div3WheelsOn` String, `Div3AirportSeqID` Int32,
`Div3TotalGTime` String, `Div3WheelsOn` String,
`Div3LongestGTime` String, `Div3TotalGTime` String,
`Div3WheelsOff` String, `Div3LongestGTime` String,
`Div3TailNum` String, `Div3WheelsOff` String,
`Div4Airport` String, `Div3TailNum` String,
`Div4AirportID` Int32, `Div4Airport` String,
`Div4AirportSeqID` Int32, `Div4AirportID` Int32,
`Div4WheelsOn` String, `Div4AirportSeqID` Int32,
`Div4TotalGTime` String, `Div4WheelsOn` String,
`Div4LongestGTime` String, `Div4TotalGTime` String,
`Div4WheelsOff` String, `Div4LongestGTime` String,
`Div4TailNum` String, `Div4WheelsOff` String,
`Div5Airport` String, `Div4TailNum` String,
`Div5AirportID` Int32, `Div5Airport` String,
`Div5AirportSeqID` Int32, `Div5AirportID` Int32,
`Div5WheelsOn` String, `Div5AirportSeqID` Int32,
`Div5TotalGTime` String, `Div5WheelsOn` String,
`Div5LongestGTime` String, `Div5TotalGTime` String,
`Div5WheelsOff` String, `Div5LongestGTime` String,
`Div5TailNum` String `Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree ) ENGINE = MergeTree
PARTITION BY Year PARTITION BY Year
ORDER BY (Carrier, FlightDate) ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192; SETTINGS index_granularity = 8192;
``` ```
データのロード: データのロード:
``` bash ``` bash
$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
``` ```
## パーティション済みデータのダウンロード {#download-of-prepared-partitions} ## パーティション済みデータのダウンロード {#download-of-prepared-partitions}
@ -212,10 +213,10 @@ LIMIT 10;
Q4. 2007年のキャリア別の遅延の数 Q4. 2007年のキャリア別の遅延の数
``` sql ``` sql
SELECT Carrier, count(*) SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime FROM ontime
WHERE DepDelay>10 AND Year=2007 WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier GROUP BY IATA_CODE_Reporting_Airline
ORDER BY count(*) DESC; ORDER BY count(*) DESC;
``` ```
@ -226,32 +227,32 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year=2007 AND Year=2007
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
同じクエリのより良いバージョン: 同じクエリのより良いバージョン:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY IATA_CODE_Reporting_Airline
ORDER BY c3 DESC ORDER BY c3 DESC
``` ```
@ -262,29 +263,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year>=2000 AND Year<=2008 AND Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
同じクエリのより良いバージョン: 同じクエリのより良いバージョン:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
@ -303,7 +304,7 @@ FROM
from ontime from ontime
WHERE DepDelay>10 WHERE DepDelay>10
GROUP BY Year GROUP BY Year
) ) q
JOIN JOIN
( (
select select
@ -311,7 +312,7 @@ JOIN
count(*) as c2 count(*) as c2
from ontime from ontime
GROUP BY Year GROUP BY Year
) USING (Year) ) qq USING (Year)
ORDER BY Year; ORDER BY Year;
``` ```
@ -346,7 +347,7 @@ Q10.
``` sql ``` sql
SELECT SELECT
min(Year), max(Year), Carrier, count(*) AS cnt, min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed, sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime FROM ontime

View File

@ -27,126 +27,127 @@ done
Создание таблицы: Создание таблицы:
``` sql ``` sql
CREATE TABLE `ontime` ( CREATE TABLE `ontime`
`Year` UInt16, (
`Quarter` UInt8, `Year` UInt16,
`Month` UInt8, `Quarter` UInt8,
`DayofMonth` UInt8, `Month` UInt8,
`DayOfWeek` UInt8, `DayofMonth` UInt8,
`FlightDate` Date, `DayOfWeek` UInt8,
`UniqueCarrier` FixedString(7), `FlightDate` Date,
`AirlineID` Int32, `Reporting_Airline` String,
`Carrier` FixedString(2), `DOT_ID_Reporting_Airline` Int32,
`TailNum` String, `IATA_CODE_Reporting_Airline` String,
`FlightNum` String, `Tail_Number` String,
`OriginAirportID` Int32, `Flight_Number_Reporting_Airline` String,
`OriginAirportSeqID` Int32, `OriginAirportID` Int32,
`OriginCityMarketID` Int32, `OriginAirportSeqID` Int32,
`Origin` FixedString(5), `OriginCityMarketID` Int32,
`OriginCityName` String, `Origin` FixedString(5),
`OriginState` FixedString(2), `OriginCityName` String,
`OriginStateFips` String, `OriginState` FixedString(2),
`OriginStateName` String, `OriginStateFips` String,
`OriginWac` Int32, `OriginStateName` String,
`DestAirportID` Int32, `OriginWac` Int32,
`DestAirportSeqID` Int32, `DestAirportID` Int32,
`DestCityMarketID` Int32, `DestAirportSeqID` Int32,
`Dest` FixedString(5), `DestCityMarketID` Int32,
`DestCityName` String, `Dest` FixedString(5),
`DestState` FixedString(2), `DestCityName` String,
`DestStateFips` String, `DestState` FixedString(2),
`DestStateName` String, `DestStateFips` String,
`DestWac` Int32, `DestStateName` String,
`CRSDepTime` Int32, `DestWac` Int32,
`DepTime` Int32, `CRSDepTime` Int32,
`DepDelay` Int32, `DepTime` Int32,
`DepDelayMinutes` Int32, `DepDelay` Int32,
`DepDel15` Int32, `DepDelayMinutes` Int32,
`DepartureDelayGroups` String, `DepDel15` Int32,
`DepTimeBlk` String, `DepartureDelayGroups` String,
`TaxiOut` Int32, `DepTimeBlk` String,
`WheelsOff` Int32, `TaxiOut` Int32,
`WheelsOn` Int32, `WheelsOff` Int32,
`TaxiIn` Int32, `WheelsOn` Int32,
`CRSArrTime` Int32, `TaxiIn` Int32,
`ArrTime` Int32, `CRSArrTime` Int32,
`ArrDelay` Int32, `ArrTime` Int32,
`ArrDelayMinutes` Int32, `ArrDelay` Int32,
`ArrDel15` Int32, `ArrDelayMinutes` Int32,
`ArrivalDelayGroups` Int32, `ArrDel15` Int32,
`ArrTimeBlk` String, `ArrivalDelayGroups` Int32,
`Cancelled` UInt8, `ArrTimeBlk` String,
`CancellationCode` FixedString(1), `Cancelled` UInt8,
`Diverted` UInt8, `CancellationCode` FixedString(1),
`CRSElapsedTime` Int32, `Diverted` UInt8,
`ActualElapsedTime` Int32, `CRSElapsedTime` Int32,
`AirTime` Int32, `ActualElapsedTime` Int32,
`Flights` Int32, `AirTime` Nullable(Int32),
`Distance` Int32, `Flights` Int32,
`DistanceGroup` UInt8, `Distance` Int32,
`CarrierDelay` Int32, `DistanceGroup` UInt8,
`WeatherDelay` Int32, `CarrierDelay` Int32,
`NASDelay` Int32, `WeatherDelay` Int32,
`SecurityDelay` Int32, `NASDelay` Int32,
`LateAircraftDelay` Int32, `SecurityDelay` Int32,
`FirstDepTime` String, `LateAircraftDelay` Int32,
`TotalAddGTime` String, `FirstDepTime` String,
`LongestAddGTime` String, `TotalAddGTime` String,
`DivAirportLandings` String, `LongestAddGTime` String,
`DivReachedDest` String, `DivAirportLandings` String,
`DivActualElapsedTime` String, `DivReachedDest` String,
`DivArrDelay` String, `DivActualElapsedTime` String,
`DivDistance` String, `DivArrDelay` String,
`Div1Airport` String, `DivDistance` String,
`Div1AirportID` Int32, `Div1Airport` String,
`Div1AirportSeqID` Int32, `Div1AirportID` Int32,
`Div1WheelsOn` String, `Div1AirportSeqID` Int32,
`Div1TotalGTime` String, `Div1WheelsOn` String,
`Div1LongestGTime` String, `Div1TotalGTime` String,
`Div1WheelsOff` String, `Div1LongestGTime` String,
`Div1TailNum` String, `Div1WheelsOff` String,
`Div2Airport` String, `Div1TailNum` String,
`Div2AirportID` Int32, `Div2Airport` String,
`Div2AirportSeqID` Int32, `Div2AirportID` Int32,
`Div2WheelsOn` String, `Div2AirportSeqID` Int32,
`Div2TotalGTime` String, `Div2WheelsOn` String,
`Div2LongestGTime` String, `Div2TotalGTime` String,
`Div2WheelsOff` String, `Div2LongestGTime` String,
`Div2TailNum` String, `Div2WheelsOff` String,
`Div3Airport` String, `Div2TailNum` String,
`Div3AirportID` Int32, `Div3Airport` String,
`Div3AirportSeqID` Int32, `Div3AirportID` Int32,
`Div3WheelsOn` String, `Div3AirportSeqID` Int32,
`Div3TotalGTime` String, `Div3WheelsOn` String,
`Div3LongestGTime` String, `Div3TotalGTime` String,
`Div3WheelsOff` String, `Div3LongestGTime` String,
`Div3TailNum` String, `Div3WheelsOff` String,
`Div4Airport` String, `Div3TailNum` String,
`Div4AirportID` Int32, `Div4Airport` String,
`Div4AirportSeqID` Int32, `Div4AirportID` Int32,
`Div4WheelsOn` String, `Div4AirportSeqID` Int32,
`Div4TotalGTime` String, `Div4WheelsOn` String,
`Div4LongestGTime` String, `Div4TotalGTime` String,
`Div4WheelsOff` String, `Div4LongestGTime` String,
`Div4TailNum` String, `Div4WheelsOff` String,
`Div5Airport` String, `Div4TailNum` String,
`Div5AirportID` Int32, `Div5Airport` String,
`Div5AirportSeqID` Int32, `Div5AirportID` Int32,
`Div5WheelsOn` String, `Div5AirportSeqID` Int32,
`Div5TotalGTime` String, `Div5WheelsOn` String,
`Div5LongestGTime` String, `Div5TotalGTime` String,
`Div5WheelsOff` String, `Div5LongestGTime` String,
`Div5TailNum` String `Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree ) ENGINE = MergeTree
PARTITION BY Year PARTITION BY Year
ORDER BY (Carrier, FlightDate) ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192; SETTINGS index_granularity = 8192;
``` ```
Загрузка данных: Загрузка данных:
``` bash ``` bash
$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
``` ```
## Скачивание готовых партиций {#skachivanie-gotovykh-partitsii} ## Скачивание готовых партиций {#skachivanie-gotovykh-partitsii}
@ -211,7 +212,7 @@ LIMIT 10;
Q4. Количество задержек по перевозчикам за 2007 год Q4. Количество задержек по перевозчикам за 2007 год
``` sql ``` sql
SELECT Carrier, count(*) SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime FROM ontime
WHERE DepDelay>10 AND Year=2007 WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier GROUP BY Carrier
@ -225,29 +226,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year=2007 AND Year=2007
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
Более оптимальная версия того же запроса: Более оптимальная версия того же запроса:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY Carrier
@ -261,29 +262,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year>=2000 AND Year<=2008 AND Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
Более оптимальная версия того же запроса: Более оптимальная версия того же запроса:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
@ -302,7 +303,7 @@ FROM
from ontime from ontime
WHERE DepDelay>10 WHERE DepDelay>10
GROUP BY Year GROUP BY Year
) ) q
JOIN JOIN
( (
select select
@ -310,7 +311,7 @@ JOIN
count(*) as c2 count(*) as c2
from ontime from ontime
GROUP BY Year GROUP BY Year
) USING (Year) ) qq USING (Year)
ORDER BY Year; ORDER BY Year;
``` ```
@ -346,7 +347,7 @@ Q10.
``` sql ``` sql
SELECT SELECT
min(Year), max(Year), Carrier, count(*) AS cnt, min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed, sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime FROM ontime

View File

@ -29,126 +29,127 @@ done
创建表结构: 创建表结构:
``` sql ``` sql
CREATE TABLE `ontime` ( CREATE TABLE `ontime`
`Year` UInt16, (
`Quarter` UInt8, `Year` UInt16,
`Month` UInt8, `Quarter` UInt8,
`DayofMonth` UInt8, `Month` UInt8,
`DayOfWeek` UInt8, `DayofMonth` UInt8,
`FlightDate` Date, `DayOfWeek` UInt8,
`UniqueCarrier` FixedString(7), `FlightDate` Date,
`AirlineID` Int32, `Reporting_Airline` String,
`Carrier` FixedString(2), `DOT_ID_Reporting_Airline` Int32,
`TailNum` String, `IATA_CODE_Reporting_Airline` String,
`FlightNum` String, `Tail_Number` String,
`OriginAirportID` Int32, `Flight_Number_Reporting_Airline` String,
`OriginAirportSeqID` Int32, `OriginAirportID` Int32,
`OriginCityMarketID` Int32, `OriginAirportSeqID` Int32,
`Origin` FixedString(5), `OriginCityMarketID` Int32,
`OriginCityName` String, `Origin` FixedString(5),
`OriginState` FixedString(2), `OriginCityName` String,
`OriginStateFips` String, `OriginState` FixedString(2),
`OriginStateName` String, `OriginStateFips` String,
`OriginWac` Int32, `OriginStateName` String,
`DestAirportID` Int32, `OriginWac` Int32,
`DestAirportSeqID` Int32, `DestAirportID` Int32,
`DestCityMarketID` Int32, `DestAirportSeqID` Int32,
`Dest` FixedString(5), `DestCityMarketID` Int32,
`DestCityName` String, `Dest` FixedString(5),
`DestState` FixedString(2), `DestCityName` String,
`DestStateFips` String, `DestState` FixedString(2),
`DestStateName` String, `DestStateFips` String,
`DestWac` Int32, `DestStateName` String,
`CRSDepTime` Int32, `DestWac` Int32,
`DepTime` Int32, `CRSDepTime` Int32,
`DepDelay` Int32, `DepTime` Int32,
`DepDelayMinutes` Int32, `DepDelay` Int32,
`DepDel15` Int32, `DepDelayMinutes` Int32,
`DepartureDelayGroups` String, `DepDel15` Int32,
`DepTimeBlk` String, `DepartureDelayGroups` String,
`TaxiOut` Int32, `DepTimeBlk` String,
`WheelsOff` Int32, `TaxiOut` Int32,
`WheelsOn` Int32, `WheelsOff` Int32,
`TaxiIn` Int32, `WheelsOn` Int32,
`CRSArrTime` Int32, `TaxiIn` Int32,
`ArrTime` Int32, `CRSArrTime` Int32,
`ArrDelay` Int32, `ArrTime` Int32,
`ArrDelayMinutes` Int32, `ArrDelay` Int32,
`ArrDel15` Int32, `ArrDelayMinutes` Int32,
`ArrivalDelayGroups` Int32, `ArrDel15` Int32,
`ArrTimeBlk` String, `ArrivalDelayGroups` Int32,
`Cancelled` UInt8, `ArrTimeBlk` String,
`CancellationCode` FixedString(1), `Cancelled` UInt8,
`Diverted` UInt8, `CancellationCode` FixedString(1),
`CRSElapsedTime` Int32, `Diverted` UInt8,
`ActualElapsedTime` Int32, `CRSElapsedTime` Int32,
`AirTime` Int32, `ActualElapsedTime` Int32,
`Flights` Int32, `AirTime` Nullable(Int32),
`Distance` Int32, `Flights` Int32,
`DistanceGroup` UInt8, `Distance` Int32,
`CarrierDelay` Int32, `DistanceGroup` UInt8,
`WeatherDelay` Int32, `CarrierDelay` Int32,
`NASDelay` Int32, `WeatherDelay` Int32,
`SecurityDelay` Int32, `NASDelay` Int32,
`LateAircraftDelay` Int32, `SecurityDelay` Int32,
`FirstDepTime` String, `LateAircraftDelay` Int32,
`TotalAddGTime` String, `FirstDepTime` String,
`LongestAddGTime` String, `TotalAddGTime` String,
`DivAirportLandings` String, `LongestAddGTime` String,
`DivReachedDest` String, `DivAirportLandings` String,
`DivActualElapsedTime` String, `DivReachedDest` String,
`DivArrDelay` String, `DivActualElapsedTime` String,
`DivDistance` String, `DivArrDelay` String,
`Div1Airport` String, `DivDistance` String,
`Div1AirportID` Int32, `Div1Airport` String,
`Div1AirportSeqID` Int32, `Div1AirportID` Int32,
`Div1WheelsOn` String, `Div1AirportSeqID` Int32,
`Div1TotalGTime` String, `Div1WheelsOn` String,
`Div1LongestGTime` String, `Div1TotalGTime` String,
`Div1WheelsOff` String, `Div1LongestGTime` String,
`Div1TailNum` String, `Div1WheelsOff` String,
`Div2Airport` String, `Div1TailNum` String,
`Div2AirportID` Int32, `Div2Airport` String,
`Div2AirportSeqID` Int32, `Div2AirportID` Int32,
`Div2WheelsOn` String, `Div2AirportSeqID` Int32,
`Div2TotalGTime` String, `Div2WheelsOn` String,
`Div2LongestGTime` String, `Div2TotalGTime` String,
`Div2WheelsOff` String, `Div2LongestGTime` String,
`Div2TailNum` String, `Div2WheelsOff` String,
`Div3Airport` String, `Div2TailNum` String,
`Div3AirportID` Int32, `Div3Airport` String,
`Div3AirportSeqID` Int32, `Div3AirportID` Int32,
`Div3WheelsOn` String, `Div3AirportSeqID` Int32,
`Div3TotalGTime` String, `Div3WheelsOn` String,
`Div3LongestGTime` String, `Div3TotalGTime` String,
`Div3WheelsOff` String, `Div3LongestGTime` String,
`Div3TailNum` String, `Div3WheelsOff` String,
`Div4Airport` String, `Div3TailNum` String,
`Div4AirportID` Int32, `Div4Airport` String,
`Div4AirportSeqID` Int32, `Div4AirportID` Int32,
`Div4WheelsOn` String, `Div4AirportSeqID` Int32,
`Div4TotalGTime` String, `Div4WheelsOn` String,
`Div4LongestGTime` String, `Div4TotalGTime` String,
`Div4WheelsOff` String, `Div4LongestGTime` String,
`Div4TailNum` String, `Div4WheelsOff` String,
`Div5Airport` String, `Div4TailNum` String,
`Div5AirportID` Int32, `Div5Airport` String,
`Div5AirportSeqID` Int32, `Div5AirportID` Int32,
`Div5WheelsOn` String, `Div5AirportSeqID` Int32,
`Div5TotalGTime` String, `Div5WheelsOn` String,
`Div5LongestGTime` String, `Div5TotalGTime` String,
`Div5WheelsOff` String, `Div5LongestGTime` String,
`Div5TailNum` String `Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree ) ENGINE = MergeTree
PARTITION BY Year PARTITION BY Year
ORDER BY (Carrier, FlightDate) ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192; SETTINGS index_granularity = 8192;
``` ```
加载数据: 加载数据:
``` bash ``` bash
$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
``` ```
## 下载预处理好的分区数据 {#xia-zai-yu-chu-li-hao-de-fen-qu-shu-ju} ## 下载预处理好的分区数据 {#xia-zai-yu-chu-li-hao-de-fen-qu-shu-ju}
@ -212,7 +213,7 @@ LIMIT 10;
Q4. 查询2007年各航空公司延误超过10分钟以上的次数 Q4. 查询2007年各航空公司延误超过10分钟以上的次数
``` sql ``` sql
SELECT Carrier, count(*) SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*)
FROM ontime FROM ontime
WHERE DepDelay>10 AND Year=2007 WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier GROUP BY Carrier
@ -226,29 +227,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year=2007 AND Year=2007
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
更好的查询版本: 更好的查询版本:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year=2007 WHERE Year=2007
GROUP BY Carrier GROUP BY Carrier
@ -262,29 +263,29 @@ SELECT Carrier, c, c2, c*100/c2 as c3
FROM FROM
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c count(*) AS c
FROM ontime FROM ontime
WHERE DepDelay>10 WHERE DepDelay>10
AND Year>=2000 AND Year<=2008 AND Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) ) q
JOIN JOIN
( (
SELECT SELECT
Carrier, IATA_CODE_Reporting_Airline AS Carrier,
count(*) AS c2 count(*) AS c2
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
) USING Carrier ) qq USING Carrier
ORDER BY c3 DESC; ORDER BY c3 DESC;
``` ```
更好的查询版本: 更好的查询版本:
``` sql ``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3 SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime FROM ontime
WHERE Year>=2000 AND Year<=2008 WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier GROUP BY Carrier
@ -303,7 +304,7 @@ FROM
from ontime from ontime
WHERE DepDelay>10 WHERE DepDelay>10
GROUP BY Year GROUP BY Year
) ) q
JOIN JOIN
( (
select select
@ -311,7 +312,7 @@ JOIN
count(*) as c2 count(*) as c2
from ontime from ontime
GROUP BY Year GROUP BY Year
) USING (Year) ) qq USING (Year)
ORDER BY Year; ORDER BY Year;
``` ```
@ -346,7 +347,7 @@ Q10.
``` sql ``` sql
SELECT SELECT
min(Year), max(Year), Carrier, count(*) AS cnt, min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt,
sum(ArrDelayMinutes>30) AS flights_delayed, sum(ArrDelayMinutes>30) AS flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) AS rate round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime FROM ontime