Added description of toy datasets [#METR-17973].

This commit is contained in:
Alexey Milovidov 2016-02-08 02:03:27 +03:00
parent 3269eb987f
commit 92c6267af8
5 changed files with 455 additions and 0 deletions


@@ -0,0 +1,260 @@
This benchmark was created by Vadim Tkachenko, see:
https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/
https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/
https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/
https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/
https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/
http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html
1. Download the data using https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh (a rough sketch of what it does follows).
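The script loops over the monthly archives published by the Bureau of Transportation Statistics. A minimal equivalent is sketched below; the exact URL pattern on transtats.bts.gov is an assumption here, so prefer the linked script:
for s in $(seq 1987 2015); do for m in $(seq 1 12); do wget "http://transtats.bts.gov/PREZIP/On_Time_On_Time_Performance_${s}_${m}.zip"; done; done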
2. Create the table (the MergeTree parameters are the date column used for partitioning, the primary key, and the index granularity):
CREATE TABLE `ontime` (
`Year` UInt16,
`Quarter` UInt8,
`Month` UInt8,
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`UniqueCarrier` FixedString(7),
`AirlineID` Int32,
`Carrier` FixedString(2),
`TailNum` String,
`FlightNum` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
`Origin` FixedString(5),
`OriginCityName` String,
`OriginState` FixedString(2),
`OriginStateFips` String,
`OriginStateName` String,
`OriginWac` Int32,
`DestAirportID` Int32,
`DestAirportSeqID` Int32,
`DestCityMarketID` Int32,
`Dest` FixedString(5),
`DestCityName` String,
`DestState` FixedString(2),
`DestStateFips` String,
`DestStateName` String,
`DestWac` Int32,
`CRSDepTime` Int32,
`DepTime` Int32,
`DepDelay` Int32,
`DepDelayMinutes` Int32,
`DepDel15` Int32,
`DepartureDelayGroups` String,
`DepTimeBlk` String,
`TaxiOut` Int32,
`WheelsOff` Int32,
`WheelsOn` Int32,
`TaxiIn` Int32,
`CRSArrTime` Int32,
`ArrTime` Int32,
`ArrDelay` Int32,
`ArrDelayMinutes` Int32,
`ArrDel15` Int32,
`ArrivalDelayGroups` Int32,
`ArrTimeBlk` String,
`Cancelled` UInt8,
`CancellationCode` FixedString(1),
`Diverted` UInt8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Int32,
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
`CarrierDelay` Int32,
`WeatherDelay` Int32,
`NASDelay` Int32,
`SecurityDelay` Int32,
`LateAircraftDelay` Int32,
`FirstDepTime` String,
`TotalAddGTime` String,
`LongestAddGTime` String,
`DivAirportLandings` String,
`DivReachedDest` String,
`DivActualElapsedTime` String,
`DivArrDelay` String,
`DivDistance` String,
`Div1Airport` String,
`Div1AirportID` Int32,
`Div1AirportSeqID` Int32,
`Div1WheelsOn` String,
`Div1TotalGTime` String,
`Div1LongestGTime` String,
`Div1WheelsOff` String,
`Div1TailNum` String,
`Div2Airport` String,
`Div2AirportID` Int32,
`Div2AirportSeqID` Int32,
`Div2WheelsOn` String,
`Div2TotalGTime` String,
`Div2LongestGTime` String,
`Div2WheelsOff` String,
`Div2TailNum` String,
`Div3Airport` String,
`Div3AirportID` Int32,
`Div3AirportSeqID` Int32,
`Div3WheelsOn` String,
`Div3TotalGTime` String,
`Div3LongestGTime` String,
`Div3WheelsOff` String,
`Div3TailNum` String,
`Div4Airport` String,
`Div4AirportID` Int32,
`Div4AirportSeqID` Int32,
`Div4WheelsOn` String,
`Div4TotalGTime` String,
`Div4LongestGTime` String,
`Div4WheelsOff` String,
`Div4TailNum` String,
`Div5Airport` String,
`Div5AirportID` Int32,
`Div5AirportSeqID` Int32,
`Div5WheelsOn` String,
`Div5TotalGTime` String,
`Div5LongestGTime` String,
`Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192)
3. Load the data (unzip -cq streams the CSV files to stdout; the sed call strips '.00' fractional tails so those values parse as integers):
for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
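To eyeball the raw columns before a full load (a quick check, not part of the original steps; the archive name is hypothetical):
unzip -cq On_Time_On_Time_Performance_2015_1.zip '*.csv' | head -n 3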
4. Check the number of loaded rows:
SELECT count() FROM ontime
Q0. The average number of flights per month:
select avg(c1) from (select Year, Month, count(*) as c1 from ontime group by Year, Month);
Q1. The number of flights per day of the week for the years 2000-2008:
SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC;
Q2. The number of flights delayed by more than 10 minutes, per day of the week, for 2000-2008:
SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC
Q3. The number of delays by airport for 2000-2008:
SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY Origin ORDER BY c DESC LIMIT 10
Q4. The number of delays by carrier for 2007:
SELECT Carrier, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year = 2007 GROUP BY Carrier ORDER BY c DESC
Q5. The percentage of delayed flights by carrier for 2007:
SELECT Carrier, c, c2, c*1000/c2 as c3
FROM
(
SELECT
Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year=2007
GROUP BY Carrier
)
ANY INNER JOIN
(
SELECT
Carrier,
count(*) AS c2
FROM ontime
WHERE Year=2007
GROUP BY Carrier
) USING Carrier
ORDER BY c3 DESC;
A better version of the same query (DepDelay > 10 evaluates to 0 or 1, so avg gives the fraction of delayed flights; it is multiplied by 1000 to match c3 above):
SELECT Carrier, avg(DepDelay > 10) * 1000 AS c3 FROM ontime WHERE Year = 2007 GROUP BY Carrier ORDER BY Carrier
Q6. The same query for the wider range of years 2000-2008:
SELECT Carrier, c, c2, c*1000/c2 as c3
FROM
(
SELECT
Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay>10
AND Year >= 2000 AND Year <= 2008
GROUP BY Carrier
)
ANY INNER JOIN
(
SELECT
Carrier,
count(*) AS c2
FROM ontime
WHERE Year >= 2000 AND Year <= 2008
GROUP BY Carrier
) USING Carrier
ORDER BY c3 DESC;
A better version of the same query:
SELECT Carrier, avg(DepDelay > 10) * 1000 AS c3 FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY Carrier ORDER BY Carrier
Q7. The percentage of flights delayed by more than 10 minutes, by year:
SELECT Year, c1/c2
FROM
(
select
Year,
count(*)*1000 as c1
from ontime
WHERE DepDelay>10
GROUP BY Year
)
ANY INNER JOIN
(
select
Year,
count(*) as c2
from ontime
GROUP BY Year
) USING (Year)
ORDER BY Year
A better version of the same query:
SELECT Year, avg(DepDelay > 10) FROM ontime GROUP BY Year ORDER BY Year
Q8. The most popular destinations by the number of directly connected cities, for various ranges of years:
SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
Q9. The number of flights per year:
select Year, count(*) as c1 from ontime group by Year;
Q10. The rate of flights delayed by more than 30 minutes, by carrier, excluding weekends, Alaska, Hawaii, and the US territories, before 2010:
select
min(Year), max(Year), Carrier, count(*) as cnt,
sum(ArrDelayMinutes>30) as flights_delayed,
round(sum(ArrDelayMinutes>30)/count(*),2) as rate
FROM ontime
WHERE
DayOfWeek not in (6,7) and OriginState not in ('AK', 'HI', 'PR', 'VI')
and DestState not in ('AK', 'HI', 'PR', 'VI')
and FlightDate < '2010-01-01'
GROUP by Carrier
HAVING cnt > 100000 and max(Year) > 1990
ORDER by rate DESC
LIMIT 1000;
Bonus:
SELECT avg(cnt) FROM (SELECT Year,Month,count(*) AS cnt FROM ontime WHERE DepDel15=1 GROUP BY Year,Month)
select avg(c1) from (select Year,Month,count(*) as c1 from ontime group by Year,Month)
SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
SELECT OriginCityName, DestCityName, count() AS c FROM ontime GROUP BY OriginCityName, DestCityName ORDER BY c DESC LIMIT 10;
SELECT OriginCityName, count() AS c FROM ontime GROUP BY OriginCityName ORDER BY c DESC LIMIT 10;


@@ -0,0 +1,120 @@
AMPLab Big Data Benchmark: https://amplab.cs.berkeley.edu/benchmark/
1. Register a free account on aws.amazon.com. You will need a credit card, an e-mail address, and a phone number.
2. Create a new access key at https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential
3. sudo apt-get install s3cmd
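Then run the interactive setup and enter the access key from step 2:
s3cmd --configure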
4. Download the data:
mkdir tiny; cd tiny;
s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
cd ..
mkdir 1node; cd 1node;
s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
cd ..
mkdir 5nodes; cd 5nodes;
s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
cd ..
5. Create the tables:
CREATE TABLE rankings_tiny
(
pageURL String,
pageRank UInt32,
avgDuration UInt32
) ENGINE = Log;
CREATE TABLE uservisits_tiny
(
sourceIP String,
destinationURL String,
visitDate Date,
adRevenue Float32,
UserAgent String,
cCode FixedString(3),
lCode FixedString(6),
searchWord String,
duration UInt32
) ENGINE = MergeTree(visitDate, visitDate, 8192);
CREATE TABLE rankings_1node
(
pageURL String,
pageRank UInt32,
avgDuration UInt32
) ENGINE = Log;
CREATE TABLE uservisits_1node
(
sourceIP String,
destinationURL String,
visitDate Date,
adRevenue Float32,
UserAgent String,
cCode FixedString(3),
lCode FixedString(6),
searchWord String,
duration UInt32
) ENGINE = MergeTree(visitDate, visitDate, 8192);
CREATE TABLE rankings_5nodes_on_single
(
pageURL String,
pageRank UInt32,
avgDuration UInt32
) ENGINE = Log;
CREATE TABLE uservisits_5nodes_on_single
(
sourceIP String,
destinationURL String,
visitDate Date,
adRevenue Float32,
UserAgent String,
cCode FixedString(3),
lCode FixedString(6),
searchWord String,
duration UInt32
) ENGINE = MergeTree(visitDate, visitDate, 8192);
6. Load the data (zlib-flate is part of the qpdf package):
for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done
for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done
for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done
for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done
for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done
for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done
7. Run the benchmark queries (a scan, an aggregation, and a join):
SELECT pageURL, pageRank FROM rankings_1node WHERE pageRank > 1000
SELECT substring(sourceIP, 1, 8), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 8)
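The AMPLab benchmark runs this aggregation at three prefix lengths (variants 2A-2C); for completeness, the two longer variants, reconstructed from the benchmark description:
SELECT substring(sourceIP, 1, 10), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 10)
SELECT substring(sourceIP, 1, 12), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 12)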
SELECT
sourceIP,
sum(adRevenue) AS totalRevenue,
avg(pageRank) AS pageRank
FROM rankings_1node ALL INNER JOIN
(
SELECT
sourceIP,
destinationURL AS pageURL,
adRevenue
FROM uservisits_1node
WHERE (visitDate > '1980-01-01') AND (visitDate < '1980-04-01')
) USING pageURL
GROUP BY sourceIP
ORDER BY totalRevenue DESC
LIMIT 1


@@ -0,0 +1,61 @@
1. Download all data from http://labs.criteo.com/downloads/download-terabyte-click-logs/
2. Create a table for the raw log:
CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log
3. Load the logs, prefixing each row with a synthetic date, since the source files carry none (day_0 is mapped to 2000-01-24, days 1-23 to 2000-01-01 through 2000-01-23):
for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done
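A quick sanity check after loading (not one of the original steps):
SELECT count() FROM criteo_log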
4. Create a table with the categorical hashes converted to numbers:
CREATE TABLE criteo
(
date Date,
clicked UInt8,
int1 Int32,
int2 Int32,
int3 Int32,
int4 Int32,
int5 Int32,
int6 Int32,
int7 Int32,
int8 Int32,
int9 Int32,
int10 Int32,
int11 Int32,
int12 Int32,
int13 Int32,
icat1 UInt32,
icat2 UInt32,
icat3 UInt32,
icat4 UInt32,
icat5 UInt32,
icat6 UInt32,
icat7 UInt32,
icat8 UInt32,
icat9 UInt32,
icat10 UInt32,
icat11 UInt32,
icat12 UInt32,
icat13 UInt32,
icat14 UInt32,
icat15 UInt32,
icat16 UInt32,
icat17 UInt32,
icat18 UInt32,
icat19 UInt32,
icat20 UInt32,
icat21 UInt32,
icat22 UInt32,
icat23 UInt32,
icat24 UInt32,
icat25 UInt32,
icat26 UInt32
) ENGINE = MergeTree(date, intHash32(icat1), (date, intHash32(icat1)), 8192)
5. Transform the raw log into the final table and drop the intermediate one (see the note on the conversion after these statements):
INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log;
DROP TABLE criteo_log;
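Each cat* value in the raw log is an 8-character hex hash: unhex turns it into 4 raw bytes and reinterpretAsUInt32 reads those bytes as a little-endian number. To see the transform on a single value (the sample hash is made up):
SELECT reinterpretAsUInt32(unhex('ad3062eb')) AS icat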


@@ -0,0 +1,8 @@
git clone https://github.com/electrum/ssb-dbgen.git
In the shared.h file, change MAXAGG_LEN from 10 to 20.
In the makefile, change MACHINE to Linux.
make
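After the build, data is generated per table. The -s (scale factor) and -T (table) flags below follow the usual dbgen convention; treat the exact invocation as an assumption and check the repository's README:
./dbgen -s 1 -T c
./dbgen -s 1 -T l
./dbgen -s 1 -T p
./dbgen -s 1 -T s
./dbgen -s 1 -T d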


@@ -0,0 +1,6 @@
The data comes from http://dumps.wikimedia.org/other/pagecounts-raw/. Build the list of files, then download them (a loading sketch follows these commands):
for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sS "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
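Each pagecounts file is a space-separated list of 'project page hits bytes' rows. A minimal loading sketch; the table name, the column types, and the space-to-tab conversion are assumptions, not part of the original instructions:
CREATE TABLE wikistat (project String, page String, hits UInt64, size UInt64) ENGINE = Log
for i in pagecounts-*.gz; do echo $i; zcat $i | sed 's/ /\t/g' | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done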