--- slug: /en/getting-started/example-datasets/amplab-benchmark sidebar_label: AMPLab Big Data Benchmark description: A benchmark dataset used for comparing the performance of data warehousing solutions. --- # AMPLab Big Data Benchmark See https://amplab.cs.berkeley.edu/benchmark/ Sign up for a free account at https://aws.amazon.com. It requires a credit card, email, and phone number. Get a new access key at https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential Run the following in the console: ``` bash $ sudo apt-get install s3cmd $ mkdir tiny; cd tiny; $ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . $ cd .. $ mkdir 1node; cd 1node; $ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . $ cd .. $ mkdir 5nodes; cd 5nodes; $ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . $ cd .. ``` Run the following ClickHouse queries: ``` sql CREATE TABLE rankings_tiny ( pageURL String, pageRank UInt32, avgDuration UInt32 ) ENGINE = Log; CREATE TABLE uservisits_tiny ( sourceIP String, destinationURL String, visitDate Date, adRevenue Float32, UserAgent String, cCode FixedString(3), lCode FixedString(6), searchWord String, duration UInt32 ) ENGINE = MergeTree(visitDate, visitDate, 8192); CREATE TABLE rankings_1node ( pageURL String, pageRank UInt32, avgDuration UInt32 ) ENGINE = Log; CREATE TABLE uservisits_1node ( sourceIP String, destinationURL String, visitDate Date, adRevenue Float32, UserAgent String, cCode FixedString(3), lCode FixedString(6), searchWord String, duration UInt32 ) ENGINE = MergeTree(visitDate, visitDate, 8192); CREATE TABLE rankings_5nodes_on_single ( pageURL String, pageRank UInt32, avgDuration UInt32 ) ENGINE = Log; CREATE TABLE uservisits_5nodes_on_single ( sourceIP String, destinationURL String, visitDate Date, adRevenue Float32, UserAgent String, cCode FixedString(3), lCode FixedString(6), searchWord String, duration UInt32 ) ENGINE = MergeTree(visitDate, visitDate, 8192); ``` Go back to the console: ``` bash $ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done $ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done $ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done $ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done $ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done $ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done ``` Queries for obtaining data samples: ``` sql SELECT pageURL, pageRank FROM rankings_1node WHERE pageRank > 1000 SELECT substring(sourceIP, 1, 8), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 8) SELECT sourceIP, sum(adRevenue) AS totalRevenue, avg(pageRank) AS pageRank FROM rankings_1node ALL INNER JOIN ( SELECT sourceIP, destinationURL AS pageURL, adRevenue FROM uservisits_1node WHERE (visitDate > '1980-01-01') AND (visitDate < '1980-04-01') ) USING pageURL GROUP BY sourceIP ORDER BY totalRevenue DESC LIMIT 1 ```