mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-11 08:52:06 +00:00
167 lines
3.6 KiB
Markdown
167 lines
3.6 KiB
Markdown
---
|
|
slug: /en/getting-started/example-datasets/criteo
|
|
sidebar_label: Terabyte Click Logs from Criteo
|
|
---
|
|
|
|
# Terabyte of Click Logs from Criteo
|
|
|
|
Download the data from http://labs.criteo.com/downloads/download-terabyte-click-logs/
|
|
|
|
Create a table to import the raw log into:
|
|
|
|
``` sql
|
|
-- Staging table that receives the raw click log rows.
-- Columns: a date, the clicked flag, 13 Int32 columns (int1..int13)
-- and 26 String columns (cat1..cat26).
CREATE TABLE criteo_log
(
    date    Date,
    clicked UInt8,
    int1    Int32,
    int2    Int32,
    int3    Int32,
    int4    Int32,
    int5    Int32,
    int6    Int32,
    int7    Int32,
    int8    Int32,
    int9    Int32,
    int10   Int32,
    int11   Int32,
    int12   Int32,
    int13   Int32,
    cat1    String,
    cat2    String,
    cat3    String,
    cat4    String,
    cat5    String,
    cat6    String,
    cat7    String,
    cat8    String,
    cat9    String,
    cat10   String,
    cat11   String,
    cat12   String,
    cat13   String,
    cat14   String,
    cat15   String,
    cat16   String,
    cat17   String,
    cat18   String,
    cat19   String,
    cat20   String,
    cat21   String,
    cat22   String,
    cat23   String,
    cat24   String,
    cat25   String,
    cat26   String
)
ENGINE = Log;
|
|
```
|
|
|
|
Insert the downloaded data into the raw log table:
|
|
|
|
``` bash
|
|
$ for i in {00..23}; do
    echo $i
    # ${i#0} drops one leading zero to match the file names (day_0.gz .. day_23.gz);
    # ${i/00/24} turns "00" into "24", so every row is prefixed with a date
    # between 2000-01-01 and 2000-01-24 followed by a tab.
    zcat datasets/criteo/day_${i#0}.gz |
        sed -r 's/^/2000-01-'${i/00/24}'\t/' |
        clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"
  done
|
|
```
|
|
|
|
Create a table for the converted data:
|
|
|
|
``` sql
|
|
-- Table for the converted data: the same date, clicked and int1..int13
-- columns as criteo_log, with the 26 String columns replaced by UInt32
-- columns icat1..icat26. Partitioned by month of `date` and sorted by
-- (date, icat1).
CREATE TABLE criteo
(
    date    Date,
    clicked UInt8,
    int1    Int32,
    int2    Int32,
    int3    Int32,
    int4    Int32,
    int5    Int32,
    int6    Int32,
    int7    Int32,
    int8    Int32,
    int9    Int32,
    int10   Int32,
    int11   Int32,
    int12   Int32,
    int13   Int32,
    icat1   UInt32,
    icat2   UInt32,
    icat3   UInt32,
    icat4   UInt32,
    icat5   UInt32,
    icat6   UInt32,
    icat7   UInt32,
    icat8   UInt32,
    icat9   UInt32,
    icat10  UInt32,
    icat11  UInt32,
    icat12  UInt32,
    icat13  UInt32,
    icat14  UInt32,
    icat15  UInt32,
    icat16  UInt32,
    icat17  UInt32,
    icat18  UInt32,
    icat19  UInt32,
    icat20  UInt32,
    icat21  UInt32,
    icat22  UInt32,
    icat23  UInt32,
    icat24  UInt32,
    icat25  UInt32,
    icat26  UInt32
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, icat1);
|
|
```
|
|
|
|
Transform data from the raw log and put it in the second table:
|
|
|
|
``` sql
|
|
-- Copy every row of criteo_log into criteo. The date, clicked and
-- int1..int13 columns pass through unchanged; each catN string is
-- hex-decoded with unhex() and reinterpreted as a UInt32 to fill icatN.
INSERT INTO criteo
SELECT
    date,
    clicked,
    int1,
    int2,
    int3,
    int4,
    int5,
    int6,
    int7,
    int8,
    int9,
    int10,
    int11,
    int12,
    int13,
    reinterpretAsUInt32(unhex(cat1))  AS icat1,
    reinterpretAsUInt32(unhex(cat2))  AS icat2,
    reinterpretAsUInt32(unhex(cat3))  AS icat3,
    reinterpretAsUInt32(unhex(cat4))  AS icat4,
    reinterpretAsUInt32(unhex(cat5))  AS icat5,
    reinterpretAsUInt32(unhex(cat6))  AS icat6,
    reinterpretAsUInt32(unhex(cat7))  AS icat7,
    reinterpretAsUInt32(unhex(cat8))  AS icat8,
    reinterpretAsUInt32(unhex(cat9))  AS icat9,
    reinterpretAsUInt32(unhex(cat10)) AS icat10,
    reinterpretAsUInt32(unhex(cat11)) AS icat11,
    reinterpretAsUInt32(unhex(cat12)) AS icat12,
    reinterpretAsUInt32(unhex(cat13)) AS icat13,
    reinterpretAsUInt32(unhex(cat14)) AS icat14,
    reinterpretAsUInt32(unhex(cat15)) AS icat15,
    reinterpretAsUInt32(unhex(cat16)) AS icat16,
    reinterpretAsUInt32(unhex(cat17)) AS icat17,
    reinterpretAsUInt32(unhex(cat18)) AS icat18,
    reinterpretAsUInt32(unhex(cat19)) AS icat19,
    reinterpretAsUInt32(unhex(cat20)) AS icat20,
    reinterpretAsUInt32(unhex(cat21)) AS icat21,
    reinterpretAsUInt32(unhex(cat22)) AS icat22,
    reinterpretAsUInt32(unhex(cat23)) AS icat23,
    reinterpretAsUInt32(unhex(cat24)) AS icat24,
    reinterpretAsUInt32(unhex(cat25)) AS icat25,
    reinterpretAsUInt32(unhex(cat26)) AS icat26
FROM criteo_log;

-- Remove the raw staging table now that its rows have been converted.
DROP TABLE criteo_log;
|
|
```
|