<!DOCTYPE html>
<html lang="ru">
<head>
<meta charset="utf-8"/>
<title>ClickHouse — quick start guide</title>

<link rel="shortcut icon" href="favicon.ico"/>

<meta name="description" content="Quick start guide to ClickHouse — open-source distributed column-oriented DBMS"/>
<meta name="keywords"
      content="ClickHouse, DBMS, OLAP, relational, analytics, analytical, big data, open-source, SQL, web-analytics"/>

<style type="text/css">
    @font-face {
        font-family: 'Yandex Sans Text Web';
        src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot);
        src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'),
             url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'),
             url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'),
             url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'),
             url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg');
        font-weight: 700;
        font-style: normal;
        font-stretch: normal
    }

    @font-face {
        font-family: 'Yandex Sans Text Web';
        src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot);
        src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'),
             url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'),
             url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'),
             url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'),
             url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg');
        font-weight: 400;
        font-style: normal;
        font-stretch: normal
    }

    @font-face {
        font-family: 'Yandex Sans Text Web';
        src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot);
        src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'),
             url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'),
             url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'),
             url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'),
             url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg');
        font-weight: 300;
        font-style: normal;
        font-stretch: normal
    }

    @font-face {
        font-family: 'Yandex Sans Display Web';
        src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot);
        src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'),
             url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'),
             url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'),
             url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'),
             url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg');
        font-weight: 400;
        font-style: normal;
        font-stretch: normal
    }

    @font-face {
        font-family: 'Yandex Sans Display Web';
        src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot);
        src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'),
             url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'),
             url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'),
             url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'),
             url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg');
        font-weight: 300;
        font-style: normal;
        font-stretch: normal
    }

    body {
        background: #fff;
        font: 300 12pt/150% 'Yandex Sans Text Web', Arial, sans-serif;
    }

    .page {
        width: 900px;
        margin: auto;
    }

    h1 {
        font-family: 'Yandex Sans Display Web', Arial, sans-serif;
        font-size: 100px;
        font-weight: normal;
        margin-top: 100px;
        margin-bottom: 0;
        text-align: center;
        padding-top: 27px;
    }

    .title_link, .title_link:active, .title_link:visited, .title_link:link, .title_link:hover {
        text-decoration: none;
        color: #000;
    }

    h2 {
        font: normal 50px 'Yandex Sans Display Web', Arial, sans-serif;
        text-align: center;
        margin-top: 35px;
        margin-bottom: 50px;
    }

    h3 {
        font: normal 24px 'Yandex Sans Display Web', Arial, sans-serif;
        margin-top: 36px;
    }

    a:link, a:visited {
        color: #08f;
        text-decoration: none;
    }

    a:hover, a:active {
        color: #f00;
        text-decoration: underline;
    }

    .footer {
        text-align: right;
        margin-top: 40px;
        border-top: 1px solid #EEE;
        padding: 10px 0 0;
        color: #888;
        font-size: 10pt;
    }

    pre {
        font: 13px/18px monospace, "Courier New";
        display: block;
        border-left: 5px solid #ffdb4d;
        padding: 5px 10px;
        background-color: #FFF8E8;
    }

    .spoiler {
        margin-bottom: 10px;
    }

    .spoiler_body {
        display: none;
    }

    .spoiler_title {
        color: #08f;
        border-bottom: 1px dotted #08f;
    }

    .spoiler_title:hover {
        cursor: pointer;
        color: #f00;
        border-bottom: 1px dashed #f00;
        text-decoration: none;
    }

    .tip {
        background-color: #EEE;
        border: 1px solid #EEE;
        padding: 5px 10px 5px 10px;
    }

    .tip b {
        font-size: 150%;
        color: #888;
    }

    .warranty {
        font-size: 10pt;
        color: #888;
        line-height: 150%;
    }

    .orange {
        fill: #fc0;
    }

    .red {
        fill: #f00
    }
</style>
</head>
<body>

<div class="page">
|
|||
|
|
|||
|
<div>
|
|||
|
<div style="float: left; margin-right: -100%; margin-top: 0; margin-left: 3px;">
|
|||
|
<a href="/">
|
|||
|
<svg xmlns="http://www.w3.org/2000/svg" width="90" height="80" viewBox="0 0 9 8">
|
|||
|
<path class="red" d="M0,7 h1 v1 h-1 z"></path>
|
|||
|
<path class="orange" d="M0,0 h1 v7 h-1 z"></path>
|
|||
|
<path class="orange" d="M2,0 h1 v8 h-1 z"></path>
|
|||
|
<path class="orange" d="M4,0 h1 v8 h-1 z"></path>
|
|||
|
<path class="orange" d="M6,0 h1 v8 h-1 z"></path>
|
|||
|
<path class="orange" d="M8,3.25 h1 v1.5 h-1 z"></path>
|
|||
|
</svg>
|
|||
|
</a>
|
|||
|
</div>
|
|||
|
|
|||
|
<h1 id="main_title"><a class="title_link" href="/">ClickHouse</a></h1>
|
|||
|
<h2>Tutorial</h2>
|
|||
|
</div>
|
|||
|
|
|||
|
<p>Let's get started with a sample dataset from open sources. We will use USA civil flight data from 1987 to 2015.
    It's hard to call this sample Big Data (it contains 166 million rows, 63 GB of uncompressed data), but it
    lets us get to work quickly. The dataset is available for download <a href="https://yadi.sk/d/pOZxpa42sDdgm">here</a>.
    You can also download it from the original data source <a
        href="https://github.com/yandex/ClickHouse/raw/master/doc/example_datasets/1_ontime.txt"
        rel="external nofollow">as described
    here</a>.</p>

<p>First we will deploy ClickHouse on a single server. Further below we will also review the process of deploying to
    a cluster with support for sharding and replication.</p>

<p>On Ubuntu and Debian Linux ClickHouse can be installed from <a href="/#download">packages</a>.
    For other Linux distributions you can <a href="https://github.com/yandex/ClickHouse/blob/master/doc/build.md"
                                              rel="external nofollow">compile
    ClickHouse from source</a> and then install it.</p>

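<p>For example, on Ubuntu or Debian the installation boils down to something like the following (assuming the
    ClickHouse apt repository has already been added to your package sources; the packages are described below):</p>
<pre>
sudo apt-get update
sudo apt-get install clickhouse-client clickhouse-server-common
</pre>
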
<p>The <b>clickhouse-client</b> package contains the <a
        href="/reference_en.html#Command-line%20client">clickhouse-client</a> application —
    an interactive ClickHouse client. <b>clickhouse-server-base</b> contains the clickhouse-server binary, and <b>clickhouse-server-common</b>
    contains configuration files for clickhouse-server.</p>

<p>Server configuration files are located in /etc/clickhouse-server/. Before getting to work please notice the <b>path</b>
    element in the config: it determines the location for data storage. Directly editing the
    <b>config.xml</b> file is inconvenient when packages get updated. The recommended way is to override config elements in
    <a href="/reference_en.html#Configuration%20files">files of the config.d directory</a>.
    You may also want to <a href="/reference_en.html#Access%20rights">set up access
    rights</a> from the start.</p>

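<p>As a minimal sketch of such an override (the file name and the storage location below are only examples), a file
    like /etc/clickhouse-server/config.d/path.xml could redefine just the <b>path</b> element:</p>
<pre>
<yandex>
    <path>/opt/clickhouse/</path>
</yandex>
</pre>
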
<p><b>clickhouse-server</b> is not launched automatically after package installation and is not automatically
    restarted after updates either. Start the server with:
<pre>sudo service clickhouse-server start</pre>
    The default location for server logs is /var/log/clickhouse-server/.
    The server is ready to handle client connections once the "Ready for connections" message has been logged.</p>

<p>Use <b>clickhouse-client</b> to connect to the server.</p>

<div class="spoiler"><a class="spoiler_title">Tips for clickhouse-client</a>
|
|||
|
<div class="spoiler_body">
|
|||
|
Interactive mode:
|
|||
|
<pre>
|
|||
|
clickhouse-client
|
|||
|
clickhouse-client --host=... --port=... --user=... --password=...
|
|||
|
</pre>
|
|||
|
Enable multiline queries:
|
|||
|
<pre>
|
|||
|
clickhouse-client -m
|
|||
|
clickhouse-client --multiline
|
|||
|
</pre>
|
|||
|
Run queries in batch-mode:
|
|||
|
<pre>
|
|||
|
clickhouse-client --query='SELECT 1'
|
|||
|
echo 'SELECT 1' | clickhouse-client
|
|||
|
</pre>
|
|||
|
Inser data from file of a specified format:
|
|||
|
<pre>
|
|||
|
clickhouse-client --query='INSERT INTO table VALUES' < data.txt
|
|||
|
clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv
|
|||
|
</pre>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
<h3>Create a table for the sample dataset</h3>
<div class="spoiler"><a class="spoiler_title">Create table query</a>
|
|||
|
<div class="spoiler_body">
|
|||
|
<pre>
|
|||
|
$ clickhouse-client --multiline
|
|||
|
ClickHouse client version 0.0.53720.
|
|||
|
Connecting to localhost:9000.
|
|||
|
Connected to ClickHouse server version 0.0.53720.
|
|||
|
|
|||
|
:) CREATE TABLE ontime
|
|||
|
(
|
|||
|
Year UInt16,
|
|||
|
Quarter UInt8,
|
|||
|
Month UInt8,
|
|||
|
DayofMonth UInt8,
|
|||
|
DayOfWeek UInt8,
|
|||
|
FlightDate Date,
|
|||
|
UniqueCarrier FixedString(7),
|
|||
|
AirlineID Int32,
|
|||
|
Carrier FixedString(2),
|
|||
|
TailNum String,
|
|||
|
FlightNum String,
|
|||
|
OriginAirportID Int32,
|
|||
|
OriginAirportSeqID Int32,
|
|||
|
OriginCityMarketID Int32,
|
|||
|
Origin FixedString(5),
|
|||
|
OriginCityName String,
|
|||
|
OriginState FixedString(2),
|
|||
|
OriginStateFips String,
|
|||
|
OriginStateName String,
|
|||
|
OriginWac Int32,
|
|||
|
DestAirportID Int32,
|
|||
|
DestAirportSeqID Int32,
|
|||
|
DestCityMarketID Int32,
|
|||
|
Dest FixedString(5),
|
|||
|
DestCityName String,
|
|||
|
DestState FixedString(2),
|
|||
|
DestStateFips String,
|
|||
|
DestStateName String,
|
|||
|
DestWac Int32,
|
|||
|
CRSDepTime Int32,
|
|||
|
DepTime Int32,
|
|||
|
DepDelay Int32,
|
|||
|
DepDelayMinutes Int32,
|
|||
|
DepDel15 Int32,
|
|||
|
DepartureDelayGroups String,
|
|||
|
DepTimeBlk String,
|
|||
|
TaxiOut Int32,
|
|||
|
WheelsOff Int32,
|
|||
|
WheelsOn Int32,
|
|||
|
TaxiIn Int32,
|
|||
|
CRSArrTime Int32,
|
|||
|
ArrTime Int32,
|
|||
|
ArrDelay Int32,
|
|||
|
ArrDelayMinutes Int32,
|
|||
|
ArrDel15 Int32,
|
|||
|
ArrivalDelayGroups Int32,
|
|||
|
ArrTimeBlk String,
|
|||
|
Cancelled UInt8,
|
|||
|
CancellationCode FixedString(1),
|
|||
|
Diverted UInt8,
|
|||
|
CRSElapsedTime Int32,
|
|||
|
ActualElapsedTime Int32,
|
|||
|
AirTime Int32,
|
|||
|
Flights Int32,
|
|||
|
Distance Int32,
|
|||
|
DistanceGroup UInt8,
|
|||
|
CarrierDelay Int32,
|
|||
|
WeatherDelay Int32,
|
|||
|
NASDelay Int32,
|
|||
|
SecurityDelay Int32,
|
|||
|
LateAircraftDelay Int32,
|
|||
|
FirstDepTime String,
|
|||
|
TotalAddGTime String,
|
|||
|
LongestAddGTime String,
|
|||
|
DivAirportLandings String,
|
|||
|
DivReachedDest String,
|
|||
|
DivActualElapsedTime String,
|
|||
|
DivArrDelay String,
|
|||
|
DivDistance String,
|
|||
|
Div1Airport String,
|
|||
|
Div1AirportID Int32,
|
|||
|
Div1AirportSeqID Int32,
|
|||
|
Div1WheelsOn String,
|
|||
|
Div1TotalGTime String,
|
|||
|
Div1LongestGTime String,
|
|||
|
Div1WheelsOff String,
|
|||
|
Div1TailNum String,
|
|||
|
Div2Airport String,
|
|||
|
Div2AirportID Int32,
|
|||
|
Div2AirportSeqID Int32,
|
|||
|
Div2WheelsOn String,
|
|||
|
Div2TotalGTime String,
|
|||
|
Div2LongestGTime String,
|
|||
|
Div2WheelsOff String,
|
|||
|
Div2TailNum String,
|
|||
|
Div3Airport String,
|
|||
|
Div3AirportID Int32,
|
|||
|
Div3AirportSeqID Int32,
|
|||
|
Div3WheelsOn String,
|
|||
|
Div3TotalGTime String,
|
|||
|
Div3LongestGTime String,
|
|||
|
Div3WheelsOff String,
|
|||
|
Div3TailNum String,
|
|||
|
Div4Airport String,
|
|||
|
Div4AirportID Int32,
|
|||
|
Div4AirportSeqID Int32,
|
|||
|
Div4WheelsOn String,
|
|||
|
Div4TotalGTime String,
|
|||
|
Div4LongestGTime String,
|
|||
|
Div4WheelsOff String,
|
|||
|
Div4TailNum String,
|
|||
|
Div5Airport String,
|
|||
|
Div5AirportID Int32,
|
|||
|
Div5AirportSeqID Int32,
|
|||
|
Div5WheelsOn String,
|
|||
|
Div5TotalGTime String,
|
|||
|
Div5LongestGTime String,
|
|||
|
Div5WheelsOff String,
|
|||
|
Div5TailNum String
|
|||
|
)
|
|||
|
ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);
|
|||
|
</pre>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
<p>Now we have a table of the <a href="/reference_en.html#MergeTree">MergeTree type</a>.
    MergeTree tables are recommended for production use. A table of this kind has a primary key by which the table data
    is incrementally sorted. This enables fast execution of queries over ranges of the primary key.</p>

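<p>For example (a hypothetical query against the table created above), a filter over a prefix of the primary key
    (Year, FlightDate) such as the following only needs to read the matching key ranges instead of the whole table:</p>
<pre>SELECT count() FROM ontime WHERE Year = 2015 AND FlightDate >= '2015-06-01' AND FlightDate <= '2015-06-30'</pre>
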
<p><b>Note</b>
    We store ad network banner impression logs in ClickHouse. Each table entry looks like:
<pre>[Advertiser ID, Impression ID, attribute1, attribute2, …]</pre>
    Let's assume that our aim is to provide a set of reports for each advertiser. A common and frequently demanded query
    would be counting impressions for a specific Advertiser ID. This means that the table primary key should start with
    Advertiser ID: in that case ClickHouse needs to read a smaller amount of data to answer the query for a
    given Advertiser ID.
</p>

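<p>A minimal sketch of such a table (the names here are illustrative, not our production schema) could look like this:</p>
<pre>
CREATE TABLE impressions
(
    EventDate Date,
    AdvertiserID UInt32,
    ImpressionID UInt64,
    Attribute1 String,
    Attribute2 String
)
ENGINE = MergeTree(EventDate, (AdvertiserID, EventDate), 8192);
</pre>
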
<h3>Load data</h3>
<pre>xz -v -c -d < ontime.csv.xz | clickhouse-client --query="INSERT INTO ontime FORMAT CSV"</pre>
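<p>Once the load has finished you can sanity-check the row count (it should be on the order of the 166 million rows
    mentioned above):</p>
<pre>SELECT count() FROM ontime</pre>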
<p>The ClickHouse INSERT query allows loading data in any <a href="/reference_en.html#Formats">supported
    format</a>. Data loading requires only O(1) RAM. An INSERT query can receive any volume of data as input.
    It's strongly recommended to insert data in <a
        href="/reference_en.html#Performance%20on%20data%20insertion.">blocks that are not too small</a>.
    Notice that inserting a block of up to max_insert_block_size (1 048 576
    rows by default) is an atomic operation: the data block is either inserted completely or not inserted at all. In case
    of a disconnect during the insert operation you may not know whether the block was inserted successfully. To achieve
    exactly-once semantics ClickHouse supports idempotency for <a
        href="/reference_en.html#Data%20replication">replicated tables</a>. This means
    that you may retry the insert of the same data block (possibly on a different replica) and it will still be
    inserted just once. In this guide we load data from localhost, so we don't need to worry about forming data
    blocks and exactly-once semantics.</p>

<p>INSERT queries into MergeTree tables are non-blocking (and so are SELECT queries). You can execute SELECT
    queries right after or even during an insert operation.</p>

<p>Our sample dataset is somewhat suboptimal, for two reasons.</p>

<p>The first is that the String data type is used in cases where an <a
        href="/reference_en.html#Enum">Enum</a> or a numeric type would fit best.</p>

<p class="tip"><b>⚖</b> When set of possible values is determined and known to be small. (E.g. OS name, browser
|
|||
|
vendors etc.) it's recommended to use Enums or numbers to improve performance.
|
|||
|
When set of possible values is not limited (search query, URL, etc.) just go ahead with String.</p>
|
|||
|
|
|||
|
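<p>For illustration (a hypothetical column, not part of the ontime table), an Enum column can be declared like this;
    it is stored and compared as a number while still reading back as a string:</p>
<pre>Browser Enum8('Other' = 0, 'Chrome' = 1, 'Firefox' = 2, 'Safari' = 3)</pre>
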
<p>The second is that the dataset contains redundant fields like Year, Quarter, Month, DayOfMonth, DayOfWeek. In fact a
    single FlightDate would be enough. Most likely they have been added to improve performance on other DBMSs
    whose DateTime handling functions may be less efficient.</p>

<p class="tip"><b>✯</b> ClickHouse <a
|
|||
|
href="/reference_en.html#Functions%20for%20working%20with%20dates%20and%20times">functions
|
|||
|
for operating with DateTime fields</a> are well-optimized so such redundancy is not required. Anyway much
|
|||
|
columns is not a reason to worry — ClickHouse is a <a href="https://en.wikipedia.org/wiki/Column-oriented_DBMS">column-oriented
|
|||
|
DBMS</a>. This allows you to have as much fields as you need. Hundreds of columns in a table is fine for
|
|||
|
ClickHouse.</p>
|
|||
|
|
|||
|
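<p>As a small sketch of those functions, the redundant calendar columns could be derived from FlightDate on the fly:</p>
<pre>
SELECT
    toYear(FlightDate) AS y,
    toMonth(FlightDate) AS m,
    toDayOfWeek(FlightDate) AS dow,
    count() AS flights
FROM ontime
GROUP BY y, m, dow
ORDER BY y, m, dow
LIMIT 10
</pre>
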
<h3>Querying the sample dataset</h3>

<p>Here are some example queries against the test data.</p>

<ul>
    <li>
        <div class="spoiler"><a class="spoiler_title">the most popular destinations in 2015;</a>
            <div class="spoiler_body">
<pre>
SELECT
    OriginCityName,
    DestCityName,
    count(*) AS flights,
    bar(flights, 0, 20000, 40)
FROM ontime WHERE Year = 2015 GROUP BY OriginCityName, DestCityName ORDER BY flights DESC LIMIT 20
</pre>
                <img src="https://habrastorage.org/files/a85/18a/200/a8518a200d6d405a95ee80ea1c8e1c90.png"/>
<pre>
SELECT
    OriginCityName < DestCityName ? OriginCityName : DestCityName AS a,
    OriginCityName < DestCityName ? DestCityName : OriginCityName AS b,
    count(*) AS flights,
    bar(flights, 0, 40000, 40)
FROM ontime WHERE Year = 2015 GROUP BY a, b ORDER BY flights DESC LIMIT 20
</pre>
                <img src="https://habrastorage.org/files/d35/78d/b55/d3578db55e304bd7b5eba818abdb53f5.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">the most popular cities of departure;</a>
            <div class="spoiler_body">
<pre>
SELECT OriginCityName, count(*) AS flights
FROM ontime GROUP BY OriginCityName ORDER BY flights DESC LIMIT 20
</pre>
                <img src="https://habrastorage.org/files/ef4/141/f34/ef4141f348234773a5349c4bd3e8f804.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">cities of departure which offer the maximum variety of
            destinations;</a>
            <div class="spoiler_body">
<pre>
SELECT OriginCityName, uniq(Dest) AS u
FROM ontime GROUP BY OriginCityName ORDER BY u DESC LIMIT 20
</pre>
                <img src="https://habrastorage.org/files/240/9f4/9d1/2409f49d11fb4aa1b8b5ff34cf9ca75d.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">flight delay dependence on the day of week;</a>
            <div class="spoiler_body">
<pre>
SELECT DayOfWeek, count() AS c, avg(DepDelay > 60) AS delays
FROM ontime GROUP BY DayOfWeek ORDER BY DayOfWeek
</pre>
                <img src="https://habrastorage.org/files/885/e50/793/885e507930e34b7c8f788d25e7ca2bcf.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">cities of departure with the most frequent delays of 1 hour or
            longer;</a>
            <div class="spoiler_body">
<pre>
SELECT OriginCityName, count() AS c, avg(DepDelay > 60) AS delays
FROM ontime
GROUP BY OriginCityName
HAVING c > 100000
ORDER BY delays DESC
LIMIT 20
</pre>
                <img src="https://habrastorage.org/files/ac2/926/56d/ac292656d03946d0aba35c75783a31f2.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">flights of maximum duration;</a>
            <div class="spoiler_body">
<pre>
SELECT OriginCityName, DestCityName, count(*) AS flights, avg(AirTime) AS duration
FROM ontime
GROUP BY OriginCityName, DestCityName
ORDER BY duration DESC
LIMIT 20
</pre>
                <img src="https://habrastorage.org/files/7b3/c2e/685/7b3c2e685832439b8c373bf2015131d2.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">distribution of arrival time delays split by airlines;</a>
            <div class="spoiler_body">
<pre>
SELECT Carrier, count() AS c, round(quantileTDigest(0.99)(DepDelay), 2) AS q
FROM ontime GROUP BY Carrier ORDER BY q DESC
</pre>
                <img src="https://habrastorage.org/files/49c/332/e3d/49c332e3d93146ba8f46beef6b2b02b0.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">airlines that stopped operating flights;</a>
            <div class="spoiler_body">
<pre>
SELECT Carrier, min(Year), max(Year), count()
FROM ontime GROUP BY Carrier HAVING max(Year) < 2015 ORDER BY count() DESC
</pre>
                <img src="https://habrastorage.org/files/249/56f/1a2/24956f1a2efc48d78212586958aa036c.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">most trending destination cities in 2015;</a>
            <div class="spoiler_body">
<pre>
SELECT
    DestCityName,
    sum(Year = 2014) AS c2014,
    sum(Year = 2015) AS c2015,
    c2015 / c2014 AS diff
FROM ontime
WHERE Year IN (2014, 2015)
GROUP BY DestCityName
HAVING c2014 > 10000 AND c2015 > 1000 AND diff > 1
ORDER BY diff DESC
</pre>
                <img src="https://habrastorage.org/files/f31/32f/4d1/f3132f4d1c0d42eab26d9111afe7771a.png"/></div>
        </div>
    </li>
    <li>
        <div class="spoiler"><a class="spoiler_title">destination cities with maximum popularity-season
            dependency.</a>
            <div class="spoiler_body">
<pre>
SELECT
    DestCityName,
    any(total),
    avg(abs(monthly * 12 - total) / total) AS avg_month_diff
FROM
(
    SELECT DestCityName, count() AS total
    FROM ontime GROUP BY DestCityName HAVING total > 100000
)
ALL INNER JOIN
(
    SELECT DestCityName, Month, count() AS monthly
    FROM ontime GROUP BY DestCityName, Month HAVING monthly > 10000
)
USING DestCityName
GROUP BY DestCityName
ORDER BY avg_month_diff DESC
LIMIT 20
</pre>
                <img src="https://habrastorage.org/files/26b/2c7/aae/26b2c7aae21a4c76800cb1c7a33a374d.png"/></div>
        </div>
    </li>
</ul>

<h3>ClickHouse deployment to a cluster</h3>
<p>A ClickHouse cluster is a homogeneous cluster. Steps to set it up:
<ol>
    <li>Install ClickHouse server on all machines of the cluster</li>
    <li>Set up cluster configs in the configuration file</li>
    <li>Create local tables on each instance</li>
    <li>Create a <a href="/reference_en.html#Distributed">Distributed table</a></li>
</ol>
</p>

<p><a href="/reference_en.html#Distributed">Distributed-table</a> is actually a kind of
|
|||
|
"view" to local tables of ClickHouse cluster. SELECT query from a distributed table will be executed using
|
|||
|
resources of all cluster's shards. You may specify configs for multiple clusters and create multiple
|
|||
|
Distributed-tables providing views to different clusters.</p>
|
|||
|
|
|||
|
<div class="spoiler"><a class="spoiler_title">Config for cluster of three shards. Each shard stores data on a single
|
|||
|
replica</a>
|
|||
|
<div class="spoiler_body">
|
|||
|
<pre>
|
|||
|
<remote_servers>
|
|||
|
<perftest_3shards_1replicas>
|
|||
|
<shard>
|
|||
|
<replica>
|
|||
|
<host>example-perftest01j.yandex.ru</host>
|
|||
|
<port>9000</port>
|
|||
|
</replica>
|
|||
|
</shard>
|
|||
|
<shard>
|
|||
|
<replica>
|
|||
|
<host>example-perftest02j.yandex.ru</host>
|
|||
|
<port>9000</port>
|
|||
|
</replica>
|
|||
|
</shard>
|
|||
|
<shard>
|
|||
|
<replica>
|
|||
|
<host>example-perftest03j.yandex.ru</host>
|
|||
|
<port>9000</port>
|
|||
|
</replica>
|
|||
|
</shard>
|
|||
|
</perftest_3shards_1replicas>
|
|||
|
</remote_servers>
|
|||
|
</pre>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
Creating a local table:
|
|||
|
<pre>CREATE TABLE ontime_local (...) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);</pre>
|
|||
|
Creating a distributed table providing a view into local tables of the cluster:
|
|||
|
<pre>CREATE TABLE ontime_all AS ontime_local
|
|||
|
ENGINE = Distributed(perftest_3shards_1replicas, default, ontime_local, rand());</pre>
|
|||
|
|
|||
|
<p>You can create a Distributed table on all machines of the cluster. This allows you to run distributed queries from
    any machine of the cluster. Besides the Distributed table you can also use the <a
        href="/reference_en.html#remote">remote table function</a>.</p>

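<p>As a sketch of the latter (reusing a host name from the example config above), the remote table function lets you
    query a table on another server without defining a Distributed table first:</p>
<pre>SELECT count() FROM remote('example-perftest01j.yandex.ru', default.ontime_local)</pre>
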
<p>Let's run an <a href="/reference_en.html#INSERT">INSERT SELECT</a> into the Distributed table
    to spread the table across multiple servers.</p>

<pre>INSERT INTO ontime_all SELECT * FROM ontime;</pre>

<p class="tip"><b>⚠</b> Worth to notice that the approach given above wouldn't fit for sharding of large
|
|||
|
tables.<br/>Please use <a href="/reference_en.html#Resharding">built-in sharding
|
|||
|
feature</a>.</p>
|
|||
|
|
|||
|
<p>As you would expect, heavy queries execute N times faster when launched on three servers instead of one.</p>
<div class="spoiler"><a class="spoiler_title">See here</a>
|
|||
|
<div class="spoiler_body">
|
|||
|
<img src="https://habrastorage.org/files/ece/020/129/ece020129fdf4a18a6e75daf2e699cb9.png"/>
|
|||
|
|
|||
|
<p>You may have noticed that quantiles calculation are slightly different. This happens due to <a
|
|||
|
href="https://github.com/tdunning/t-digest/raw/master/docs/t-digest-paper/histo.pdf">t-digest</a>
|
|||
|
algorithm implementation which is non-deterministic — it depends on the order of data processing.</p>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
<p>In this case we have used a cluster with 3 shards, each containing a single replica.</p>

<p>To provide resilience in a production environment we recommend that each shard contain 2-3 replicas
    distributed between multiple data centers. Note that ClickHouse supports an unlimited number of replicas.</p>

<div class="spoiler"><a class="spoiler_title">Config for cluster of one shard containing three replicas</a>
|
|||
|
<div class="spoiler_body">
|
|||
|
<pre>
|
|||
|
<remote_servers>
|
|||
|
...
|
|||
|
<perftest_1shards_3replicas>
|
|||
|
<shard>
|
|||
|
<replica>
|
|||
|
<host>example-perftest01j.yandex.ru</host>
|
|||
|
<port>9000</port>
|
|||
|
</replica>
|
|||
|
<replica>
|
|||
|
<host>example-perftest02j.yandex.ru</host>
|
|||
|
<port>9000</port>
|
|||
|
</replica>
|
|||
|
<replica>
|
|||
|
<host>example-perftest03j.yandex.ru</host>
|
|||
|
<port>9000</port>
|
|||
|
</replica>
|
|||
|
</shard>
|
|||
|
</perftest_1shards_3replicas>
|
|||
|
</remote_servers>
|
|||
|
</pre>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
<p>To enable replication, <a href="http://zookeeper.apache.org/">ZooKeeper</a> is required. ClickHouse takes care
    of data consistency on all replicas and runs the restore procedure after a failure automatically. It's recommended
    to deploy the ZooKeeper cluster on separate servers.</p>

<p>ZooKeeper is not a strict requirement — in some simple cases you can duplicate the data by writing it into all the
    replicas from your application code. This approach is not recommended: in this case ClickHouse is not able to
    guarantee data consistency on all replicas, which remains the responsibility of your application.</p>

<div class="spoiler"><a class="spoiler_title">Set ZooKeeper locations in configuration file</a>
|
|||
|
<div class="spoiler_body">
|
|||
|
<pre>
|
|||
|
<zookeeper-servers>
|
|||
|
<node>
|
|||
|
<host>zoo01.yandex.ru</host>
|
|||
|
<port>2181</port>
|
|||
|
</node>
|
|||
|
<node>
|
|||
|
<host>zoo02.yandex.ru</host>
|
|||
|
<port>2181</port>
|
|||
|
</node>
|
|||
|
<node>
|
|||
|
<host>zoo03.yandex.ru</host>
|
|||
|
<port>2181</port>
|
|||
|
</node>
|
|||
|
</zookeeper-servers>
|
|||
|
</pre>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
<p>We also need to set macros identifying the shard and the replica — they will be used on table creation:</p>
<pre>
<macros>
    <shard>01</shard>
    <replica>01</replica>
</macros>
</pre>
<p>If there are no replicas at the moment of replicated table creation, a new first replica is instantiated.
    If there are already live replicas, the new replica clones the data from the existing ones. You have the option to
    create all replicated tables first and then insert data into them. Another option is to create some replicas and add
    the others after or during data insertion.</p>

<pre>
CREATE TABLE ontime_replica (...)
ENGINE = ReplicatedMergeTree(
    '/clickhouse_perftest/tables/{shard}/ontime',
    '{replica}',
    FlightDate,
    (Year, FlightDate),
    8192);
</pre>
<p>Here we use the <a href="/reference_en.html#ReplicatedMergeTree">ReplicatedMergeTree</a>
    table type. In the parameters we specify the ZooKeeper path containing the shard and replica identifiers.</p>

<pre>INSERT INTO ontime_replica SELECT * FROM ontime;</pre>
<p>Replication operates in multi-master mode. Data can be loaded into any replica, and it is synced with the other
    instances automatically. Replication is asynchronous, so at a given moment not all replicas may contain
    recently inserted data. At least one replica must be up to allow data insertion. The others will sync the data and
    repair consistency once they become active again. Note that this scheme allows for the possibility
    of losing just-inserted data.</p>

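<p>To check that the data has reached a given replica you can, for example, compare row counts — a simple sanity
    check to run on each host:</p>
<pre>SELECT count() FROM ontime_replica</pre>
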
<h3>Feedback</h3>

<p>Ask any questions on <a href="https://stackoverflow.com/questions/tagged/clickhouse" rel="external nofollow">Stack
    Overflow</a>.</p>
<p>Discuss with real users in the Telegram chat in <a href="https://telegram.me/clickhouse_en" rel="external nofollow">English</a>
    or in <a
        href="https://telegram.me/clickhouse_ru" rel="external nofollow">Russian</a>.</p>
<p>Use the <a href="https://groups.google.com/group/clickhouse" rel="external nofollow">Google Group</a> for discussion.
</p>
<p>Or send a private message to the developers:
    <a id="feedback_email" href="">turn on JavaScript to see the email address</a>.
</p>
<p class="warranty">Software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|||
|
either express or implied.</p>
|
|||
|
|
|||
|
<p class="footer">© 2016–2017 <a href="https://yandex.com/company/" rel="external nofollow">YANDEX</a> LLC</p>
|
|||
|
</div>
|
|||
|
|
|||
|
<script type="text/javascript" src="https://yastatic.net/jquery/3.1.1/jquery.min.js"></script>
|
|||
|
<script type="text/javascript">
|
|||
|
function getParams() {
|
|||
|
var matches = document.cookie.match(/yandex_login=([\w\-]+)/);
|
|||
|
return (matches && matches.length == 2) ? {"login": matches[1]} : {};
|
|||
|
}
|
|||
|
$('.spoiler_title').click(function () {
|
|||
|
$(this).next('.spoiler_body').toggle(100);
|
|||
|
});
|
|||
|
|
|||
|
var name = document.getElementById('main_title').textContent.trim().toLowerCase();
|
|||
|
var feedback_address = name + '-feedback' + '@yandex-team.com';
|
|||
|
var feedback_email = document.getElementById('feedback_email');
|
|||
|
feedback_email.setAttribute('href', 'mailto:' + feedback_address);
|
|||
|
feedback_email.textContent = feedback_address;
|
|||
|
</script>
|
|||
|
<!-- Yandex.Metrika counter -->
<script src="https://mc.yandex.ru/metrika/watch.js" type="text/javascript"></script>
<script type="text/javascript">
    try {
        var yaCounter18343495 = new Ya.Metrika({
            id: 18343495,
            webvisor: true,
            clickmap: true,
            trackLinks: true,
            accurateTrackBounce: true,
            trackHash: true,
            params: getParams()
        });
    } catch (e) {
    }
</script>
<noscript>
    <div><img src="https://mc.yandex.ru/watch/18343495" style="position:absolute; left:-9999px;" alt=""/></div>
</noscript>
<!-- /Yandex.Metrika counter -->

</body>
</html>