mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-14 18:32:29 +00:00
839 lines
33 KiB
HTML
839 lines
33 KiB
HTML
<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
|
||
<meta charset="utf-8"/>
|
||
<title>ClickHouse — quick start guide</title>
|
||
|
||
<link rel="shortcut icon" href="favicon.ico"/>
|
||
|
||
<meta name="description" content="Quick start guide to ClickHouse — open-source distributed column-oriented DBMS"/>
|
||
<meta name="keywords"
|
||
content="ClickHouse, DBMS, OLAP, relational, analytics, analytical, big data, open-source, SQL, web-analytics"/>
|
||
|
||
<style type="text/css">
|
||
@font-face {
|
||
font-family: 'Yandex Sans Text Web';
|
||
src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot);
|
||
src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'),
|
||
url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'),
|
||
url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'),
|
||
url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'),
|
||
url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg');
|
||
font-weight: 700;
|
||
font-style: normal;
|
||
font-stretch: normal
|
||
}
|
||
|
||
@font-face {
|
||
font-family: 'Yandex Sans Text Web';
|
||
src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot);
|
||
src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'),
|
||
url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'),
|
||
url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'),
|
||
url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'),
|
||
url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg');
|
||
font-weight: 400;
|
||
font-style: normal;
|
||
font-stretch: normal
|
||
}
|
||
|
||
@font-face {
|
||
font-family: 'Yandex Sans Text Web';
|
||
src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot);
|
||
src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'),
|
||
url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'),
|
||
url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'),
|
||
url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'),
|
||
url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg');
|
||
font-weight: 300;
|
||
font-style: normal;
|
||
font-stretch: normal
|
||
}
|
||
|
||
@font-face {
|
||
font-family: 'Yandex Sans Display Web';
|
||
src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot);
|
||
src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'),
|
||
url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'),
|
||
url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'),
|
||
url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'),
|
||
url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg');
|
||
font-weight: 400;
|
||
font-style: normal;
|
||
font-stretch: normal
|
||
}
|
||
|
||
@font-face {
|
||
font-family: 'Yandex Sans Display Web';
|
||
src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot);
|
||
src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'),
|
||
url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'),
|
||
url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'),
|
||
url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'),
|
||
url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg');
|
||
font-weight: 300;
|
||
font-style: normal;
|
||
font-stretch: normal
|
||
}
|
||
|
||
body {
|
||
background: #fff;
|
||
font: 300 12pt/150% 'Yandex Sans Text Web', Arial, sans-serif;
|
||
}
|
||
|
||
.page {
|
||
width: 900px;
|
||
margin: auto;
|
||
}
|
||
|
||
h1 {
|
||
font-family: 'Yandex Sans Display Web', Arial, sans-serif;
|
||
font-size: 100px;
|
||
font-weight: normal;
|
||
margin-top: 100px;
|
||
margin-bottom: 0;
|
||
text-align: center;
|
||
padding-top: 27px;
|
||
}
|
||
|
||
.title_link, .title_link:active, .title_link:visited, .title_link:link, .title_link:hover {
|
||
text-decoration: none;
|
||
color: #000;
|
||
}
|
||
|
||
h2 {
|
||
font: normal 50px 'Yandex Sans Display Web', Arial, sans-serif;
|
||
text-align: center;
|
||
margin-top: 35px;
|
||
margin-bottom: 50px;
|
||
}
|
||
|
||
h3 {
|
||
font: normal 24px 'Yandex Sans Display Web', Arial, sans-serif;
|
||
margin-top: 36px;
|
||
}
|
||
|
||
a:link, a:visited {
|
||
color: #08f;
|
||
text-decoration: none;
|
||
}
|
||
|
||
a:hover, a:active {
|
||
color: #f00;
|
||
text-decoration: underline;
|
||
}
|
||
|
||
.footer {
|
||
text-align: right;
|
||
margin-top: 40px;
|
||
border-top: 1px solid #EEE;
|
||
padding: 10px 0 0;
|
||
color: #888;
|
||
font-size: 10pt;
|
||
}
|
||
|
||
pre {
|
||
font: 13px/18px monospace, "Courier New";
|
||
display: block;
|
||
border-left: 5px solid #ffdb4d;
|
||
padding: 5px 10px;
|
||
background-color: #FFF8E8;
|
||
}
|
||
|
||
.spoiler {
|
||
margin-bottom: 10px;
|
||
}
|
||
|
||
.spoiler_body {
|
||
display: none;
|
||
}
|
||
|
||
.spoiler_title {
|
||
color: #08f;
|
||
border-bottom: 1px dotted #08f;
|
||
}
|
||
|
||
.spoiler_title:hover {
|
||
cursor: pointer;
|
||
color: #f00;
|
||
border-bottom: 1px dashed #f00;
|
||
text-decoration: none;
|
||
}
|
||
|
||
.tip {
|
||
background-color: #EEE;
|
||
border: 1px solid #EEE;
|
||
padding: 5px 10px 5px 10px;
|
||
}
|
||
|
||
.tip b {
|
||
font-size: 150%;
|
||
color: #888;
|
||
}
|
||
|
||
.warranty {
|
||
font-size: 10pt;
|
||
color: #888;
|
||
line-height: 150%;
|
||
}
|
||
|
||
.orange {
|
||
fill: #fc0;
|
||
}
|
||
|
||
.red {
|
||
fill: #f00
|
||
}
|
||
</style>
|
||
</head>
|
||
<body>
|
||
|
||
<div class="page">
|
||
|
||
<div>
|
||
<div style="float: left; margin-right: -100%; margin-top: 0; margin-left: 3px;">
|
||
<a href="/">
|
||
<svg xmlns="http://www.w3.org/2000/svg" width="90" height="80" viewBox="0 0 9 8">
|
||
<path class="red" d="M0,7 h1 v1 h-1 z"></path>
|
||
<path class="orange" d="M0,0 h1 v7 h-1 z"></path>
|
||
<path class="orange" d="M2,0 h1 v8 h-1 z"></path>
|
||
<path class="orange" d="M4,0 h1 v8 h-1 z"></path>
|
||
<path class="orange" d="M6,0 h1 v8 h-1 z"></path>
|
||
<path class="orange" d="M8,3.25 h1 v1.5 h-1 z"></path>
|
||
</svg>
|
||
</a>
|
||
</div>
|
||
|
||
<h1 id="main_title"><a class="title_link" href="/">ClickHouse</a></h1>
|
||
<h2>Tutorial</h2>
|
||
</div>
|
||
|
||
<p>Let's get started with a sample dataset from open sources. We will use USA civil flights data from 1987 to 2015.
|
||
It's hard to call this sample Big Data (it contains 166 million rows, 63 GB of uncompressed data) but this
|
||
allows us to quickly get to work. Dataset is available for download <a href="https://yadi.sk/d/pOZxpa42sDdgm">here</a>.
|
||
Also you may download it from the original datasource <a
|
||
href="https://github.com/yandex/ClickHouse/raw/master/doc/example_datasets/1_ontime.txt"
|
||
rel="external nofollow">as described
|
||
here</a>.</p>
|
||
|
||
<p>Firstly we will deploy ClickHouse to a single server. Below that we will also review the process of deployment to
|
||
a cluster with support for sharding and replication.</p>
|
||
|
||
<p>On Ubuntu and Debian Linux ClickHouse can be installed from <a href="/#download">packages</a>.
|
||
For other Linux distributions you can <a href="https://github.com/yandex/ClickHouse/blob/master/doc/build.md"
|
||
rel="external nofollow">compile
|
||
ClickHouse from sources</a> and then install.</p>
|
||
|
||
<p><b>clickhouse-client</b> package contains <a
|
||
href="docs/en/interfaces/cli.html">clickhouse-client</a> application —
|
||
interactive ClickHouse client. <b>clickhouse-server-base</b> contains a clickhouse-server binary file. <b>clickhouse-server-common</b>
|
||
— contains config files for the clickhouse-server.</p>
|
||
|
||
<p>Server config files are located in /etc/clickhouse-server/. Before getting to work please notice the <b>path</b>
|
||
element in config. <b>Path</b> determines the location for data storage. It's not really handy to directly
|
||
edit <b>config.xml</b> file considering package updates. Recommended way is to override the config elements in
|
||
<a href="docs/en/configuration_files.html">files of config.d directory</a>.
|
||
Also you may want to <a href="docs/en/access_rights.html">set up access
|
||
rights</a> at the start.</p>
|
||
|
||
<p><b>clickhouse-server</b> won't be launched automatically after package installation. It won't be automatically
|
||
restarted after updates either. Start the server with:
|
||
<pre>sudo service clickhouse-server start</pre>
|
||
Default location for server logs is /var/log/clickhouse-server/
|
||
Server is ready to handle client connections once the "Ready for connections" message has been logged.</p>
|
||
|
||
<p>Use <b>clickhouse-client</b> to connect to the server.</p>
|
||
|
||
<div class="spoiler"><a class="spoiler_title">Tips for clickhouse-client</a>
|
||
<div class="spoiler_body">
|
||
Interactive mode:
|
||
<pre>
|
||
clickhouse-client
|
||
clickhouse-client --host=... --port=... --user=... --password=...
|
||
</pre>
|
||
Enable multiline queries:
|
||
<pre>
|
||
clickhouse-client -m
|
||
clickhouse-client --multiline
|
||
</pre>
|
||
Run queries in batch-mode:
|
||
<pre>
|
||
clickhouse-client --query='SELECT 1'
|
||
echo 'SELECT 1' | clickhouse-client
|
||
</pre>
|
||
Insert data from a file in a specified format:
|
||
<pre>
|
||
clickhouse-client --query='INSERT INTO table VALUES' &lt; data.txt
|
||
clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' &lt; data.tsv
|
||
</pre>
|
||
</div>
|
||
</div>
|
||
|
||
<h3>Create table for sample dataset</h3>
|
||
<div class="spoiler"><a class="spoiler_title">Create table query</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
$ clickhouse-client --multiline
|
||
ClickHouse client version 0.0.53720.
|
||
Connecting to localhost:9000.
|
||
Connected to ClickHouse server version 0.0.53720.
|
||
|
||
:) CREATE TABLE ontime
|
||
(
|
||
Year UInt16,
|
||
Quarter UInt8,
|
||
Month UInt8,
|
||
DayofMonth UInt8,
|
||
DayOfWeek UInt8,
|
||
FlightDate Date,
|
||
UniqueCarrier FixedString(7),
|
||
AirlineID Int32,
|
||
Carrier FixedString(2),
|
||
TailNum String,
|
||
FlightNum String,
|
||
OriginAirportID Int32,
|
||
OriginAirportSeqID Int32,
|
||
OriginCityMarketID Int32,
|
||
Origin FixedString(5),
|
||
OriginCityName String,
|
||
OriginState FixedString(2),
|
||
OriginStateFips String,
|
||
OriginStateName String,
|
||
OriginWac Int32,
|
||
DestAirportID Int32,
|
||
DestAirportSeqID Int32,
|
||
DestCityMarketID Int32,
|
||
Dest FixedString(5),
|
||
DestCityName String,
|
||
DestState FixedString(2),
|
||
DestStateFips String,
|
||
DestStateName String,
|
||
DestWac Int32,
|
||
CRSDepTime Int32,
|
||
DepTime Int32,
|
||
DepDelay Int32,
|
||
DepDelayMinutes Int32,
|
||
DepDel15 Int32,
|
||
DepartureDelayGroups String,
|
||
DepTimeBlk String,
|
||
TaxiOut Int32,
|
||
WheelsOff Int32,
|
||
WheelsOn Int32,
|
||
TaxiIn Int32,
|
||
CRSArrTime Int32,
|
||
ArrTime Int32,
|
||
ArrDelay Int32,
|
||
ArrDelayMinutes Int32,
|
||
ArrDel15 Int32,
|
||
ArrivalDelayGroups Int32,
|
||
ArrTimeBlk String,
|
||
Cancelled UInt8,
|
||
CancellationCode FixedString(1),
|
||
Diverted UInt8,
|
||
CRSElapsedTime Int32,
|
||
ActualElapsedTime Int32,
|
||
AirTime Int32,
|
||
Flights Int32,
|
||
Distance Int32,
|
||
DistanceGroup UInt8,
|
||
CarrierDelay Int32,
|
||
WeatherDelay Int32,
|
||
NASDelay Int32,
|
||
SecurityDelay Int32,
|
||
LateAircraftDelay Int32,
|
||
FirstDepTime String,
|
||
TotalAddGTime String,
|
||
LongestAddGTime String,
|
||
DivAirportLandings String,
|
||
DivReachedDest String,
|
||
DivActualElapsedTime String,
|
||
DivArrDelay String,
|
||
DivDistance String,
|
||
Div1Airport String,
|
||
Div1AirportID Int32,
|
||
Div1AirportSeqID Int32,
|
||
Div1WheelsOn String,
|
||
Div1TotalGTime String,
|
||
Div1LongestGTime String,
|
||
Div1WheelsOff String,
|
||
Div1TailNum String,
|
||
Div2Airport String,
|
||
Div2AirportID Int32,
|
||
Div2AirportSeqID Int32,
|
||
Div2WheelsOn String,
|
||
Div2TotalGTime String,
|
||
Div2LongestGTime String,
|
||
Div2WheelsOff String,
|
||
Div2TailNum String,
|
||
Div3Airport String,
|
||
Div3AirportID Int32,
|
||
Div3AirportSeqID Int32,
|
||
Div3WheelsOn String,
|
||
Div3TotalGTime String,
|
||
Div3LongestGTime String,
|
||
Div3WheelsOff String,
|
||
Div3TailNum String,
|
||
Div4Airport String,
|
||
Div4AirportID Int32,
|
||
Div4AirportSeqID Int32,
|
||
Div4WheelsOn String,
|
||
Div4TotalGTime String,
|
||
Div4LongestGTime String,
|
||
Div4WheelsOff String,
|
||
Div4TailNum String,
|
||
Div5Airport String,
|
||
Div5AirportID Int32,
|
||
Div5AirportSeqID Int32,
|
||
Div5WheelsOn String,
|
||
Div5TotalGTime String,
|
||
Div5LongestGTime String,
|
||
Div5WheelsOff String,
|
||
Div5TailNum String
|
||
)
|
||
ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);
|
||
</pre>
|
||
</div>
|
||
</div>
|
||
|
||
<p>Now we have a table of <a href="docs/en/table_engines/mergetree.html">MergeTree type</a>.
|
||
MergeTree table type is recommended for usage in production. Table of this kind has a primary key used for
|
||
incremental sort of table data. This allows fast execution of queries in ranges of a primary key.</p>
|
||
|
||
|
||
<p><b>Note</b>
|
||
We store ad network banners impressions logs in ClickHouse. Each table entry looks like:
|
||
<pre>
|
||
[Advertiser ID, Impression ID, attribute1, attribute2, …]</pre>
|
||
Let's assume that our aim is to provide a set of reports for each advertiser. Common and frequently demanded query
|
||
would be to count impressions for a specific Advertiser ID. This means that table primary key should start with
|
||
<pre>
|
||
Advertiser ID</pre>. In this case ClickHouse needs to read smaller amount of data to perform the query for a
|
||
given
|
||
<pre>
|
||
Advertiser ID</pre>.
|
||
</p>
|
||
|
||
<h3>Load data</h3>
|
||
<pre>xz -v -c -d &lt; ontime.csv.xz | clickhouse-client --query="INSERT INTO ontime FORMAT CSV"</pre>
|
||
<p>ClickHouse INSERT query allows loading data in any <a href="docs/en/formats/index.html">supported
|
||
format</a>. Data load requires just O(1) RAM consumption. INSERT query can receive any data volume as input.
|
||
It's strongly recommended to insert data with <a
|
||
href="docs/en/introduction/performance.html#performance-on-data-insertion">not too small
|
||
size blocks</a>. Notice that insert of blocks with size up to max_insert_block_size (= 1 048 576
|
||
rows by default) is an atomic operation: data block will be inserted completely or not inserted at all. In case
|
||
of disconnect during insert operation you may not know if the block was inserted successfully. To achieve
|
||
exactly-once semantics ClickHouse supports idempotency for <a
|
||
href="docs/en/table_engines/replication.html">replicated tables</a>. This means
|
||
that you may retry the insert of the same data block (possibly on different replicas) but this block will be
|
||
inserted just once. Anyway, in this guide we will load data from our localhost so we need not worry about data
|
||
blocks generation and exactly-once semantics.</p>
|
||
|
||
<p>INSERT query into tables of MergeTree type is non-blocking (and so is a SELECT query). You can execute SELECT
|
||
queries right after or during insert operation.</p>
|
||
|
||
<p>Our sample dataset is not quite optimal. There are two reasons.</p>
|
||
|
||
<p>The first is that String data type is used in cases when <a
|
||
href="docs/en/data_types/enum.html">Enum</a> or numeric type would fit best.</p>
|
||
|
||
<p class="tip"><b>⚖</b> When the set of possible values is determined and known to be small (e.g. OS name, browser
|
||
vendors etc.) it's recommended to use Enums or numbers to improve performance.
|
||
When the set of possible values is not limited (search query, URL, etc.) just go ahead with String.</p>
|
||
|
||
<p>The second is that dataset contains redundant fields like Year, Quarter, Month, DayOfMonth, DayOfWeek. In fact a
|
||
single FlightDate would be enough. Most likely they have been added to improve performance for other DBMS'es
|
||
whose DateTime handling functions may not be efficient.</p>
|
||
|
||
<p class="tip"><b>✯</b> ClickHouse <a
|
||
href="docs/en/functions/date_time_functions.html">functions
|
||
for operating with DateTime fields</a> are well-optimized so such redundancy is not required. Anyway, a large number of
|
||
columns is not a reason to worry — ClickHouse is a <a href="https://en.wikipedia.org/wiki/Column-oriented_DBMS"
|
||
rel="external nofollow">column-oriented
|
||
DBMS</a>. This allows you to have as many fields as you need. Hundreds of columns in a table is fine for
|
||
ClickHouse.</p>
|
||
|
||
<h3>Querying the sample dataset</h3>
|
||
|
||
<p>Here are some examples of the queries from our test data.</p>
|
||
|
||
<ul>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">the most popular destinations in 2015;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT
|
||
OriginCityName,
|
||
DestCityName,
|
||
count(*) AS flights,
|
||
bar(flights, 0, 20000, 40)
|
||
FROM ontime WHERE Year = 2015 GROUP BY OriginCityName, DestCityName ORDER BY flights DESC LIMIT 20
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/a85/18a/200/a8518a200d6d405a95ee80ea1c8e1c90.png"/>
|
||
<pre>
|
||
SELECT
|
||
OriginCityName &lt; DestCityName ? OriginCityName : DestCityName AS a,
|
||
OriginCityName &lt; DestCityName ? DestCityName : OriginCityName AS b,
|
||
count(*) AS flights,
|
||
bar(flights, 0, 40000, 40)
|
||
FROM ontime WHERE Year = 2015 GROUP BY a, b ORDER BY flights DESC LIMIT 20
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/d35/78d/b55/d3578db55e304bd7b5eba818abdb53f5.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">the most popular cities of departure;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT OriginCityName, count(*) AS flights
|
||
FROM ontime GROUP BY OriginCityName ORDER BY flights DESC LIMIT 20
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/ef4/141/f34/ef4141f348234773a5349c4bd3e8f804.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">cities of departure which offer maximum variety of
|
||
destinations;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT OriginCityName, uniq(Dest) AS u
|
||
FROM ontime GROUP BY OriginCityName ORDER BY u DESC LIMIT 20
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/240/9f4/9d1/2409f49d11fb4aa1b8b5ff34cf9ca75d.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">flight delay dependence on the day of week;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT DayOfWeek, count() AS c, avg(DepDelay > 60) AS delays
|
||
FROM ontime GROUP BY DayOfWeek ORDER BY DayOfWeek
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/885/e50/793/885e507930e34b7c8f788d25e7ca2bcf.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">cities of departure with most frequent delays for 1 hour or
|
||
longer;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT OriginCityName, count() AS c, avg(DepDelay > 60) AS delays
|
||
FROM ontime
|
||
GROUP BY OriginCityName
|
||
HAVING c > 100000
|
||
ORDER BY delays DESC
|
||
LIMIT 20
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/ac2/926/56d/ac292656d03946d0aba35c75783a31f2.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">flights of maximum duration;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT OriginCityName, DestCityName, count(*) AS flights, avg(AirTime) AS duration
|
||
FROM ontime
|
||
GROUP BY OriginCityName, DestCityName
|
||
ORDER BY duration DESC
|
||
LIMIT 20
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/7b3/c2e/685/7b3c2e685832439b8c373bf2015131d2.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">distribution of arrival time delays split by airlines;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT Carrier, count() AS c, round(quantileTDigest(0.99)(DepDelay), 2) AS q
|
||
FROM ontime GROUP BY Carrier ORDER BY q DESC
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/49c/332/e3d/49c332e3d93146ba8f46beef6b2b02b0.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">airlines that stopped flight operations;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT Carrier, min(Year), max(Year), count()
|
||
FROM ontime GROUP BY Carrier HAVING max(Year) &lt; 2015 ORDER BY count() DESC
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/249/56f/1a2/24956f1a2efc48d78212586958aa036c.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">most trending destination cities in 2015;</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT
|
||
DestCityName,
|
||
sum(Year = 2014) AS c2014,
|
||
sum(Year = 2015) AS c2015,
|
||
c2015 / c2014 AS diff
|
||
FROM ontime
|
||
WHERE Year IN (2014, 2015)
|
||
GROUP BY DestCityName
|
||
HAVING c2014 > 10000 AND c2015 > 1000 AND diff > 1
|
||
ORDER BY diff DESC
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/f31/32f/4d1/f3132f4d1c0d42eab26d9111afe7771a.png"/></div>
|
||
</div>
|
||
</li>
|
||
<li>
|
||
<div class="spoiler"><a class="spoiler_title">destination cities with maximum popularity-season
|
||
dependency.</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
SELECT
|
||
DestCityName,
|
||
any(total),
|
||
avg(abs(monthly * 12 - total) / total) AS avg_month_diff
|
||
FROM
|
||
(
|
||
SELECT DestCityName, count() AS total
|
||
FROM ontime GROUP BY DestCityName HAVING total > 100000
|
||
)
|
||
ALL INNER JOIN
|
||
(
|
||
SELECT DestCityName, Month, count() AS monthly
|
||
FROM ontime GROUP BY DestCityName, Month HAVING monthly > 10000
|
||
)
|
||
USING DestCityName
|
||
GROUP BY DestCityName
|
||
ORDER BY avg_month_diff DESC
|
||
LIMIT 20
|
||
</pre>
|
||
<img src="https://habrastorage.org/files/26b/2c7/aae/26b2c7aae21a4c76800cb1c7a33a374d.png"/></div>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
|
||
<h3>ClickHouse deployment to cluster</h3>
|
||
<p>ClickHouse cluster is a homogeneous cluster. Steps to set up:
|
||
<ol>
|
||
<li>Install ClickHouse server on all machines of the cluster</li>
|
||
<li>Set up cluster configs in configuration file</li>
|
||
<li>Create local tables on each instance</li>
|
||
<li>Create a <a href="/reference_en.html#Distributed">Distributed table</a></li>
|
||
</ol>
|
||
</p>
|
||
|
||
<p><a href="/reference_en.html#Distributed">Distributed-table</a> is actually a kind of
|
||
"view" to local tables of ClickHouse cluster. SELECT query from a distributed table will be executed using
|
||
resources of all cluster's shards. You may specify configs for multiple clusters and create multiple
|
||
Distributed-tables providing views to different clusters.</p>
|
||
|
||
<div class="spoiler"><a class="spoiler_title">Config for cluster of three shards. Each shard stores data on a single
|
||
replica</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
&lt;remote_servers&gt;
|
||
&lt;perftest_3shards_1replicas&gt;
|
||
&lt;shard&gt;
|
||
&lt;replica&gt;
|
||
&lt;host&gt;example-perftest01j.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;9000&lt;/port&gt;
|
||
&lt;/replica&gt;
|
||
&lt;/shard&gt;
|
||
&lt;shard&gt;
|
||
&lt;replica&gt;
|
||
&lt;host&gt;example-perftest02j.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;9000&lt;/port&gt;
|
||
&lt;/replica&gt;
|
||
&lt;/shard&gt;
|
||
&lt;shard&gt;
|
||
&lt;replica&gt;
|
||
&lt;host&gt;example-perftest03j.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;9000&lt;/port&gt;
|
||
&lt;/replica&gt;
|
||
&lt;/shard&gt;
|
||
&lt;/perftest_3shards_1replicas&gt;
|
||
&lt;/remote_servers&gt;
|
||
</pre>
|
||
</div>
|
||
</div>
|
||
Creating a local table:
|
||
<pre>CREATE TABLE ontime_local (...) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);</pre>
|
||
Creating a distributed table providing a view into local tables of the cluster:
|
||
<pre>CREATE TABLE ontime_all AS ontime_local
|
||
ENGINE = Distributed(perftest_3shards_1replicas, default, ontime_local, rand());</pre>
|
||
|
||
<p>You can create a Distributed table on all machines in the cluster. This allows running distributed queries on
|
||
any machine of the cluster. Besides distributed table you can also use <a
|
||
href="/reference_en.html#remote">*remote* table function</a>.</p>
|
||
|
||
<p>Let's run <a href="/reference_en.html#INSERT">INSERT SELECT</a> into Distributed table
|
||
to spread the table to multiple servers.</p>
|
||
|
||
<pre>INSERT INTO ontime_all SELECT * FROM ontime;</pre>
|
||
|
||
<p class="tip"><b>⚠</b> It is worth noting that the approach given above is not suitable for sharding of large
|
||
tables.<br/>Please use <a href="/reference_en.html#Resharding">built-in sharding
|
||
feature</a>.</p>
|
||
|
||
<p>As you might expect, heavy queries are executed N times faster when launched on 3 servers instead of one.</p>
|
||
<div class="spoiler"><a class="spoiler_title">See here</a>
|
||
<div class="spoiler_body">
|
||
<img src="https://habrastorage.org/files/ece/020/129/ece020129fdf4a18a6e75daf2e699cb9.png"/>
|
||
|
||
<p>You may have noticed that the quantile calculation results are slightly different. This happens due to <a
|
||
href="https://github.com/tdunning/t-digest/raw/master/docs/t-digest-paper/histo.pdf">t-digest</a>
|
||
algorithm implementation which is non-deterministic — it depends on the order of data processing.</p>
|
||
</div>
|
||
</div>
|
||
|
||
<p>In this case we have used a cluster with 3 shards, each containing a single replica.</p>
|
||
|
||
<p>To provide for resilience in production environment we recommend that each shard should contain 2-3 replicas
|
||
distributed between multiple data-centers. Note that ClickHouse supports unlimited number of replicas.</p>
|
||
|
||
<div class="spoiler"><a class="spoiler_title">Config for cluster of one shard containing three replicas</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
&lt;remote_servers&gt;
|
||
...
|
||
&lt;perftest_1shards_3replicas&gt;
|
||
&lt;shard&gt;
|
||
&lt;replica&gt;
|
||
&lt;host&gt;example-perftest01j.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;9000&lt;/port&gt;
|
||
&lt;/replica&gt;
|
||
&lt;replica&gt;
|
||
&lt;host&gt;example-perftest02j.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;9000&lt;/port&gt;
|
||
&lt;/replica&gt;
|
||
&lt;replica&gt;
|
||
&lt;host&gt;example-perftest03j.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;9000&lt;/port&gt;
|
||
&lt;/replica&gt;
|
||
&lt;/shard&gt;
|
||
&lt;/perftest_1shards_3replicas&gt;
|
||
&lt;/remote_servers&gt;
|
||
</pre>
|
||
</div>
|
||
</div>
|
||
|
||
<p>To enable replication <a href="http://zookeeper.apache.org/" rel="external nofollow">ZooKeeper</a> is required.
|
||
ClickHouse will take care of data consistency on all replicas and run restore procedure after failure
|
||
automatically. It's recommended to deploy ZooKeeper cluster to separate servers.</p>
|
||
|
||
<p>ZooKeeper is not a requirement — in some simple cases you can duplicate the data by writing it into all the
|
||
replicas from your application code. This approach is not recommended — in this case ClickHouse is not able to
|
||
guarantee data consistency on all replicas. This remains the responsibility of your application.</p>
|
||
|
||
<div class="spoiler"><a class="spoiler_title">Set ZooKeeper locations in configuration file</a>
|
||
<div class="spoiler_body">
|
||
<pre>
|
||
&lt;zookeeper-servers&gt;
|
||
&lt;node&gt;
|
||
&lt;host&gt;zoo01.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;2181&lt;/port&gt;
|
||
&lt;/node&gt;
|
||
&lt;node&gt;
|
||
&lt;host&gt;zoo02.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;2181&lt;/port&gt;
|
||
&lt;/node&gt;
|
||
&lt;node&gt;
|
||
&lt;host&gt;zoo03.yandex.ru&lt;/host&gt;
|
||
&lt;port&gt;2181&lt;/port&gt;
|
||
&lt;/node&gt;
|
||
&lt;/zookeeper-servers&gt;
|
||
</pre>
|
||
</div>
|
||
</div>
|
||
|
||
<p>Also we need to set macros for identifying shard and replica — they will be used on table creation.</p>
|
||
<pre>
|
||
&lt;macros&gt;
|
||
&lt;shard&gt;01&lt;/shard&gt;
|
||
&lt;replica&gt;01&lt;/replica&gt;
|
||
&lt;/macros&gt;
|
||
</pre>
|
||
<p>If there are no replicas at the moment of replicated table creation — a new first replica will be instantiated.
|
||
If there are already live replicas — new replica will clone the data from existing ones. You have an option to
|
||
create all replicated tables first and then insert data into them. Another option is to create some replicas and add
|
||
the others after or during data insertion.</p>
|
||
|
||
<pre>
|
||
CREATE TABLE ontime_replica (...)
|
||
ENGINE = ReplicatedMergeTree(
|
||
'/clickhouse_perftest/tables/{shard}/ontime',
|
||
'{replica}',
|
||
FlightDate,
|
||
(Year, FlightDate),
|
||
8192);
|
||
</pre>
|
||
<p>Here we use <a href="docs/en/table_engines/replication.html#replicatedmergetree">ReplicatedMergeTree</a>
|
||
table type. In parameters we specify ZooKeeper path containing shard and replica identifiers.</p>
|
||
|
||
<pre>INSERT INTO ontime_replica SELECT * FROM ontime;</pre>
|
||
<p>Replication operates in multi-master mode. Data can be loaded into any replica — it will be synced with other
|
||
instances automatically. Replication is asynchronous so at a given moment of time not all replicas may contain
|
||
recently inserted data. To allow data insertion at least one replica should be up. Others will sync up data and
|
||
repair consistency once they become active again. Please note that such a scheme allows for the possibility
|
||
of losing recently appended data.</p>
|
||
|
||
<h3>Feedback</h3>
|
||
|
||
<p>Ask any questions on <a href="https://stackoverflow.com/questions/tagged/clickhouse" rel="external nofollow">Stack
|
||
Overflow</a>.</p>
|
||
<p>Discuss with real users in Telegram chat in <a href="https://telegram.me/clickhouse_en" rel="external nofollow">English</a>
|
||
or in <a
|
||
href="https://telegram.me/clickhouse_ru" rel="external nofollow">Russian</a>.</p>
|
||
<p>Use <a href="https://groups.google.com/group/clickhouse" rel="external nofollow">Google Group</a> for discussion.
|
||
</p>
|
||
<p>Or send private message to developers:
|
||
<a id="feedback_email" href="">turn on JavaScript to see email address</a>.
|
||
</p>
|
||
<p class="warranty">Software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||
either express or implied.</p>
|
||
|
||
<p class="footer">© 2016–2017 <a href="https://yandex.com/company/" rel="external nofollow">YANDEX</a> LLC</p>
|
||
</div>
|
||
|
||
<script type="text/javascript" src="https://yastatic.net/jquery/3.1.1/jquery.min.js"></script>
|
||
<script type="text/javascript">
|
||
// Page behaviour: spoiler toggling and feedback e-mail link construction.
// Everything runs inside an IIFE so that no variables become globals. In a
// classic (non-module) script a top-level `var name` is the browser's
// `window.name` property, which the original code overwrote.
(function () {
    // Clicking a spoiler title shows or hides the `.spoiler_body` element
    // that immediately follows it, animated over 100 ms.
    $('.spoiler_title').on('click', function () {
        $(this).next('.spoiler_body').toggle(100);
    });

    // The feedback address is assembled at runtime from the text of the main
    // title and written into the placeholder link.
    // NOTE(review): presumably done to keep the address out of the static
    // markup; confirm intent before changing.
    var titleElement = document.getElementById('main_title');
    var emailLink = document.getElementById('feedback_email');
    if (!titleElement || !emailLink) {
        // Nothing to fill in if either element is missing from the page.
        return;
    }

    var productName = titleElement.textContent.trim().toLowerCase();
    var feedbackAddress = productName + '-feedback' + '@yandex-team.com';
    emailLink.setAttribute('href', 'mailto:' + feedbackAddress);
    emailLink.textContent = feedbackAddress;
})();
|
||
</script>
|
||
<!-- Yandex.Metrika counter -->
|
||
<script type="text/javascript">
|
||
// Yandex.Metrika loader: queues the counter set-up callback on the window and
// inserts an async <script> element that loads tag.js.
(function (doc, win, callbacksKey) {
    // Reuse the callback queue if it already exists, otherwise create it.
    var callbackQueue = win[callbacksKey] = win[callbacksKey] || [];
    callbackQueue.push(function () {
        try {
            win.yaCounter18343495 = new Ya.Metrika2({
                id: 18343495,
                clickmap: true,
                trackLinks: true,
                accurateTrackBounce: true,
                webvisor: true
            });
        } catch (ignored) { }
    });

    // The loader element is inserted before the first <script> in the document.
    var firstScript = doc.getElementsByTagName("script")[0];
    var loaderScript = doc.createElement("script");
    loaderScript.type = "text/javascript";
    loaderScript.async = true;
    loaderScript.src = "https://mc.yandex.ru/metrika/tag.js";

    function injectLoader() {
        firstScript.parentNode.insertBefore(loaderScript, firstScript);
    }

    // Opera is detected via its `window.opera` object and defers insertion
    // until DOMContentLoaded; every other browser inserts immediately.
    if (win.opera == "[object Opera]") {
        doc.addEventListener("DOMContentLoaded", injectLoader, false);
    } else {
        injectLoader();
    }
})(document, window, "yandex_metrika_callbacks2");
|
||
</script>
|
||
<noscript>
|
||
<div><img src="https://mc.yandex.ru/watch/18343495" style="position:absolute; left:-9999px;" alt=""/></div>
|
||
</noscript>
|
||
<!-- /Yandex.Metrika counter -->
|
||
|
||
|
||
</body>
|
||
</html>
|