mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 09:32:01 +00:00
Tutorial: preparation [#METR-20000].
This commit is contained in:
parent
094900f86e
commit
a1bf8562bf
@ -1,16 +1,174 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="ru">
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<title>ClickHouse — руководство</title>
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<title>ClickHouse — руководство</title>
|
||||
|
||||
<link rel="shortcut icon" href="favicon.ico"/>
|
||||
<link rel="stylesheet" href="reference.css"/>
|
||||
<link rel="shortcut icon" href="favicon.ico"/>
|
||||
|
||||
<meta name="description" content="ClickHouse — open-source distributed column-oriented DBMS"/>
|
||||
<meta name="keywords" content="ClickHouse, DBMS, OLAP, relational, analytics, analytical, big data, open-source, SQL, web-analytics"/>
|
||||
</head>
|
||||
<body>
|
||||
<meta name="description" content="ClickHouse — open-source distributed column-oriented DBMS"/>
|
||||
<meta name="keywords" content="ClickHouse, DBMS, OLAP, relational, analytics, analytical, big data, open-source, SQL, web-analytics"/>
|
||||
|
||||
<style type="text/css">
|
||||
@font-face {
|
||||
font-family: 'Yandex Sans Text Web';
|
||||
src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot);
|
||||
src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'),
|
||||
url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'),
|
||||
url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'),
|
||||
url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'),
|
||||
url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg');
|
||||
font-weight: 700;
|
||||
font-style: normal;
|
||||
font-stretch: normal
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: 'Yandex Sans Text Web';
|
||||
src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot);
|
||||
src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'),
|
||||
url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'),
|
||||
url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'),
|
||||
url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'),
|
||||
url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg');
|
||||
font-weight: 400;
|
||||
font-style: normal;
|
||||
font-stretch: normal
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: 'Yandex Sans Text Web';
|
||||
src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot);
|
||||
src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'),
|
||||
url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'),
|
||||
url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'),
|
||||
url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'),
|
||||
url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg');
|
||||
font-weight: 300;
|
||||
font-style: normal;
|
||||
font-stretch: normal
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: 'Yandex Sans Display Web';
|
||||
src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot);
|
||||
src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'),
|
||||
url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'),
|
||||
url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'),
|
||||
url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'),
|
||||
url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg');
|
||||
font-weight: 400;
|
||||
font-style: normal;
|
||||
font-stretch: normal
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: 'Yandex Sans Display Web';
|
||||
src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot);
|
||||
src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'),
|
||||
url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'),
|
||||
url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'),
|
||||
url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'),
|
||||
url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg');
|
||||
font-weight: 300;
|
||||
font-style: normal;
|
||||
font-stretch: normal
|
||||
}
|
||||
|
||||
body {
|
||||
background: #fff;
|
||||
font: 300 12pt/150% 'Yandex Sans Text Web', Arial, sans-serif;
|
||||
}
|
||||
|
||||
.page {
|
||||
width: 900px;
|
||||
margin: auto;
|
||||
}
|
||||
|
||||
h1
|
||||
{
|
||||
font-family: 'Yandex Sans Display Web', Arial, sans-serif;
|
||||
font-size: 100px;
|
||||
font-weight: normal;
|
||||
margin-top: 100px;
|
||||
margin-bottom: 0;
|
||||
text-align: center;
|
||||
padding-top: 27px;
|
||||
}
|
||||
|
||||
.title_link, .title_link:active, .title_link:visited, .title_link:link, .title_link:hover
|
||||
{
|
||||
text-decoration: none;
|
||||
color: #000;
|
||||
}
|
||||
|
||||
h2
|
||||
{
|
||||
font: normal 50px 'Yandex Sans Display Web', Arial, sans-serif;
|
||||
text-align: center;
|
||||
margin-top: 35px;
|
||||
margin-bottom: 50px;
|
||||
}
|
||||
|
||||
h3
|
||||
{
|
||||
font: normal 24px 'Yandex Sans Display Web', Arial, sans-serif;
|
||||
margin-top: 36px;
|
||||
}
|
||||
|
||||
a:link, a:visited {
|
||||
color: #08f;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover, a:active {
|
||||
color: #f00;
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.footer {
|
||||
text-align: right;
|
||||
margin-top: 40px;
|
||||
border-top: 1px solid #EEE;
|
||||
padding: 10px 0 0;
|
||||
color: #888;
|
||||
font-size: 10pt;
|
||||
}
|
||||
|
||||
pre {
|
||||
font: 13px/18px monospace, "Courier New";
|
||||
display: block;
|
||||
border-left: 5px solid #ffdb4d;
|
||||
padding: 5px 10px;
|
||||
background-color: #FFF8E8;
|
||||
}
|
||||
|
||||
.spoiler
|
||||
{
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.spoiler_body
|
||||
{
|
||||
display: none;
|
||||
}
|
||||
|
||||
.spoiler_title
|
||||
{
|
||||
color: #08f;
|
||||
border-bottom: 1px dotted #08f;
|
||||
}
|
||||
|
||||
.spoiler_title:hover
|
||||
{
|
||||
cursor: pointer;
|
||||
color: #f00;
|
||||
border-bottom: 1px dashed #f00;
|
||||
text-decoration: none;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<script type="text/javascript">
|
||||
function getParams() {
|
||||
@ -34,7 +192,31 @@ try { var yaCounter18343495 = new Ya.Metrika({id:18343495,
|
||||
<noscript><div><img src="https://mc.yandex.ru/watch/18343495" style="position:absolute; left:-9999px;" alt="" /></div></noscript>
|
||||
<!-- /Yandex.Metrika counter -->
|
||||
|
||||
<h1>ClickHouse quickstart guide</h1>
|
||||
<script type="text/javascript" src="https://yandex.st/jquery/1.7.2/jquery.min.js"></script>
|
||||
|
||||
<div class="page">
|
||||
|
||||
<div>
|
||||
<div style="float: left; margin-right: -100%; margin-top: 0; margin-left: 3px;">
|
||||
<a href="/">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="90" height="80" viewBox="0 0 9 8">
|
||||
<style>
|
||||
.o{fill:#fc0}
|
||||
.r{fill:#f00}
|
||||
</style>
|
||||
<path class="r" d="M0,7 h1 v1 h-1 z"/>
|
||||
<path class="o" d="M0,0 h1 v7 h-1 z"/>
|
||||
<path class="o" d="M2,0 h1 v8 h-1 z"/>
|
||||
<path class="o" d="M4,0 h1 v8 h-1 z"/>
|
||||
<path class="o" d="M6,0 h1 v8 h-1 z"/>
|
||||
<path class="o" d="M8,3.25 h1 v1.5 h-1 z"/>
|
||||
</svg>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<h1><a class="title_link" href="/">ClickHouse</a></h1>
|
||||
<h2>Quick start guide</h2>
|
||||
</div>
|
||||
|
||||
<p>Let's get started with a sample dataset from open sources. We will use USA civil flights data from 1987 to 2015. It's hard to call this sample Big Data (it contains 166 million rows, 63 GB of uncompressed data), but this allows us to get to work quickly. The dataset is available for download <a href="https://yadi.sk/d/pOZxpa42sDdgm">here</a>. You may also download it from the original data source <a href="https://github.com/yandex/ClickHouse/raw/master/doc/example_datasets/1_ontime.txt">as described here</a>.</p>
|
||||
|
||||
@ -44,17 +226,18 @@ try { var yaCounter18343495 = new Ya.Metrika({id:18343495,
|
||||
|
||||
<p><b>clickhouse-client</b> package contains <a href="https://clickhouse.yandex/reference_en.html#Command-line%20client">clickhouse-client</a> application — interactive ClickHouse client. clickhouse-server-base contains a clickhouse-server binary file. clickhouse-server-common — contains config files for the clickhouse-server.</p>
|
||||
|
||||
<p>Server config files are located in /etc/clickhouse-server/. Before getting to work please notice the <b>path</b> element in config. <b>Path</b> determines the location for data storage. It's not really handy to directly edit <b>config.xml</b> file considering package updates. Recommended way is to override the config elements in <a href="https://clickhouse.yandex/reference_en.html#Configuration%20files">files of config.d directory</a>.
|
||||
<p>Server config files are located in /etc/clickhouse-server/. Before getting to work, please note the <b>path</b> element in the config. <b>Path</b> determines the location for data storage. It's not really handy to edit the <b>config.xml</b> file directly, considering package updates. The recommended way is to override the config elements in <a href="https://clickhouse.yandex/reference_en.html#Configuration%20files">files in the config.d directory</a>.
|
||||
Also you may want to <a href="https://clickhouse.yandex/reference_en.html#Access%20rights">set up access rights</a> at the start.</p>
|
||||
|
||||
<p><b>clickhouse-server</b> won't be launched automatically after package installation. It won't be automatically restarted after updates either. Start the server with:
|
||||
<pre>sudo service clickhouse-server start</pre>
|
||||
Default location for server logs is /var/log/clickhouse-server/
|
||||
The server is ready to handle client connections once the "Ready for connections" message has been logged.</p>
|
||||
The server is ready to handle client connections once the "Ready for connections" message has been logged.</p>
|
||||
|
||||
<p>Use <b>clickhouse-client</b> to connect to the server.</p>
|
||||
|
||||
<spoiler title="Tips for clickhouse-client">
|
||||
<div class="spoiler"><a class="spoiler_title">Tips for clickhouse-client</a>
|
||||
<div class="spoiler_body">
|
||||
Interactive mode:
|
||||
<pre>
|
||||
clickhouse-client
|
||||
@ -75,10 +258,11 @@ Inser data from file of a specified format:
|
||||
clickhouse-client --query='INSERT INTO table VALUES' < data.txt
|
||||
clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv
|
||||
</pre>
|
||||
</spoiler>
|
||||
</div></div>
|
||||
|
||||
<h3>Create table for sample dataset</h3>
|
||||
<spoiler title="Create table query">
|
||||
<div class="spoiler"><a class="spoiler_title">Create table query</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
$ clickhouse-client --multiline
|
||||
ClickHouse client version 0.0.53720.
|
||||
@ -199,13 +383,14 @@ Connected to ClickHouse server version 0.0.53720.
|
||||
)
|
||||
ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);
|
||||
</pre>
|
||||
</spoiler>
|
||||
</div></div>
|
||||
|
||||
<p>Now we have a table of <a href="https://clickhouse.yandex/reference_en.html#MergeTree">MergeTree type</a>. The MergeTree table type is recommended for usage in production. A table of this kind has a primary key used for incremental sorting of table data. This allows fast execution of queries over ranges of the primary key.</p>
|
||||
|
||||
|
||||
<p><b>Note</b>
|
||||
We store ad network banners impressions logs in ClickHouse. Each table entry looks like:
|
||||
<source>[Advertiser ID, Impression ID, attribute1, attribute2, ...]</pre>
|
||||
<pre>[Advertiser ID, Impression ID, attribute1, attribute2, …]</pre>
|
||||
Let's assume that our aim is to provide a set of reports for each advertiser. A common and frequently demanded query would be to count impressions for a specific Advertiser ID. This means that the table's primary key should start with <code>Advertiser ID</code>. In this case ClickHouse needs to read a smaller amount of data to perform the query for a given <code>Advertiser ID</code>.</p>
|
||||
|
||||
<h3>Load data</h3>
|
||||
@ -225,14 +410,15 @@ When set of possible values is not limited (search query, URL, etc.) just go ahe
|
||||
<p>The second is that the dataset contains redundant fields like Year, Quarter, Month, DayOfMonth, DayOfWeek. In fact, a single FlightDate would be enough. Most likely they have been added to improve performance for other DBMSs whose DateTime handling functions may be inefficient.</p>
|
||||
|
||||
<b>Tip 2</b>
|
||||
<p>ClickHouse <a href="https://clickhouse.yandex/reference_en.html#Functions%20for%20working%20with%20dates%20and%20times">functions for operating with DateTime fields</a> are well-optimized so such redundancy is not required. Anyway much columns is not a reason to worry - ClickHouse is a <a href="https://en.wikipedia.org/wiki/Column-oriented_DBMS">column-oriented DBMS</a>. This allows you to have as much fields as you need. Hundreds of columns in a table is fine for ClickHouse.</p>
|
||||
<p>ClickHouse <a href="https://clickhouse.yandex/reference_en.html#Functions%20for%20working%20with%20dates%20and%20times">functions for operating with DateTime fields</a> are well-optimized, so such redundancy is not required. Anyway, having many columns is not a reason to worry — ClickHouse is a <a href="https://en.wikipedia.org/wiki/Column-oriented_DBMS">column-oriented DBMS</a>. This allows you to have as many fields as you need. Hundreds of columns in a table are fine for ClickHouse.</p>
|
||||
|
||||
<h3>Querying the sample dataset</h3>
|
||||
|
||||
<p>Here are some examples of the queries from our test data.</p>
|
||||
|
||||
<ul>
|
||||
<li><spoiler title="the most popular destinations in 2015;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">the most popular destinations in 2015;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT
|
||||
OriginCityName,
|
||||
@ -248,25 +434,31 @@ SELECT
|
||||
count(*) AS flights,
|
||||
bar(flights, 0, 40000, 40)
|
||||
FROM ontime WHERE Year = 2015 GROUP BY a, b ORDER BY flights DESC LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/d35/78d/b55/d3578db55e304bd7b5eba818abdb53f5.png"/>
|
||||
</spoiler>
|
||||
</pre><img src="https://habrastorage.org/files/d35/78d/b55/d3578db55e304bd7b5eba818abdb53f5.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="the most popular cities of departure;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">the most popular cities of departure;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT OriginCityName, count(*) AS flights FROM ontime GROUP BY OriginCityName ORDER BY flights DESC LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/ef4/141/f34/ef4141f348234773a5349c4bd3e8f804.png"/></spoiler>
|
||||
SELECT OriginCityName, count(*) AS flights
|
||||
FROM ontime GROUP BY OriginCityName ORDER BY flights DESC LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/ef4/141/f34/ef4141f348234773a5349c4bd3e8f804.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="cities of departure which offer maximum variety of destinations;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">cities of departure which offer maximum variety of destinations;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT OriginCityName, uniq(Dest) AS u FROM ontime GROUP BY OriginCityName ORDER BY u DESC LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/240/9f4/9d1/2409f49d11fb4aa1b8b5ff34cf9ca75d.png"/></spoiler>
|
||||
SELECT OriginCityName, uniq(Dest) AS u
|
||||
FROM ontime GROUP BY OriginCityName ORDER BY u DESC LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/240/9f4/9d1/2409f49d11fb4aa1b8b5ff34cf9ca75d.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="flight delay dependence on the day of week;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">flight delay dependence on the day of week;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT DayOfWeek, count() AS c, avg(DepDelay > 60) AS delays FROM ontime GROUP BY DayOfWeek ORDER BY DayOfWeek
|
||||
</pre><img src="https://habrastorage.org/files/885/e50/793/885e507930e34b7c8f788d25e7ca2bcf.png"/></spoiler>
|
||||
SELECT DayOfWeek, count() AS c, avg(DepDelay > 60) AS delays
|
||||
FROM ontime GROUP BY DayOfWeek ORDER BY DayOfWeek
|
||||
</pre><img src="https://habrastorage.org/files/885/e50/793/885e507930e34b7c8f788d25e7ca2bcf.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="cities of departure with most frequent delays for 1 hour or longer;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">cities of departure with most frequent delays for 1 hour or longer;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT OriginCityName, count() AS c, avg(DepDelay > 60) AS delays
|
||||
FROM ontime
|
||||
@ -274,30 +466,34 @@ GROUP BY OriginCityName
|
||||
HAVING c > 100000
|
||||
ORDER BY delays DESC
|
||||
LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/ac2/926/56d/ac292656d03946d0aba35c75783a31f2.png"/></spoiler>
|
||||
</pre><img src="https://habrastorage.org/files/ac2/926/56d/ac292656d03946d0aba35c75783a31f2.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="flights of maximum duration;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">flights of maximum duration;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT OriginCityName, DestCityName, count(*) AS flights, avg(AirTime) AS duration
|
||||
FROM ontime
|
||||
GROUP BY OriginCityName, DestCityName
|
||||
ORDER BY duration DESC
|
||||
LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/7b3/c2e/685/7b3c2e685832439b8c373bf2015131d2.png"/></spoiler>
|
||||
</pre><img src="https://habrastorage.org/files/7b3/c2e/685/7b3c2e685832439b8c373bf2015131d2.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="distribution of arrival time delays split by aircompanies;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">distribution of arrival time delays split by aircompanies;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT Carrier, count() AS c, round(quantileTDigest(0.99)(DepDelay), 2) AS q
|
||||
FROM ontime GROUP BY Carrier ORDER BY q DESC
|
||||
</pre><img src="https://habrastorage.org/files/49c/332/e3d/49c332e3d93146ba8f46beef6b2b02b0.png"/></spoiler>
|
||||
</pre><img src="https://habrastorage.org/files/49c/332/e3d/49c332e3d93146ba8f46beef6b2b02b0.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="aircompanies who stopped flights operation;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">aircompanies who stopped flights operation;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT Carrier, min(Year), max(Year), count()
|
||||
FROM ontime GROUP BY Carrier HAVING max(Year) < 2015 ORDER BY count() DESC
|
||||
</pre><img src="https://habrastorage.org/files/249/56f/1a2/24956f1a2efc48d78212586958aa036c.png"/></spoiler>
|
||||
</pre><img src="https://habrastorage.org/files/249/56f/1a2/24956f1a2efc48d78212586958aa036c.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="most trending destination cities in 2015;">
|
||||
<li><div class="spoiler"><a class="spoiler_title">most trending destination cities in 2015;</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT
|
||||
DestCityName,
|
||||
@ -309,9 +505,10 @@ WHERE Year IN (2014, 2015)
|
||||
GROUP BY DestCityName
|
||||
HAVING c2014 > 10000 AND c2015 > 1000 AND diff > 1
|
||||
ORDER BY diff DESC
|
||||
</pre><img src="https://habrastorage.org/files/f31/32f/4d1/f3132f4d1c0d42eab26d9111afe7771a.png"/></spoiler>
|
||||
</pre><img src="https://habrastorage.org/files/f31/32f/4d1/f3132f4d1c0d42eab26d9111afe7771a.png"/></div></div>
|
||||
</li>
|
||||
<li><spoiler title="destination cities with maximum popularity-season dependency.">
|
||||
<li><div class="spoiler"><a class="spoiler_title">destination cities with maximum popularity-season dependency.</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
SELECT
|
||||
DestCityName,
|
||||
@ -331,7 +528,7 @@ USING DestCityName
|
||||
GROUP BY DestCityName
|
||||
ORDER BY avg_month_diff DESC
|
||||
LIMIT 20
|
||||
</pre><img src="https://habrastorage.org/files/26b/2c7/aae/26b2c7aae21a4c76800cb1c7a33a374d.png"/></spoiler>
|
||||
</pre><img src="https://habrastorage.org/files/26b/2c7/aae/26b2c7aae21a4c76800cb1c7a33a374d.png"/></div></div>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
@ -344,7 +541,8 @@ LIMIT 20
|
||||
|
||||
<p><a href="https://clickhouse.yandex/reference_en.html#Distributed">Distributed-table</a> is actually a kind of "view" to local tables of ClickHouse cluster. SELECT query from a distributed table will be executed using resources of all cluster's shards. You may specify configs for multiple clusters and create multiple Distributed-tables providing views to different clusters.</p>
|
||||
|
||||
<spoiler title="Config for cluster of three shards. Each shard stores data on a single replica">
|
||||
<div class="spoiler"><a class="spoiler_title">Config for cluster of three shards. Each shard stores data on a single replica</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
<remote_servers>
|
||||
<perftest_3shards_1replicas>
|
||||
@ -369,11 +567,12 @@ LIMIT 20
|
||||
</perftest_3shards_1replicas>
|
||||
</remote_servers>
|
||||
</pre>
|
||||
</spoiler>
|
||||
</div></div>
|
||||
Creating a local table:
|
||||
<pre>CREATE TABLE ontime_local (...) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);</pre>
|
||||
Creating a distributed table providing a view into local tables of the cluster:
|
||||
<pre>CREATE TABLE ontime_all AS ontime_local ENGINE = Distributed(perftest_3shards_1replicas, default, ontime_local, rand());</pre>
|
||||
<pre>CREATE TABLE ontime_all AS ontime_local
|
||||
ENGINE = Distributed(perftest_3shards_1replicas, default, ontime_local, rand());</pre>
|
||||
|
||||
<p>You can create a Distributed table on all machines in the cluster. This allows running distributed queries on any machine of the cluster. Besides a distributed table, you can also use the <a href="https://clickhouse.yandex/reference_en.html#remote">*remote* table function</a>.</p>
|
||||
|
||||
@ -385,15 +584,19 @@ Creating a distributed table providing a view into local tables of the cluster:
|
||||
<p>It is worth noting that the approach given above is not suitable for sharding large tables. Please use the <a href="https://clickhouse.yandex/reference_en.html#Resharding">built-in sharding feature</a>.</p>
|
||||
|
||||
<p>As you would expect, heavy queries are executed N times faster when launched on 3 servers instead of one.</p>
|
||||
<spoiler title="See here">
|
||||
<div class="spoiler"><a class="spoiler_title">See here</a>
|
||||
<div class="spoiler_body">
|
||||
<img src="https://habrastorage.org/files/ece/020/129/ece020129fdf4a18a6e75daf2e699cb9.png"/>
|
||||
|
||||
<p>You may have noticed that quantiles calculation are slightly different. This happens due to <a href="https://github.com/tdunning/t-digest/raw/master/docs/t-digest-paper/histo.pdf">t-digest</a> algorithm implementation which is non-deterministic — it depends on the order of data processing.</p></spoiler>
|
||||
<p>You may have noticed that the quantile calculation results are slightly different. This happens because the <a href="https://github.com/tdunning/t-digest/raw/master/docs/t-digest-paper/histo.pdf">t-digest</a> algorithm implementation is non-deterministic — it depends on the order of data processing.</p>
|
||||
</div></div>
|
||||
|
||||
<p>In this case we have used a cluster with 3 shards, each containing a single replica.</p>
|
||||
|
||||
<p>To provide resilience in a production environment, we recommend that each shard contain 2-3 replicas distributed between multiple data centers. Note that ClickHouse supports an unlimited number of replicas.</p>
|
||||
|
||||
<spoiler title="Config for cluster of one shard containing three replicas">
|
||||
<div class="spoiler"><a class="spoiler_title">Config for cluster of one shard containing three replicas</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
<remote_servers>
|
||||
...
|
||||
@ -415,13 +618,14 @@ Creating a distributed table providing a view into local tables of the cluster:
|
||||
</perftest_1shards_3replicas>
|
||||
</remote_servers>
|
||||
</pre>
|
||||
</spoiler>
|
||||
</div></div>
|
||||
|
||||
<p>To enable replication <a href="http://zookeeper.apache.org/">ZooKeeper</a> is required. ClickHouse will take care of data consistency on all replicas and run restore procedure after failure automatically. It's recommended to deploy ZooKeeper cluster to separate servers.</p>
|
||||
|
||||
<p>ZooKeeper is not a requirement - in some simple cases you can duplicate the data by writing it into all the replicas from your application code. This approach is not recommended - in this case ClickHouse is not able to guarantee data consistency on all replicas. This remains the responsibility of your application.</p>
|
||||
<p>ZooKeeper is not a requirement — in some simple cases you can duplicate the data by writing it into all the replicas from your application code. This approach is not recommended — in this case ClickHouse is not able to guarantee data consistency on all replicas. This remains the responsibility of your application.</p>
|
||||
|
||||
<spoiler title="Set ZooKeeper locations in configuration file">
|
||||
<div class="spoiler"><a class="spoiler_title">Set ZooKeeper locations in configuration file</a>
|
||||
<div class="spoiler_body">
|
||||
<pre>
|
||||
<zookeeper-servers>
|
||||
<node>
|
||||
@ -438,16 +642,16 @@ Creating a distributed table providing a view into local tables of the cluster:
|
||||
</node>
|
||||
</zookeeper-servers>
|
||||
</pre>
|
||||
</spoiler>
|
||||
</div></div>
|
||||
|
||||
<p>Also we need to set macros for identifying shard and replica - it will be used on table creation</p>
|
||||
<p>We also need to set macros identifying the shard and replica — they will be used on table creation.</p>
|
||||
<pre>
|
||||
<macros>
|
||||
<shard>01</shard>
|
||||
<replica>01</replica>
|
||||
</macros>
|
||||
</pre>
|
||||
<p>If there are no replicas at the moment on replicated table creation - a new first replica will be instantiated. If there are already live replicas - new replica will clone the data from existing ones. You have an option to create all replicated tables first and that insert data to it. Another option is to create some replicas and add the others after or during data insertion.</p>
|
||||
<p>If there are no replicas at the moment of replicated table creation, a new first replica will be instantiated. If there are already live replicas, the new replica will clone the data from the existing ones. You have the option to create all replicated tables first and then insert data into them. Another option is to create some replicas and add the others after or during data insertion.</p>
|
||||
|
||||
<pre>
|
||||
CREATE TABLE ontime_replica (...)
|
||||
@ -461,7 +665,19 @@ ENGINE = ReplicatedMergeTree(
|
||||
<p>Here we use <a href="https://clickhouse.yandex/reference_en.html#ReplicatedMergeTree">ReplicatedMergeTree</a> table type. In parameters we specify ZooKeeper path containing shard and replica identifiers.</p>
|
||||
|
||||
<pre>INSERT INTO ontime_replica SELECT * FROM ontime;</pre>
|
||||
<p>Replication operates in multi-master mode. Data can be loaded into any replica - it will be synced with other instances automatically. Replication is asynchronous so at a given moment of time not all replicas may contain recently inserted data. To allow data insertion at least one replica should be up. Others will sync up data and repair consistency once they will become active again. Please notice that such scheme allows for the possibility of just appended data loss.</p>
|
||||
<p>Replication operates in multi-master mode. Data can be loaded into any replica — it will be synced with other instances automatically. Replication is asynchronous, so at a given moment not all replicas may contain recently inserted data. To allow data insertion, at least one replica should be up. Others will sync up data and repair consistency once they become active again. Please note that such a scheme allows for the possibility of losing just-appended data.</p>
|
||||
|
||||
<p class="footer">© 2016 YANDEX LLC</p>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript">
|
||||
|
||||
$('.spoiler_title').click(function() {
|
||||
console.log($(this).parent());
|
||||
$(this).next('.spoiler_body').toggle(100);
|
||||
});
|
||||
|
||||
</script>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
|
Loading…
Reference in New Issue
Block a user