Merge branch 'master' into replace-memcpy

Alexey Milovidov 2021-03-10 10:03:42 +03:00
commit af1a08e80c
8 changed files with 204 additions and 2 deletions


@ -102,7 +102,7 @@ USING (equi_column1, ... equi_columnN, asof_column)
- When using a regular `JOIN`, the query is sent to remote servers. On each of them, subqueries are run to build the "right" table, and the join is performed with that table. In other words, the "right" table is built on each server separately.
- When using `GLOBAL ... JOIN`, the requesting server first runs a subquery to compute the right table. This temporary table is sent to every remote server, and the queries on them are run using this temporary data.
Be careful when using `GLOBAL`. For more information, see the [Distributed subqueries](../../../sql-reference/operators/in.md#select-distributed-subqueries) section.
## Usage recommendations {#usage-recommendations}


@ -7,10 +7,12 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromOStream.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ParserQuery.h>
#include <Parsers/parseQuery.h>
#include <Parsers/formatAST.h>
#include <Parsers/obfuscateQueries.h>
#include <Parsers/parseQuery.h>
#include <Common/ErrorCodes.h>
#include <Common/TerminalSize.h>
#include <Interpreters/Context.h>
@ -28,6 +30,14 @@
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"
namespace DB
{
namespace ErrorCodes
{
extern const int INVALID_FORMAT_INSERT_QUERY_WITH_DATA;
}
}
int mainEntryClickHouseFormat(int argc, char ** argv)
{
using namespace DB;
@ -128,6 +138,14 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
do
{
ASTPtr res = parseQueryAndMovePosition(parser, pos, end, "query", multiple, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
/// An INSERT query with inline data (INSERT INTO ... VALUES ...) cannot be formatted, because the data would be lost;
/// throw an exception early to make the error message more readable.
if (const auto * insert_query = res->as<ASTInsertQuery>(); insert_query && insert_query->data)
{
throw Exception(
"Can't format ASTInsertQuery with data, since data will be lost",
DB::ErrorCodes::INVALID_FORMAT_INSERT_QUERY_WITH_DATA);
}
if (!quiet)
{
WriteBufferFromOStream res_buf(std::cout, 4096);
@ -137,6 +155,26 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
std::cout << "\n;\n";
std::cout << std::endl;
}
do
{
/// Skip whitespace to avoid throwing an exception after the last query.
while (pos != end && std::isspace(*pos))
++pos;
/// Skip a comment after the last query, also to avoid throwing an exception.
if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-')
{
pos += 2;
/// skip until the end of the line
while (pos != end && *pos != '\n')
++pos;
}
/// Otherwise there is another query to parse.
else
break;
} while (pos != end);
} while (multiple && pos != end);
}
}


@ -547,6 +547,7 @@
M(1001, STD_EXCEPTION) \
M(1002, UNKNOWN_EXCEPTION) \
M(1003, INVALID_SHARD_ID) \
M(1004, INVALID_FORMAT_INSERT_QUERY_WITH_DATA) \
/* See END */


@ -0,0 +1,25 @@
SELECT 1
;
SELECT 1
UNION ALL
(
SELECT 1
UNION DISTINCT
SELECT 1
)
;
SELECT 1
;
SELECT 1
UNION ALL
(
SELECT 1
UNION DISTINCT
SELECT 1
)
;
OK


@ -0,0 +1,11 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
echo "select 1; select 1 union all (select 1 union distinct select 1); " | $CLICKHOUSE_FORMAT -n;
echo "select 1; select 1 union all (select 1 union distinct select 1); -- comment " | $CLICKHOUSE_FORMAT -n;
echo "insert into t values (1); " | $CLICKHOUSE_FORMAT -n 2>&1 \ | grep -F -q "Code: 1004" && echo 'OK' || echo 'FAIL'


@ -16,3 +16,9 @@ virtualenv build
# Open the web browser and go to http://localhost:8080/
```
# How to quickly test the blog
```
./build.py --skip-multi-page --skip-single-page --skip-amp --skip-pdf --skip-git-log --skip-docs --skip-test-templates --livereload 8080
```


@ -50,5 +50,57 @@
[0.024, 0.015, 0.021],
[0.007, 0.013, 0.006]
]
},
{
"system": "AMD Ryzen 9",
"system_full": "AMD Ryzen 9 3950X 16-Core Processor, 64 GiB RAM, Samsung evo 970 plus 1TB",
"time": "2021-03-08 00:00:00",
"kind": "desktop",
"result":
[
[0.002, 0.002, 0.002],
[0.013, 0.011, 0.010],
[0.030, 0.023, 0.023],
[0.071, 0.033, 0.031],
[0.090, 0.068, 0.066],
[0.165, 0.137, 0.137],
[0.015, 0.014, 0.015],
[0.013, 0.012, 0.012],
[0.317, 0.268, 0.261],
[0.337, 0.303, 0.301],
[0.108, 0.089, 0.091],
[0.127, 0.110, 0.114],
[0.714, 0.690, 0.643],
[0.888, 0.835, 0.809],
[0.748, 0.727, 0.704],
[0.666, 0.653, 0.652],
[1.868, 1.821, 1.826],
[1.007, 0.958, 0.957],
[4.466, 4.371, 4.377],
[0.074, 0.027, 0.027],
[0.748, 0.326, 0.307],
[0.844, 0.427, 0.408],
[2.040, 1.545, 1.552],
[1.392, 0.609, 0.560],
[0.237, 0.155, 0.142],
[0.140, 0.112, 0.114],
[0.233, 0.151, 0.146],
[0.790, 0.567, 0.545],
[0.981, 0.751, 0.752],
[3.532, 3.522, 3.516],
[0.505, 0.478, 0.456],
[1.078, 0.959, 0.959],
[5.653, 5.600, 5.570],
[3.572, 3.399, 3.405],
[3.619, 3.445, 3.429],
[1.176, 1.174, 1.165],
[0.140, 0.127, 0.124],
[0.054, 0.052, 0.052],
[0.052, 0.049, 0.048],
[0.275, 0.265, 0.265],
[0.025, 0.024, 0.025],
[0.020, 0.021, 0.019],
[0.006, 0.005, 0.005]
]
}
]


@ -0,0 +1,69 @@
---
title: 'A journey to io_uring, AIO and modern storage devices'
image: 'https://blog-images.clickhouse.tech/en/2021/reading-from-external-memory/all-single-read.png'
date: '2021-03-09'
author: '[Ruslan Savchenko](https://github.com/savrus)'
tags: ['Linux', 'benchmark', 'experiment']
---
*While main memory is considered rather cheap by some systems designers, it is not always possible to store everything in main memory. When data is stored in external memory, one has to think carefully about how to access it. There are several kinds of storage devices and more than one system call to read from them. We performed experiments to find out how different Linux system calls perform for the available devices. In total, HDD, SATA SSD, NVMe SSD, and Intel Optane were accessed via single-threaded and multi-threaded pread, Linux aio, and the new io_uring interface. The full report is available in PDF format:* [link](https://arxiv.org/pdf/2102.11198). *We give one section from the report as an example.*
# Single Random Read
External memory devices are block devices, which means that data transfer between a device and a host is done in blocks rather than single bytes. Typically, blocks of 512 bytes or 4 kilobytes are used. These block sizes were chosen by manufacturers a long time ago and may not be the best choice for modern devices. By requesting a larger amount of contiguous data we can emulate a larger block size. Let's find out how modern devices perform with larger blocks.
Our goal is to pick the best block size for a random read. An application (or filesystem) can pick any block size and access data with respect to this block size. We vary the block size from 4 kilobytes up to 32 megabytes. For each block size we make a number of random reads. Among these reads we calculate the average, minimum and maximum latency as well as the 99.0 and 99.9 percentiles. We use the pread(2) system call in this experiment. We believe that lseek(2) followed by read(2) should have the same performance, since the observed storage access time is far longer than a system call.
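As a rough illustration of how such a measurement can be done, here is a minimal sketch (not the benchmark code from the report): the file path, read count and alignment constants are placeholder assumptions, and the target should be a large file placed on the device under test.
```
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <vector>

int main(int argc, char ** argv)
{
    /// Placeholders: a large file on the device under test and the block size to probe.
    const char * path = argc > 1 ? argv[1] : "testfile.bin";
    const size_t block_size = argc > 2 ? std::strtoull(argv[2], nullptr, 10) : 256 * 1024;
    const size_t num_reads = 1000;

    /// O_DIRECT bypasses the page cache, so we measure the device rather than RAM.
    int fd = open(path, O_RDONLY | O_DIRECT);
    if (fd < 0)
    {
        perror("open");
        return 1;
    }

    struct stat st {};
    if (fstat(fd, &st) < 0 || st.st_size < static_cast<off_t>(block_size))
        return 1;
    const off_t blocks_in_file = st.st_size / block_size;

    /// O_DIRECT requires an aligned buffer.
    void * buf = nullptr;
    if (posix_memalign(&buf, 4096, block_size) != 0)
        return 1;

    std::mt19937_64 rng(0);
    std::vector<double> latencies_us;
    latencies_us.reserve(num_reads);

    for (size_t i = 0; i < num_reads; ++i)
    {
        off_t offset = static_cast<off_t>(rng() % blocks_in_file) * block_size;
        auto start = std::chrono::steady_clock::now();
        if (pread(fd, buf, block_size, offset) != static_cast<ssize_t>(block_size))
        {
            perror("pread");
            return 1;
        }
        auto stop = std::chrono::steady_clock::now();
        latencies_us.push_back(std::chrono::duration<double, std::micro>(stop - start).count());
    }

    std::sort(latencies_us.begin(), latencies_us.end());
    double sum = 0;
    for (double l : latencies_us)
        sum += l;
    std::printf("block %zu KiB: avg %.1f us, min %.1f us, p99 %.1f us, max %.1f us\n",
                block_size / 1024, sum / num_reads, latencies_us.front(),
                latencies_us[num_reads * 99 / 100], latencies_us.back());

    free(buf);
    close(fd);
    return 0;
}
```
Running this for each block size of interest gives the per-block-size latency statistics discussed below.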
## Hard Disk Drive
This figure shows the results for the HDD.
![HDD single read latency](https://blog-images.clickhouse.tech/en/2021/reading-from-external-memory/hdd-single-read.png)
The latency is almost the same for all block sizes smaller than 256 kilobytes. This happens because the seek time is much larger than the data transfer time. The seek time includes positioning the arm to find the right track and waiting for the platter rotation to bring the data under the head. A simple consequence is that for an HDD random read one should use blocks of at least 256 kilobytes. Even if an application uses smaller blocks, the drive access time would be the same. However, one could still decide to use smaller blocks for better cache utilization: if the amount of data per request is small and is expected to fit in cache, then storing a large block along with the requested data would actually make the cache capacity smaller in terms of useful data.
The 256 kilobyte block read takes 12 milliseconds on average. We experienced variations from 4 milliseconds up to 25 milliseconds. This is a really huge amount of time for a computer. For example, the typical process scheduling quantum is just a few milliseconds. An operating system can (and in fact does) execute other processes while our process waits for the data to arrive from the hard drive.
## SATA SSD
The figure below shows SATA SSD read latencies.
![SATA SSD single read latency](https://blog-images.clickhouse.tech/en/2021/reading-from-external-memory/ssd-single-read.png)
Note that the time at the lower part of the figure is in microseconds (we use the standard shortenings ms for milliseconds and us for microseconds). Reading a block of 4 kilobytes takes 140 microseconds on average, and the time grows linearly as the block size increases. Compared to the HDD, reading a 4 kilobyte block from the SSD is 80 times faster. For a 256 kilobyte block the SSD is ten times faster than the HDD. When the block size is large enough (starting from 4 megabytes) the SSD is only two times faster than the HDD.
## NVMe SSD
The next figure shows results for NVMe SSD.
![NVMe SSD single read latency](https://blog-images.clickhouse.tech/en/2021/reading-from-external-memory/nvme-single-read.png)
The latency is better than that of the SATA SSD. For a 4 kilobyte block size the average time improved only a little, but the 99th percentile is two times lower. It takes less than a millisecond to read a megabyte block from the NVMe SSD. For the SATA SSD it took 3 milliseconds. As we see, the upgrade from SATA SSD to NVMe SSD is not as dramatic as the upgrade from HDD to SATA SSD. This is not surprising, since both SATA and NVMe SSDs are based on the same technology. Only the interfaces differ.
## Intel Optane
This figure shows results for Intel Optane SSD.
![Intel Optane single read latency](https://blog-images.clickhouse.tech/en/2021/reading-from-external-memory/optane-single-read.png)
The minimal latency is 12 microseconds, which is 10 times lower than that of the NVMe SSD. The average latency is 1000 times lower than that of the HDD. There is quite a large variation in small block read latency: even though the average time is quite low and close to the minimal latency, the maximum latency and even the 99th percentile are significantly worse. If somebody looks at these results and wishes to create an Intel Optane-based service with 12 microsecond read latency, they would have to install a larger number of Intel Optane drives or consider providing more realistic timings.
When the latency is this small, the overheads of context switching and interrupt handling become noticeable. One can use polling mode to gain some improvement. In this mode the Linux kernel monitors the completion queue instead of switching to some other job and relying on a hardware interrupt with an interrupt handler to notify about completion. Clearly, it only makes sense to use polling mode when the hardware response is expected to arrive fast enough.
![Intel Optane single read latency in polling mode](https://blog-images.clickhouse.tech/en/2021/reading-from-external-memory/optane-single-hipri-read.png)
The figure above shows the results for reading from Intel Optane in polling mode. Polling mode is used when an application calls the preadv2(2) system call with the RWF\_HIGHPRI flag. Compared to the usual pread(2), polling mode lowers the maximum latency by a factor of two for block sizes up to 256 kilobytes.
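As a minimal sketch of what such a call looks like (the file name is a placeholder; polling also requires O_DIRECT and a kernel and device configured for I/O polling), a single read in polling mode can be issued as follows:
```
#include <fcntl.h>
#include <sys/uio.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

int main()
{
    /// Placeholder path: should point to a file on an NVMe or Optane device.
    int fd = open("testfile.bin", O_RDONLY | O_DIRECT);
    if (fd < 0)
    {
        perror("open");
        return 1;
    }

    const size_t block_size = 8 * 1024;
    void * buf = nullptr;
    if (posix_memalign(&buf, 4096, block_size) != 0)
        return 1;

    struct iovec iov;
    iov.iov_base = buf;
    iov.iov_len = block_size;

    /// RWF_HIGHPRI asks the kernel to poll the completion queue instead of
    /// sleeping until a hardware interrupt arrives.
    ssize_t bytes_read = preadv2(fd, &iov, 1, /* offset = */ 0, RWF_HIGHPRI);
    if (bytes_read < 0)
    {
        perror("preadv2");
        return 1;
    }

    std::printf("read %zd bytes in polling mode\n", bytes_read);

    free(buf);
    close(fd);
    return 0;
}
```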
## Summary
To summarize our results, the next figure shows single read latencies for all four storage types on a single chart.
![Single read latency for Optane, SSD and HDD](https://blog-images.clickhouse.tech/en/2021/reading-from-external-memory/all-single-read.png)
Starting from 4 megabytes the latency is easily predicted by linear extrapolation, so we don't show larger blocks here. To show everything on a single figure we are forced to use quite an overloaded legend. We use the vertical axis to show the latency and iterate over the block sizes horizontally. For each block size we show four bars, from left to right: for Intel Optane, NVMe SSD, SATA SSD, and HDD. The storage type is represented by the hatching and the latency by the color.
We see that solid state device latencies are far better than those of the HDD. For a single read the leader is Intel Optane; however, as we shall see later, it has its own drawbacks compared to NVMe SSD. NVMe SSD and SATA SSD look quite close to each other when the block size is small. Our observations show that the best block size for a random read is 256 kilobytes for HDD, 4 kilobytes for NVMe and SATA SSD, and 8 kilobytes for Intel Optane.
So, how about testing modern IO interfaces in Linux? Continue reading the [full article](https://arxiv.org/pdf/2102.11198).
2021-03-09 [Ruslan Savchenko](https://github.com/savrus)