Merge pull request #38907 from evillique/compression_level

Add compression level for data export
This commit is contained in:
Alexey Milovidov 2022-07-07 22:27:21 +03:00 committed by GitHub
commit 74d02aeca7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 109 additions and 10 deletions

View File

@ -25,7 +25,7 @@ SELECT [DISTINCT [ON (column1, column2, ...)]] expr_list
[LIMIT [n, ]m] [WITH TIES] [LIMIT [n, ]m] [WITH TIES]
[SETTINGS ...] [SETTINGS ...]
[UNION ...] [UNION ...]
[INTO OUTFILE filename [COMPRESSION type] ] [INTO OUTFILE filename [COMPRESSION type [LEVEL level]] ]
[FORMAT format] [FORMAT format]
``` ```

View File

@ -6,16 +6,18 @@ sidebar_label: INTO OUTFILE
`INTO OUTFILE` clause redirects the result of a `SELECT` query to a file on the **client** side. `INTO OUTFILE` clause redirects the result of a `SELECT` query to a file on the **client** side.
Compressed files are supported. Compression type is detected by the extension of the file name (mode `'auto'` is used by default). Or it can be explicitly specified in a `COMPRESSION` clause. Compressed files are supported. Compression type is detected by the extension of the file name (mode `'auto'` is used by default). Or it can be explicitly specified in a `COMPRESSION` clause. The compression level for a certain compression type can be specified in a `LEVEL` clause.
**Syntax** **Syntax**
```sql ```sql
SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type] SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
``` ```
`file_name` and `type` are string literals. Supported compression types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`. `file_name` and `type` are string literals. Supported compression types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`.
`level` is a numeric literal. Positive integers in the following ranges are supported: `1-12` for the `lz4` type, `1-22` for the `zstd` type, and `1-9` for other compression types.
## Implementation Details ## Implementation Details
- This functionality is available in the [command-line client](../../../interfaces/cli.md) and [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Thus a query sent via [HTTP interface](../../../interfaces/http.md) will fail. - This functionality is available in the [command-line client](../../../interfaces/cli.md) and [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Thus a query sent via [HTTP interface](../../../interfaces/http.md) will fail.

View File

@ -24,7 +24,7 @@ SELECT [DISTINCT [ON (column1, column2, ...)]] expr_list
[LIMIT [n, ]m] [WITH TIES] [LIMIT [n, ]m] [WITH TIES]
[SETTINGS ...] [SETTINGS ...]
[UNION ALL ...] [UNION ALL ...]
[INTO OUTFILE filename [COMPRESSION type] ] [INTO OUTFILE filename [COMPRESSION type [LEVEL level]] ]
[FORMAT format] [FORMAT format]
``` ```

View File

@ -6,16 +6,18 @@ sidebar_label: INTO OUTFILE
Секция `INTO OUTFILE` перенаправляет результат запроса `SELECT` в файл на стороне **клиента**. Секция `INTO OUTFILE` перенаправляет результат запроса `SELECT` в файл на стороне **клиента**.
Поддерживаются сжатые файлы. Формат сжатия определяется по расширению файла (по умолчанию используется режим `'auto'`), либо он может быть задан явно в секции `COMPRESSION`. Поддерживаются сжатые файлы. Формат сжатия определяется по расширению файла (по умолчанию используется режим `'auto'`), либо он может быть задан явно в секции `COMPRESSION`. Уровень сжатия для конкретного алгоритма может быть задан в секции `LEVEL`.
**Синтаксис** **Синтаксис**
```sql ```sql
SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type] SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
``` ```
`file_name` и `type` задаются в виде строковых литералов. Поддерживаются форматы сжатия: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`. `file_name` и `type` задаются в виде строковых литералов. Поддерживаются форматы сжатия: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`.
`level` задается в виде числового литерала. Поддерживаются положительные значения в следующих диапазонах: `1-12` для формата `lz4`, `1-22` для формата `zstd` и `1-9` для остальных форматов.
## Детали реализации {#implementation-details} ## Детали реализации {#implementation-details}
- Эта функция доступна только в следующих интерфейсах: [клиент командной строки](../../../interfaces/cli.md) и [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Таким образом, запрос, отправленный через [HTTP интерфейс](../../../interfaces/http.md) вернет ошибку. - Эта функция доступна только в следующих интерфейсах: [клиент командной строки](../../../interfaces/cli.md) и [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Таким образом, запрос, отправленный через [HTTP интерфейс](../../../interfaces/http.md) вернет ошибку.

View File

@ -524,17 +524,35 @@ try
const auto & out_file_node = query_with_output->out_file->as<ASTLiteral &>(); const auto & out_file_node = query_with_output->out_file->as<ASTLiteral &>();
out_file = out_file_node.value.safeGet<std::string>(); out_file = out_file_node.value.safeGet<std::string>();
std::string compression_method; std::string compression_method_string;
if (query_with_output->compression) if (query_with_output->compression)
{ {
const auto & compression_method_node = query_with_output->compression->as<ASTLiteral &>(); const auto & compression_method_node = query_with_output->compression->as<ASTLiteral &>();
compression_method = compression_method_node.value.safeGet<std::string>(); compression_method_string = compression_method_node.value.safeGet<std::string>();
}
CompressionMethod compression_method = chooseCompressionMethod(out_file, compression_method_string);
UInt64 compression_level = 3;
if (query_with_output->compression_level)
{
const auto & compression_level_node = query_with_output->compression_level->as<ASTLiteral &>();
bool res = compression_level_node.value.tryGet<UInt64>(compression_level);
auto range = getCompressionLevelRange(compression_method);
if (!res || compression_level < range.first || compression_level > range.second)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Invalid compression level, must be positive integer in range {}-{}",
range.first,
range.second);
} }
out_file_buf = wrapWriteBufferWithCompressionMethod( out_file_buf = wrapWriteBufferWithCompressionMethod(
std::make_unique<WriteBufferFromFile>(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), std::make_unique<WriteBufferFromFile>(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT),
chooseCompressionMethod(out_file, compression_method), compression_method,
/* compression level = */ 3 compression_level
); );
// We are writing to file, so default format is the same as in non-interactive mode. // We are writing to file, so default format is the same as in non-interactive mode.

View File

@ -98,6 +98,19 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s
ErrorCodes::NOT_IMPLEMENTED); ErrorCodes::NOT_IMPLEMENTED);
} }
/// Returns the {lowest, highest} compression level accepted for `method`.
/// Both ends of the pair are valid levels themselves.
std::pair<uint64_t, uint64_t> getCompressionLevelRange(const CompressionMethod & method)
{
    /// The lowest level is 1 for every method; only the upper bound differs.
    /// Methods without a dedicated branch below use the 1-9 range.
    uint64_t highest_level = 9;

    if (method == CompressionMethod::Zstd)
        highest_level = 22;
    else if (method == CompressionMethod::Lz4)
        highest_level = 12;

    return {1, highest_level};
}
static std::unique_ptr<CompressedReadBufferWrapper> createCompressedWrapper( static std::unique_ptr<CompressedReadBufferWrapper> createCompressedWrapper(
std::unique_ptr<ReadBuffer> nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max) std::unique_ptr<ReadBuffer> nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max)
{ {

View File

@ -46,6 +46,9 @@ std::string toContentEncodingName(CompressionMethod method);
*/ */
CompressionMethod chooseCompressionMethod(const std::string & path, const std::string & hint); CompressionMethod chooseCompressionMethod(const std::string & path, const std::string & hint);
/// Get a range of the valid compression levels for the compression method.
std::pair<uint64_t, uint64_t> getCompressionLevelRange(const CompressionMethod & method);
std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod( std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod(
std::unique_ptr<ReadBuffer> nested, std::unique_ptr<ReadBuffer> nested,
CompressionMethod method, CompressionMethod method,

View File

@ -18,6 +18,7 @@ public:
ASTPtr format; ASTPtr format;
ASTPtr settings_ast; ASTPtr settings_ast;
ASTPtr compression; ASTPtr compression;
ASTPtr compression_level;
void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const final; void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const final;

View File

@ -96,6 +96,14 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec
ParserStringLiteral compression; ParserStringLiteral compression;
if (!compression.parse(pos, query_with_output.compression, expected)) if (!compression.parse(pos, query_with_output.compression, expected))
return false; return false;
ParserKeyword s_compression_level("LEVEL");
if (s_compression_level.ignore(pos, expected))
{
ParserNumber compression_level;
if (!compression_level.parse(pos, query_with_output.compression_level, expected))
return false;
}
} }
query_with_output.children.push_back(query_with_output.out_file); query_with_output.children.push_back(query_with_output.out_file);

View File

@ -0,0 +1,18 @@
6000 5999 13
6000 5999 13
Ok
6000 5999 13
6000 5999 13
Ok
6000 5999 13
6000 5999 13
Ok
6000 5999 13
6000 5999 13
Ok
6000 5999 13
6000 5999 13
Ok
6000 5999 13
6000 5999 13
Ok

View File

@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Tags: no-fasttest, no-parallel
# Tag no-fasttest: depends on brotli and bzip2

# For each tested compression method, writes the same result set twice via
# `INTO OUTFILE ... COMPRESSION ... LEVEL n` (levels 1 and 9), reads both files
# back, and checks that the two levels produced files of different sizes.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Query a file that does not exist and take the 9th whitespace-separated field of
# the resulting Exception line, with the "/nonexist.txt" suffix stripped.
# NOTE(review): presumably that field is the server's user_files directory - this
# relies on the exact wording of the error message.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

# Per-database scratch directory, recreated from scratch on every run.
WORKING_FOLDER_02353="${USER_FILES_PATH}/${CLICKHOUSE_DATABASE}"
rm -rf "${WORKING_FOLDER_02353}"
mkdir "${WORKING_FOLDER_02353}"

for m in gz br xz zst lz4 bz2
do
    # Export the same data at the lowest level and at level 9.
    ${CLICKHOUSE_CLIENT} --query "SELECT number, 'Hello, world!' FROM numbers(6000) INTO OUTFILE '${WORKING_FOLDER_02353}/${m}_1.${m}' COMPRESSION '${m}' LEVEL 1"
    ${CLICKHOUSE_CLIENT} --query "SELECT number, 'Hello, world!' FROM numbers(6000) INTO OUTFILE '${WORKING_FOLDER_02353}/${m}_9.${m}' COMPRESSION '${m}' LEVEL 9"

    # Both files must read back to the same data.
    ${CLICKHOUSE_CLIENT} --query "SELECT count(), max(x), avg(length(s)) FROM file('${WORKING_FOLDER_02353}/${m}_1.${m}', 'TabSeparated', 'x UInt32, s String')"
    ${CLICKHOUSE_CLIENT} --query "SELECT count(), max(x), avg(length(s)) FROM file('${WORKING_FOLDER_02353}/${m}_9.${m}', 'TabSeparated', 'x UInt32, s String')"

    # Paths are quoted so that a working directory containing spaces or glob
    # characters is not split or expanded before reaching `du`.
    level_1=$(du -b "${WORKING_FOLDER_02353}/${m}_1.${m}" | awk '{print $1}')
    level_9=$(du -b "${WORKING_FOLDER_02353}/${m}_9.${m}" | awk '{print $1}')

    # Differing sizes are taken as evidence that the LEVEL clause was applied.
    if [ "$level_1" != "$level_9" ]; then
        echo "Ok"
    else
        echo "Failed"
    fi
done

rm -rf "${WORKING_FOLDER_02353}"