2017-09-20 14:12:12 +00:00
|
|
|
#include <iostream>
|
2018-12-26 16:18:21 +00:00
|
|
|
#include <optional>
|
2017-09-20 14:12:12 +00:00
|
|
|
#include <boost/program_options.hpp>
|
2019-04-01 16:28:20 +00:00
|
|
|
#include <boost/algorithm/string/join.hpp>
|
2017-09-20 14:12:12 +00:00
|
|
|
|
|
|
|
#include <Common/Exception.h>
|
|
|
|
#include <IO/WriteBufferFromFileDescriptor.h>
|
|
|
|
#include <IO/ReadBufferFromFileDescriptor.h>
|
2020-12-29 19:12:29 +00:00
|
|
|
#include <IO/WriteBufferFromFile.h>
|
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2018-12-28 18:15:26 +00:00
|
|
|
#include <Compression/CompressedWriteBuffer.h>
|
2024-10-20 01:11:16 +00:00
|
|
|
#include <Compression/ParallelCompressedWriteBuffer.h>
|
2018-12-28 18:15:26 +00:00
|
|
|
#include <Compression/CompressedReadBuffer.h>
|
2020-12-29 20:06:14 +00:00
|
|
|
#include <Compression/CompressedReadBufferFromFile.h>
|
2024-10-30 01:41:18 +00:00
|
|
|
#include <Compression/getCompressionCodecForFile.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
2017-09-20 14:12:12 +00:00
|
|
|
#include <IO/WriteHelpers.h>
|
|
|
|
#include <IO/copyData.h>
|
2019-04-01 16:28:20 +00:00
|
|
|
#include <Parsers/parseQuery.h>
|
2024-10-30 01:41:18 +00:00
|
|
|
#include <Parsers/queryToString.h>
|
2019-04-01 16:28:20 +00:00
|
|
|
#include <Parsers/ExpressionElementParsers.h>
|
2018-12-21 12:17:30 +00:00
|
|
|
#include <Compression/CompressionFactory.h>
|
2019-08-23 15:47:27 +00:00
|
|
|
#include <Common/TerminalSize.h>
|
2024-10-20 01:11:16 +00:00
|
|
|
#include <Common/ThreadPool.h>
|
|
|
|
#include <Common/CurrentMetrics.h>
|
2020-04-15 20:28:05 +00:00
|
|
|
#include <Core/Defines.h>
|
2019-08-23 13:19:12 +00:00
|
|
|
|
2017-09-20 14:12:12 +00:00
|
|
|
|
|
|
|
/// Error codes thrown by this tool; the constants themselves are defined elsewhere.
namespace DB::ErrorCodes
{
    extern const int TOO_LARGE_SIZE_COMPRESSED;
    extern const int BAD_ARGUMENTS;
}
|
|
|
|
|
2024-10-20 01:11:16 +00:00
|
|
|
/// Metric identifiers declared here (as extern) for use in this file.
/// mainEntryClickHouseCompressor passes all three to the ThreadPool it creates
/// when compressing with more than one thread.
namespace CurrentMetrics
|
|
|
|
{
|
|
|
|
extern const Metric LocalThread;
|
|
|
|
extern const Metric LocalThreadActive;
|
|
|
|
extern const Metric LocalThreadScheduled;
|
|
|
|
}
|
|
|
|
|
2017-09-20 14:12:12 +00:00
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
2024-10-29 13:36:43 +00:00
|
|
|
/// Outputs method, sizes of uncompressed and compressed blocks for compressed file.
|
2017-09-20 14:12:12 +00:00
|
|
|
void checkAndWriteHeader(DB::ReadBuffer & in, DB::WriteBuffer & out)
|
|
|
|
{
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
2024-10-30 01:41:18 +00:00
|
|
|
UInt32 size_compressed;
|
|
|
|
UInt32 size_decompressed;
|
|
|
|
auto codec = DB::getCompressionCodecForFile(in, size_compressed, size_decompressed, true /* skip_to_next_block */);
|
2017-09-20 14:12:12 +00:00
|
|
|
|
|
|
|
if (size_compressed > DBMS_MAX_COMPRESSED_SIZE)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw DB::Exception(DB::ErrorCodes::TOO_LARGE_SIZE_COMPRESSED, "Too large size_compressed. Most likely corrupted data.");
|
2017-09-20 14:12:12 +00:00
|
|
|
|
2024-10-30 01:41:18 +00:00
|
|
|
DB::writeText(queryToString(codec->getFullCodecDesc()), out);
|
2024-10-29 13:36:43 +00:00
|
|
|
DB::writeChar('\t', out);
|
2017-09-20 14:12:12 +00:00
|
|
|
DB::writeText(size_decompressed, out);
|
|
|
|
DB::writeChar('\t', out);
|
|
|
|
DB::writeText(size_compressed, out);
|
|
|
|
DB::writeChar('\n', out);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
int mainEntryClickHouseCompressor(int argc, char ** argv)
|
|
|
|
{
|
2020-05-04 00:11:49 +00:00
|
|
|
using namespace DB;
|
2020-12-29 20:07:35 +00:00
|
|
|
namespace po = boost::program_options;
|
2020-05-04 00:11:49 +00:00
|
|
|
|
2023-03-06 16:49:28 +00:00
|
|
|
bool print_stacktrace = false;
|
2017-09-20 14:12:12 +00:00
|
|
|
try
|
|
|
|
{
|
2022-04-14 14:49:49 +00:00
|
|
|
po::options_description desc = createOptionsDescription("Allowed options", getTerminalWidth());
|
|
|
|
desc.add_options()
|
|
|
|
("help,h", "produce help message")
|
|
|
|
("input", po::value<std::string>()->value_name("INPUT"), "input file")
|
|
|
|
("output", po::value<std::string>()->value_name("OUTPUT"), "output file")
|
|
|
|
("decompress,d", "decompress")
|
|
|
|
("offset-in-compressed-file", po::value<size_t>()->default_value(0ULL), "offset to the compressed block (i.e. physical file offset)")
|
|
|
|
("offset-in-decompressed-block", po::value<size_t>()->default_value(0ULL), "offset to the decompressed block (i.e. virtual offset)")
|
2024-10-20 01:11:16 +00:00
|
|
|
("block-size,b", po::value<size_t>()->default_value(DBMS_DEFAULT_BUFFER_SIZE), "compress in blocks of specified size")
|
2022-04-14 14:49:49 +00:00
|
|
|
("hc", "use LZ4HC instead of LZ4")
|
|
|
|
("zstd", "use ZSTD instead of LZ4")
|
|
|
|
("codec", po::value<std::vector<std::string>>()->multitoken(), "use codecs combination instead of LZ4")
|
|
|
|
("level", po::value<int>(), "compression level for codecs specified via flags")
|
2024-10-20 01:11:16 +00:00
|
|
|
("threads", po::value<size_t>()->default_value(1), "number of threads for parallel compression")
|
2022-04-14 14:49:49 +00:00
|
|
|
("none", "use no compression instead of LZ4")
|
|
|
|
("stat", "print block statistics of compressed data")
|
2023-03-06 16:49:28 +00:00
|
|
|
("stacktrace", "print stacktrace of exception")
|
2022-04-14 14:49:49 +00:00
|
|
|
;
|
|
|
|
|
|
|
|
po::positional_options_description positional_desc;
|
|
|
|
positional_desc.add("input", 1);
|
|
|
|
positional_desc.add("output", 1);
|
|
|
|
|
|
|
|
po::variables_map options;
|
|
|
|
po::store(po::command_line_parser(argc, argv).options(desc).positional(positional_desc).run(), options);
|
|
|
|
|
|
|
|
if (options.count("help"))
|
|
|
|
{
|
2023-04-20 22:54:34 +00:00
|
|
|
std::cout << "Usage: " << argv[0] << " [options] < INPUT > OUTPUT" << std::endl;
|
|
|
|
std::cout << "Usage: " << argv[0] << " [options] INPUT OUTPUT" << std::endl;
|
2022-04-14 14:49:49 +00:00
|
|
|
std::cout << desc << std::endl;
|
2024-02-11 19:00:37 +00:00
|
|
|
std::cout << "\nSee also: https://clickhouse.com/docs/en/operations/utilities/clickhouse-compressor/\n";
|
2022-04-14 14:49:49 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-09-20 14:12:12 +00:00
|
|
|
bool decompress = options.count("decompress");
|
|
|
|
bool use_lz4hc = options.count("hc");
|
|
|
|
bool use_zstd = options.count("zstd");
|
|
|
|
bool stat_mode = options.count("stat");
|
|
|
|
bool use_none = options.count("none");
|
2023-03-06 16:49:28 +00:00
|
|
|
print_stacktrace = options.count("stacktrace");
|
2024-10-20 01:11:16 +00:00
|
|
|
size_t block_size = options["block-size"].as<size_t>();
|
|
|
|
size_t num_threads = options["threads"].as<size_t>();
|
2018-12-26 15:01:26 +00:00
|
|
|
std::vector<std::string> codecs;
|
|
|
|
if (options.count("codec"))
|
|
|
|
codecs = options["codec"].as<std::vector<std::string>>();
|
|
|
|
|
2024-10-23 19:45:39 +00:00
|
|
|
if ((use_lz4hc || use_zstd || use_none) && !codecs.empty())
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong options, codec flags like --zstd and --codec options are mutually exclusive");
|
2017-09-20 14:12:12 +00:00
|
|
|
|
2024-10-20 01:11:16 +00:00
|
|
|
if (num_threads < 1)
|
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid value of `threads` parameter");
|
|
|
|
|
|
|
|
if (num_threads > 1 && decompress)
|
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parallel mode is only implemented for compression (not for decompression)");
|
|
|
|
|
2019-04-01 16:28:20 +00:00
|
|
|
if (!codecs.empty() && options.count("level"))
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong options, --level is not compatible with --codec list");
|
2019-04-01 16:28:20 +00:00
|
|
|
|
2018-12-21 12:17:30 +00:00
|
|
|
std::string method_family = "LZ4";
|
2017-09-20 14:12:12 +00:00
|
|
|
|
|
|
|
if (use_lz4hc)
|
2018-12-21 12:17:30 +00:00
|
|
|
method_family = "LZ4HC";
|
2017-09-20 14:12:12 +00:00
|
|
|
else if (use_zstd)
|
2018-12-21 12:17:30 +00:00
|
|
|
method_family = "ZSTD";
|
2017-09-20 14:12:12 +00:00
|
|
|
else if (use_none)
|
2018-12-21 12:17:30 +00:00
|
|
|
method_family = "NONE";
|
2017-09-20 14:12:12 +00:00
|
|
|
|
2019-04-01 16:34:27 +00:00
|
|
|
std::optional<int> level = std::nullopt;
|
2018-12-21 12:17:30 +00:00
|
|
|
if (options.count("level"))
|
2019-04-01 16:34:27 +00:00
|
|
|
level = options["level"].as<int>();
|
2019-04-01 16:28:20 +00:00
|
|
|
|
2020-05-04 00:11:49 +00:00
|
|
|
CompressionCodecPtr codec;
|
2018-12-26 15:01:26 +00:00
|
|
|
if (!codecs.empty())
|
|
|
|
{
|
2020-05-04 00:11:49 +00:00
|
|
|
ParserCodec codec_parser;
|
2019-04-01 16:28:20 +00:00
|
|
|
|
|
|
|
std::string codecs_line = boost::algorithm::join(codecs, ",");
|
2024-03-17 18:53:58 +00:00
|
|
|
auto ast = parseQuery(codec_parser, "(" + codecs_line + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
|
2020-08-28 17:40:45 +00:00
|
|
|
codec = CompressionCodecFactory::instance().get(ast, nullptr);
|
2018-12-26 15:01:26 +00:00
|
|
|
}
|
|
|
|
else
|
2020-08-28 17:40:45 +00:00
|
|
|
codec = CompressionCodecFactory::instance().get(method_family, level);
|
2018-12-21 12:17:30 +00:00
|
|
|
|
2020-12-29 19:12:29 +00:00
|
|
|
std::unique_ptr<ReadBufferFromFileBase> rb;
|
|
|
|
std::unique_ptr<WriteBufferFromFileBase> wb;
|
|
|
|
|
|
|
|
if (options.count("input"))
|
|
|
|
rb = std::make_unique<ReadBufferFromFile>(options["input"].as<std::string>());
|
|
|
|
else
|
|
|
|
rb = std::make_unique<ReadBufferFromFileDescriptor>(STDIN_FILENO);
|
|
|
|
|
|
|
|
if (options.count("output"))
|
|
|
|
wb = std::make_unique<WriteBufferFromFile>(options["output"].as<std::string>());
|
|
|
|
else
|
|
|
|
wb = std::make_unique<WriteBufferFromFileDescriptor>(STDOUT_FILENO);
|
2017-09-20 14:12:12 +00:00
|
|
|
|
|
|
|
if (stat_mode)
|
|
|
|
{
|
|
|
|
/// Output statistic for compressed file.
|
2020-12-29 19:12:29 +00:00
|
|
|
checkAndWriteHeader(*rb, *wb);
|
2017-09-20 14:12:12 +00:00
|
|
|
}
|
|
|
|
else if (decompress)
|
|
|
|
{
|
|
|
|
/// Decompression
|
2020-12-29 20:06:14 +00:00
|
|
|
|
|
|
|
size_t offset_in_compressed_file = options["offset-in-compressed-file"].as<size_t>();
|
|
|
|
size_t offset_in_decompressed_block = options["offset-in-decompressed-block"].as<size_t>();
|
|
|
|
|
|
|
|
if (offset_in_compressed_file || offset_in_decompressed_block)
|
|
|
|
{
|
2020-12-31 07:08:12 +00:00
|
|
|
CompressedReadBufferFromFile compressed_file(std::move(rb));
|
2020-12-29 20:06:14 +00:00
|
|
|
compressed_file.seek(offset_in_compressed_file, offset_in_decompressed_block);
|
|
|
|
copyData(compressed_file, *wb);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CompressedReadBuffer from(*rb);
|
|
|
|
copyData(from, *wb);
|
|
|
|
}
|
2017-09-20 14:12:12 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/// Compression
|
2024-10-20 01:11:16 +00:00
|
|
|
|
|
|
|
if (num_threads == 1)
|
|
|
|
{
|
|
|
|
CompressedWriteBuffer to(*wb, codec, block_size);
|
|
|
|
copyData(*rb, to);
|
|
|
|
to.finalize();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ThreadPool pool(CurrentMetrics::LocalThread, CurrentMetrics::LocalThreadActive, CurrentMetrics::LocalThreadScheduled, num_threads);
|
|
|
|
ParallelCompressedWriteBuffer to(*wb, codec, block_size, num_threads, pool);
|
|
|
|
copyData(*rb, to);
|
|
|
|
to.finalize();
|
|
|
|
}
|
2017-09-20 14:12:12 +00:00
|
|
|
}
|
2024-09-26 08:27:37 +00:00
|
|
|
|
|
|
|
wb->finalize();
|
2017-09-20 14:12:12 +00:00
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
2023-03-06 16:49:28 +00:00
|
|
|
std::cerr << getCurrentExceptionMessage(print_stacktrace) << '\n';
|
2020-05-04 00:11:49 +00:00
|
|
|
return getCurrentExceptionCode();
|
2017-09-20 14:12:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|