ClickHouse/programs/compressor/Compressor.cpp

164 lines
5.3 KiB
C++
Raw Normal View History

#include <iostream>
2018-12-26 16:18:21 +00:00
#include <optional>
#include <boost/program_options.hpp>
2019-04-01 16:28:20 +00:00
#include <boost/algorithm/string/join.hpp>
#include <Common/Exception.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/ReadBufferFromFileDescriptor.h>
2018-12-28 18:15:26 +00:00
#include <Compression/CompressedWriteBuffer.h>
#include <Compression/CompressedReadBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
2019-04-01 16:28:20 +00:00
#include <Parsers/parseQuery.h>
#include <Parsers/ExpressionElementParsers.h>
2018-12-21 12:17:30 +00:00
#include <Compression/CompressionFactory.h>
2019-08-23 15:47:27 +00:00
#include <Common/TerminalSize.h>
#include <Core/Defines.h>
2019-08-23 13:19:12 +00:00
namespace DB
{
    namespace ErrorCodes
    {
        extern const int TOO_LARGE_SIZE_COMPRESSED;
        extern const int BAD_ARGUMENTS;
        extern const int CORRUPTED_DATA;
    }
}

namespace
{

/// Outputs sizes of uncompressed and compressed blocks for compressed file.
///
/// Reads block after block from `in` until EOF. For each block it skips the
/// 16-byte checksum, reads the block header, writes one line
/// "<size_decompressed>\t<size_compressed>\n" to `out` and skips the rest of the block.
///
/// Throws DB::Exception with TOO_LARGE_SIZE_COMPRESSED if the size stored in the
/// header exceeds DBMS_MAX_COMPRESSED_SIZE, and with CORRUPTED_DATA if it is
/// smaller than the header itself.
void checkAndWriteHeader(DB::ReadBuffer & in, DB::WriteBuffer & out)
{
    while (!in.eof())
    {
        in.ignore(16);    /// checksum

        char header[COMPRESSED_BLOCK_HEADER_SIZE];
        in.readStrict(header, COMPRESSED_BLOCK_HEADER_SIZE);

        /// Byte 0 of the header is not inspected here; the two sizes follow it at offsets 1 and 5.
        UInt32 size_compressed = unalignedLoad<UInt32>(&header[1]);

        if (size_compressed > DBMS_MAX_COMPRESSED_SIZE)
            throw DB::Exception("Too large size_compressed. Most likely corrupted data.", DB::ErrorCodes::TOO_LARGE_SIZE_COMPRESSED);

        /// size_compressed counts the header bytes that were just read (see the subtraction below).
        /// Without this check a smaller value would wrap around in unsigned arithmetic
        /// and request skipping of about 4 GiB of input.
        if (size_compressed < COMPRESSED_BLOCK_HEADER_SIZE)
            throw DB::Exception("Too small size_compressed. Most likely corrupted data.", DB::ErrorCodes::CORRUPTED_DATA);

        UInt32 size_decompressed = unalignedLoad<UInt32>(&header[5]);

        DB::writeText(size_decompressed, out);
        DB::writeChar('\t', out);
        DB::writeText(size_compressed, out);
        DB::writeChar('\n', out);

        /// Skip the block payload; the header part of size_compressed is already consumed.
        in.ignore(size_compressed - COMPRESSED_BLOCK_HEADER_SIZE);
    }
}

}
2019-12-15 06:34:43 +00:00
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"
int mainEntryClickHouseCompressor(int argc, char ** argv)
{
2020-05-04 00:11:49 +00:00
using namespace DB;
2019-08-23 15:47:27 +00:00
boost::program_options::options_description desc = createOptionsDescription("Allowed options", getTerminalWidth());
desc.add_options()
("help,h", "produce help message")
("decompress,d", "decompress")
("block-size,b", boost::program_options::value<unsigned>()->default_value(DBMS_DEFAULT_BUFFER_SIZE), "compress in blocks of specified size")
("hc", "use LZ4HC instead of LZ4")
("zstd", "use ZSTD instead of LZ4")
2018-12-26 15:01:26 +00:00
("codec", boost::program_options::value<std::vector<std::string>>()->multitoken(), "use codecs combination instead of LZ4")
2020-01-11 09:50:41 +00:00
("level", boost::program_options::value<int>(), "compression level for codecs specified via flags")
("none", "use no compression instead of LZ4")
("stat", "print block statistics of compressed data")
;
boost::program_options::variables_map options;
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);
if (options.count("help"))
{
std::cout << "Usage: " << argv[0] << " [options] < in > out" << std::endl;
std::cout << desc << std::endl;
return 1;
}
try
{
bool decompress = options.count("decompress");
bool use_lz4hc = options.count("hc");
bool use_zstd = options.count("zstd");
bool stat_mode = options.count("stat");
bool use_none = options.count("none");
unsigned block_size = options["block-size"].as<unsigned>();
2018-12-26 15:01:26 +00:00
std::vector<std::string> codecs;
if (options.count("codec"))
codecs = options["codec"].as<std::vector<std::string>>();
if ((use_lz4hc || use_zstd || use_none) && !codecs.empty())
2020-05-04 00:11:49 +00:00
throw Exception("Wrong options, codec flags like --zstd and --codec options are mutually exclusive", ErrorCodes::BAD_ARGUMENTS);
2019-04-01 16:28:20 +00:00
if (!codecs.empty() && options.count("level"))
2020-05-04 00:11:49 +00:00
throw Exception("Wrong options, --level is not compatible with --codec list", ErrorCodes::BAD_ARGUMENTS);
2019-04-01 16:28:20 +00:00
2018-12-21 12:17:30 +00:00
std::string method_family = "LZ4";
if (use_lz4hc)
2018-12-21 12:17:30 +00:00
method_family = "LZ4HC";
else if (use_zstd)
2018-12-21 12:17:30 +00:00
method_family = "ZSTD";
else if (use_none)
2018-12-21 12:17:30 +00:00
method_family = "NONE";
2019-04-01 16:34:27 +00:00
std::optional<int> level = std::nullopt;
2018-12-21 12:17:30 +00:00
if (options.count("level"))
2019-04-01 16:34:27 +00:00
level = options["level"].as<int>();
2019-04-01 16:28:20 +00:00
2020-05-04 00:11:49 +00:00
CompressionCodecPtr codec;
2018-12-26 15:01:26 +00:00
if (!codecs.empty())
{
2020-05-04 00:11:49 +00:00
ParserCodec codec_parser;
2019-04-01 16:28:20 +00:00
std::string codecs_line = boost::algorithm::join(codecs, ",");
2020-05-04 00:11:49 +00:00
auto ast = parseQuery(codec_parser, "(" + codecs_line + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
codec = CompressionCodecFactory::instance().get(ast, nullptr);
2018-12-26 15:01:26 +00:00
}
else
codec = CompressionCodecFactory::instance().get(method_family, level);
2018-12-21 12:17:30 +00:00
2020-05-04 00:11:49 +00:00
ReadBufferFromFileDescriptor rb(STDIN_FILENO);
WriteBufferFromFileDescriptor wb(STDOUT_FILENO);
if (stat_mode)
{
/// Output statistic for compressed file.
checkAndWriteHeader(rb, wb);
}
else if (decompress)
{
/// Decompression
2020-05-04 00:11:49 +00:00
CompressedReadBuffer from(rb);
copyData(from, wb);
}
else
{
/// Compression
2020-05-04 00:11:49 +00:00
CompressedWriteBuffer to(wb, codec, block_size);
copyData(rb, to);
}
}
catch (...)
{
std::cerr << getCurrentExceptionMessage(true) << '\n';
2020-05-04 00:11:49 +00:00
return getCurrentExceptionCode();
}
return 0;
}