2017-09-20 14:12:12 +00:00
# include <iostream>
2018-12-26 16:18:21 +00:00
# include <optional>
2017-09-20 14:12:12 +00:00
# include <boost/program_options.hpp>
2019-04-01 16:28:20 +00:00
# include <boost/algorithm/string/join.hpp>
2017-09-20 14:12:12 +00:00
# include <Common/Exception.h>
# include <IO/WriteBufferFromFileDescriptor.h>
# include <IO/ReadBufferFromFileDescriptor.h>
2020-12-29 19:12:29 +00:00
# include <IO/WriteBufferFromFile.h>
# include <IO/ReadBufferFromFile.h>
2018-12-28 18:15:26 +00:00
# include <Compression/CompressedWriteBuffer.h>
# include <Compression/CompressedReadBuffer.h>
2020-12-29 20:06:14 +00:00
# include <Compression/CompressedReadBufferFromFile.h>
2017-09-20 14:12:12 +00:00
# include <IO/WriteHelpers.h>
# include <IO/copyData.h>
2019-04-01 16:28:20 +00:00
# include <Parsers/parseQuery.h>
# include <Parsers/ExpressionElementParsers.h>
2018-12-21 12:17:30 +00:00
# include <Compression/CompressionFactory.h>
2019-08-23 15:47:27 +00:00
# include <Common/TerminalSize.h>
2020-04-15 20:28:05 +00:00
# include <Core/Defines.h>
2019-08-23 13:19:12 +00:00
2017-09-20 14:12:12 +00:00
namespace DB
{
namespace ErrorCodes
{
extern const int TOO_LARGE_SIZE_COMPRESSED ;
2018-12-26 15:01:26 +00:00
extern const int BAD_ARGUMENTS ;
2017-09-20 14:12:12 +00:00
}
}
namespace
{
/// Outputs sizes of uncompressed and compressed blocks for compressed file.
void checkAndWriteHeader ( DB : : ReadBuffer & in , DB : : WriteBuffer & out )
{
while ( ! in . eof ( ) )
{
in . ignore ( 16 ) ; /// checksum
char header [ COMPRESSED_BLOCK_HEADER_SIZE ] ;
in . readStrict ( header , COMPRESSED_BLOCK_HEADER_SIZE ) ;
UInt32 size_compressed = unalignedLoad < UInt32 > ( & header [ 1 ] ) ;
if ( size_compressed > DBMS_MAX_COMPRESSED_SIZE )
throw DB : : Exception ( " Too large size_compressed. Most likely corrupted data. " , DB : : ErrorCodes : : TOO_LARGE_SIZE_COMPRESSED ) ;
UInt32 size_decompressed = unalignedLoad < UInt32 > ( & header [ 5 ] ) ;
DB : : writeText ( size_decompressed , out ) ;
DB : : writeChar ( ' \t ' , out ) ;
DB : : writeText ( size_compressed , out ) ;
DB : : writeChar ( ' \n ' , out ) ;
in . ignore ( size_compressed - COMPRESSED_BLOCK_HEADER_SIZE ) ;
}
}
}
2019-12-15 06:34:43 +00:00
# pragma GCC diagnostic ignored "-Wunused-function"
# pragma GCC diagnostic ignored "-Wmissing-declarations"
2017-09-20 14:12:12 +00:00
int mainEntryClickHouseCompressor ( int argc , char * * argv )
{
2020-05-04 00:11:49 +00:00
using namespace DB ;
2019-08-23 15:47:27 +00:00
boost : : program_options : : options_description desc = createOptionsDescription ( " Allowed options " , getTerminalWidth ( ) ) ;
2017-09-20 14:12:12 +00:00
desc . add_options ( )
( " help,h " , " produce help message " )
2020-12-29 19:12:29 +00:00
( " input " , boost : : program_options : : value < std : : string > ( ) - > value_name ( " INPUT " ) , " input file " )
( " output " , boost : : program_options : : value < std : : string > ( ) - > value_name ( " OUTPUT " ) , " output file " )
2017-09-20 14:12:12 +00:00
( " decompress,d " , " decompress " )
2020-12-29 20:06:14 +00:00
( " offset-in-compressed-file " , boost : : program_options : : value < size_t > ( ) - > default_value ( 0ULL ) , " offset to the compressed block (i.e. physical file offset) " )
( " offset-in-decompressed-block " , boost : : program_options : : value < size_t > ( ) - > default_value ( 0ULL ) , " offset to the decompressed block (i.e. virtual offset) " )
2017-09-20 14:12:12 +00:00
( " block-size,b " , boost : : program_options : : value < unsigned > ( ) - > default_value ( DBMS_DEFAULT_BUFFER_SIZE ) , " compress in blocks of specified size " )
( " hc " , " use LZ4HC instead of LZ4 " )
( " zstd " , " use ZSTD instead of LZ4 " )
2018-12-26 15:01:26 +00:00
( " codec " , boost : : program_options : : value < std : : vector < std : : string > > ( ) - > multitoken ( ) , " use codecs combination instead of LZ4 " )
2020-01-11 09:50:41 +00:00
( " level " , boost : : program_options : : value < int > ( ) , " compression level for codecs specified via flags " )
2017-09-20 14:12:12 +00:00
( " none " , " use no compression instead of LZ4 " )
( " stat " , " print block statistics of compressed data " )
;
2020-12-29 19:12:29 +00:00
boost : : program_options : : positional_options_description positional_desc ;
positional_desc . add ( " input " , 1 ) ;
positional_desc . add ( " output " , 1 ) ;
2017-09-20 14:12:12 +00:00
boost : : program_options : : variables_map options ;
2020-12-29 19:12:29 +00:00
boost : : program_options : : store ( boost : : program_options : : command_line_parser ( argc , argv ) . options ( desc ) . positional ( positional_desc ) . run ( ) , options ) ;
2017-09-20 14:12:12 +00:00
if ( options . count ( " help " ) )
{
2020-12-29 19:12:29 +00:00
std : : cout < < " Usage: " < < argv [ 0 ] < < " [options] INPUT OUTPUT " < < std : : endl ;
2017-09-20 14:12:12 +00:00
std : : cout < < desc < < std : : endl ;
return 1 ;
}
try
{
bool decompress = options . count ( " decompress " ) ;
bool use_lz4hc = options . count ( " hc " ) ;
bool use_zstd = options . count ( " zstd " ) ;
bool stat_mode = options . count ( " stat " ) ;
bool use_none = options . count ( " none " ) ;
unsigned block_size = options [ " block-size " ] . as < unsigned > ( ) ;
2018-12-26 15:01:26 +00:00
std : : vector < std : : string > codecs ;
if ( options . count ( " codec " ) )
codecs = options [ " codec " ] . as < std : : vector < std : : string > > ( ) ;
if ( ( use_lz4hc | | use_zstd | | use_none ) & & ! codecs . empty ( ) )
2020-05-04 00:11:49 +00:00
throw Exception ( " Wrong options, codec flags like --zstd and --codec options are mutually exclusive " , ErrorCodes : : BAD_ARGUMENTS ) ;
2017-09-20 14:12:12 +00:00
2019-04-01 16:28:20 +00:00
if ( ! codecs . empty ( ) & & options . count ( " level " ) )
2020-05-04 00:11:49 +00:00
throw Exception ( " Wrong options, --level is not compatible with --codec list " , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-04-01 16:28:20 +00:00
2018-12-21 12:17:30 +00:00
std : : string method_family = " LZ4 " ;
2017-09-20 14:12:12 +00:00
if ( use_lz4hc )
2018-12-21 12:17:30 +00:00
method_family = " LZ4HC " ;
2017-09-20 14:12:12 +00:00
else if ( use_zstd )
2018-12-21 12:17:30 +00:00
method_family = " ZSTD " ;
2017-09-20 14:12:12 +00:00
else if ( use_none )
2018-12-21 12:17:30 +00:00
method_family = " NONE " ;
2017-09-20 14:12:12 +00:00
2019-04-01 16:34:27 +00:00
std : : optional < int > level = std : : nullopt ;
2018-12-21 12:17:30 +00:00
if ( options . count ( " level " ) )
2019-04-01 16:34:27 +00:00
level = options [ " level " ] . as < int > ( ) ;
2019-04-01 16:28:20 +00:00
2020-05-04 00:11:49 +00:00
CompressionCodecPtr codec ;
2018-12-26 15:01:26 +00:00
if ( ! codecs . empty ( ) )
{
2020-05-04 00:11:49 +00:00
ParserCodec codec_parser ;
2019-04-01 16:28:20 +00:00
std : : string codecs_line = boost : : algorithm : : join ( codecs , " , " ) ;
2020-05-04 00:11:49 +00:00
auto ast = parseQuery ( codec_parser , " ( " + codecs_line + " ) " , 0 , DBMS_DEFAULT_MAX_PARSER_DEPTH ) ;
2020-08-28 17:40:45 +00:00
codec = CompressionCodecFactory : : instance ( ) . get ( ast , nullptr ) ;
2018-12-26 15:01:26 +00:00
}
else
2020-08-28 17:40:45 +00:00
codec = CompressionCodecFactory : : instance ( ) . get ( method_family , level ) ;
2018-12-21 12:17:30 +00:00
2017-10-13 01:02:16 +00:00
2020-12-29 19:12:29 +00:00
std : : unique_ptr < ReadBufferFromFileBase > rb ;
std : : unique_ptr < WriteBufferFromFileBase > wb ;
if ( options . count ( " input " ) )
rb = std : : make_unique < ReadBufferFromFile > ( options [ " input " ] . as < std : : string > ( ) ) ;
else
rb = std : : make_unique < ReadBufferFromFileDescriptor > ( STDIN_FILENO ) ;
if ( options . count ( " output " ) )
wb = std : : make_unique < WriteBufferFromFile > ( options [ " output " ] . as < std : : string > ( ) ) ;
else
wb = std : : make_unique < WriteBufferFromFileDescriptor > ( STDOUT_FILENO ) ;
2017-09-20 14:12:12 +00:00
if ( stat_mode )
{
/// Output statistic for compressed file.
2020-12-29 19:12:29 +00:00
checkAndWriteHeader ( * rb , * wb ) ;
2017-09-20 14:12:12 +00:00
}
else if ( decompress )
{
/// Decompression
2020-12-29 20:06:14 +00:00
size_t offset_in_compressed_file = options [ " offset-in-compressed-file " ] . as < size_t > ( ) ;
size_t offset_in_decompressed_block = options [ " offset-in-decompressed-block " ] . as < size_t > ( ) ;
if ( offset_in_compressed_file | | offset_in_decompressed_block )
{
if ( ! options . count ( " input " ) )
{
throw DB : : Exception ( " --offset-in-compressed-file/--offset-in-decompressed-block requires --input " , DB : : ErrorCodes : : BAD_ARGUMENTS ) ;
}
CompressedReadBufferFromFile compressed_file ( options [ " input " ] . as < std : : string > ( ) , 0 , 0 , 0 ) ;
compressed_file . seek ( offset_in_compressed_file , offset_in_decompressed_block ) ;
copyData ( compressed_file , * wb ) ;
}
else
{
CompressedReadBuffer from ( * rb ) ;
copyData ( from , * wb ) ;
}
2017-09-20 14:12:12 +00:00
}
else
{
/// Compression
2020-12-29 19:12:29 +00:00
CompressedWriteBuffer to ( * wb , codec , block_size ) ;
copyData ( * rb , to ) ;
2017-09-20 14:12:12 +00:00
}
}
catch ( . . . )
{
2020-12-23 21:18:08 +00:00
std : : cerr < < getCurrentExceptionMessage ( true ) < < ' \n ' ;
2020-05-04 00:11:49 +00:00
return getCurrentExceptionCode ( ) ;
2017-09-20 14:12:12 +00:00
}
return 0 ;
}