Merge pull request #22145 from ClickHouse/speedup-none

Speedup codec NONE
alexey-milovidov 2021-03-28 01:56:00 +03:00 committed by GitHub
commit d07a40c675
9 changed files with 67 additions and 12 deletions
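The change below boils down to two things: the old char*-destination path of CompressedReadBufferBase::decompress is renamed to decompressTo, and a new overload taking a BufferBase::Buffer & is added which, for codec NONE, skips the memcpy entirely by repointing the buffer at the uncompressed payload that already sits after the frame header inside compressed_buffer. The following standalone sketch (not ClickHouse code; the names are hypothetical and the 9-byte header is only my reading of the frame layout) illustrates the difference between the copying path and the zero-copy path.

// Standalone illustration only (not ClickHouse code; names are hypothetical).
// A frame written by codec NONE is a small header followed by the raw bytes,
// so "decompression" can either copy those bytes out or just expose a view over them.
#include <cstddef>
#include <cstring>
#include <string_view>

struct Frame
{
    const char * data = nullptr;   // header + payload, as read from disk or the page cache
    std::size_t size = 0;          // total frame size, header included
    static constexpr std::size_t header_size = 9;  // assumed: method byte + two 4-byte sizes
};

// Copying path: the caller owns the destination, e.g. when reading straight into user memory.
void decompress_to(const Frame & frame, char * dest)
{
    std::memcpy(dest, frame.data + Frame::header_size, frame.size - Frame::header_size);
}

// Zero-copy path: point a view at the payload that is already uncompressed in place.
std::string_view decompress_view(const Frame & frame)
{
    return {frame.data + Frame::header_size, frame.size - Frame::header_size};
}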


@@ -51,7 +51,7 @@ bool CachedCompressedReadBuffer::nextImpl()
         {
             owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer();
             owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes);
-            decompress(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
+            decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
         }


@@ -21,7 +21,7 @@ bool CompressedReadBuffer::nextImpl()
     memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
     working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
-    decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+    decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
 
     return true;
 }
@@ -48,7 +48,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
         /// If the decompressed block fits entirely where it needs to be copied.
         if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
         {
-            decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
+            decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
             bytes_read += size_decompressed;
             bytes += size_decompressed;
         }
@@ -61,9 +61,9 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
             memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
             working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
-            pos = working_buffer.begin();
-            decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+            decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
+            pos = working_buffer.begin();
             bytes_read += read(to + bytes_read, n - bytes_read);
             break;


@@ -184,7 +184,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy)
 }
 
-void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
+static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs)
 {
     ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks);
     ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed);
@@ -210,11 +210,38 @@ void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
                             ErrorCodes::CANNOT_DECOMPRESS);
         }
     }
+}
+
+void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
+{
+    readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
     codec->decompress(compressed_buffer, size_compressed_without_checksum, to);
 }
+
+void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum)
+{
+    readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
+
+    if (codec->isNone())
+    {
+        /// Shortcut for NONE codec to avoid extra memcpy.
+        /// We do this by changing the buffer `to` to point to the existing uncompressed data.
+        UInt8 header_size = ICompressionCodec::getHeaderSize();
+        if (size_compressed_without_checksum < header_size)
+            throw Exception(ErrorCodes::CORRUPTED_DATA,
+                "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})",
+                size_compressed_without_checksum, static_cast<size_t>(header_size));
+
+        to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum);
+    }
+    else
+        codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin());
+}
 
 
 /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
 CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_)
     : compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_)
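One consequence, visible in the readBig hunks above and below: because the new decompress(BufferBase::Buffer &) may repoint the working buffer at memory owned by compressed_buffer, the callers now take pos from the buffer only after the call (the diffs move `pos = working_buffer.begin();` below the decompress call). Here is a minimal sketch of that ordering contract, with a hypothetical Buffer type standing in for the real one; it is an illustration, not ClickHouse code.

// Hypothetical, simplified types; only the ordering contract matters here.
#include <cstddef>

struct Buffer
{
    char * begin_ = nullptr;
    char * end_ = nullptr;
    char * begin() const { return begin_; }
    char * end() const { return end_; }
};

// Stand-in for the zero-copy decompress: it may make `to` point at a completely
// different memory region (the payload inside the compressed frame).
void decompress(Buffer & to, char * payload, std::size_t payload_size)
{
    to = Buffer{payload, payload + payload_size};
}

void example(char * frame_payload, std::size_t payload_size)
{
    char scratch[16];
    Buffer working_buffer{scratch, scratch + sizeof(scratch)};

    decompress(working_buffer, frame_payload, payload_size);

    // Correct: read the position only after the call, once the buffer has its final location.
    char * pos = working_buffer.begin();
    (void)pos;
}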


@@ -3,6 +3,7 @@
 #include <Common/PODArray.h>
 #include <Compression/LZ4_decompress_faster.h>
 #include <Compression/ICompressionCodec.h>
+#include <IO/BufferBase.h>
 
 namespace DB
@@ -37,7 +38,12 @@ protected:
     /// Returns number of compressed bytes read.
     size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy);
 
-    void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
+    /// Decompress into memory pointed to by `to`.
+    void decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
+
+    /// This method can change the location of `to` to avoid an unnecessary copy if the data is uncompressed.
+    /// It is more efficient for compression codec NONE, but not suitable if you want to decompress into a specific location.
+    void decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum);
 
 public:
     /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.


@@ -31,7 +31,7 @@ bool CompressedReadBufferFromFile::nextImpl()
     memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
     working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
-    decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+    decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
 
     return true;
 }
@@ -108,7 +108,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n)
         /// If the decompressed block fits entirely where it needs to be copied.
         if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
         {
-            decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
+            decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
             bytes_read += size_decompressed;
             bytes += size_decompressed;
         }
@@ -122,9 +122,9 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n)
             memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
             working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
-            pos = working_buffer.begin();
-            decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+            decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
+            pos = working_buffer.begin();
             bytes_read += read(to + bytes_read, n - bytes_read);
             break;


@@ -98,7 +98,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, char * dest)
     UInt8 header_size = getHeaderSize();
     if (source_size < header_size)
-        throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", source_size, size_t(header_size));
+        throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", source_size, static_cast<size_t>(header_size));
 
     uint8_t our_method = getMethodByte();
     uint8_t method = source[0];
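The message corrected here guards the same precondition that the new decompress(BufferBase::Buffer &) checks above: the frame must be at least one header long before the method byte (and, for codec NONE, the payload behind it) can be touched. A rough sketch of that validation, with hypothetical names and an assumed 9-byte header; it does not reproduce the ICompressionCodec API.

// Hypothetical sketch of the header check; not the ICompressionCodec API.
#include <cstddef>
#include <stdexcept>
#include <string>

constexpr std::size_t kHeaderSize = 9;  // assumed: 1 method byte + two 4-byte sizes

// Returns the method byte of a frame after checking that the header actually fits.
unsigned char readMethodByte(const char * source, std::size_t source_size)
{
    if (source_size < kHeaderSize)
        throw std::runtime_error(
            "Can't decompress data: the compressed data size (" + std::to_string(source_size)
            + ", this should include header size) is less than the header size (" + std::to_string(kHeaderSize) + ")");
    return static_cast<unsigned char>(source[0]);
}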


@@ -0,0 +1,13 @@
+<test>
+    <preconditions>
+        <table_exists>hits_10m_single</table_exists>
+    </preconditions>
+
+    <create_query>CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple()</create_query>
+    <fill_query>INSERT INTO hits_none SELECT Title FROM test.hits</fill_query>
+    <fill_query>OPTIMIZE TABLE hits_none FINAL</fill_query>
+
+    <query><![CDATA[SELECT sum(length(Title)) FROM hits_none]]></query>
+
+    <drop_query>DROP TABLE hits_none</drop_query>
+</test>


@@ -0,0 +1 @@
+687074654


@@ -0,0 +1,8 @@
+DROP TABLE IF EXISTS hits_none;
+CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple();
+INSERT INTO hits_none SELECT Title FROM test.hits;
+
+SET min_bytes_to_use_mmap_io = 1;
+SELECT sum(length(Title)) FROM hits_none;
+
+DROP TABLE hits_none;