Merge pull request #42046 from evillique/fix-bz2-decoding

Fix bzip2 decoding issue
This commit is contained in:
Nikolay Degterinsky 2022-10-06 16:02:09 +02:00 committed by GitHub
commit 16a8145deb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 67 additions and 13 deletions

View File

@ -35,6 +35,33 @@ public:
BZ2_bzDecompressEnd(&stream); BZ2_bzDecompressEnd(&stream);
} }
/// Tears down and re-creates the bzip2 decompression state held in `stream`,
/// so that decoding can continue with a new bzip2 stream.
///
/// The output cursor (`next_out` / `avail_out`) is carried over the reset.
/// The input cursor (`next_in` / `avail_in`) is zeroed together with the rest of the
/// struct, so the caller has to set it again afterwards.
///
/// Throws Exception(BZIP2_STREAM_DECODER_FAILED) if either bzip2 call does not return BZ_OK.
void reinitialize()
{
    /// Save the output position: the memset below wipes the whole struct.
    auto avail_out = stream.avail_out;
    auto * next_out = stream.next_out;

    int ret = BZ2_bzDecompressEnd(&stream);
    if (ret != BZ_OK)
        throw Exception(
            ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
            "bzip2 stream decoder reinit decompress end failed: error code: {}",
            ret);

    /// Zero the struct we are about to re-initialize (size taken from this very object).
    memset(&stream, 0, sizeof(stream));

    ret = BZ2_bzDecompressInit(&stream, 0, 0);
    if (ret != BZ_OK)
        throw Exception(
            ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
            "bzip2 stream decoder reinit failed: error code: {}",
            ret);

    /// Resume writing decompressed data where the previous stream left off.
    stream.avail_out = avail_out;
    stream.next_out = next_out;
}
bz_stream stream; bz_stream stream;
}; };
@ -68,24 +95,24 @@ bool Bzip2ReadBuffer::nextImpl()
ret = BZ2_bzDecompress(&bz->stream); ret = BZ2_bzDecompress(&bz->stream);
in->position() = in->buffer().end() - bz->stream.avail_in; in->position() = in->buffer().end() - bz->stream.avail_in;
if (ret == BZ_STREAM_END && !in->eof())
{
bz->reinitialize();
bz->stream.avail_in = in->buffer().end() - in->position();
bz->stream.next_in = in->position();
ret = BZ_OK;
}
} }
while (bz->stream.avail_out == internal_buffer.size() && ret == BZ_OK && !in->eof()); while (bz->stream.avail_out == internal_buffer.size() && ret == BZ_OK && !in->eof());
working_buffer.resize(internal_buffer.size() - bz->stream.avail_out); working_buffer.resize(internal_buffer.size() - bz->stream.avail_out);
if (ret == BZ_STREAM_END) if (ret == BZ_STREAM_END && in->eof())
{ {
if (in->eof()) eof_flag = true;
{ return !working_buffer.empty();
eof_flag = true;
return !working_buffer.empty();
}
else
{
throw Exception(
ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
"bzip2 decoder finished, but input stream has not exceeded: error code: {}", ret);
}
} }
if (ret != BZ_OK) if (ret != BZ_OK)

View File

@ -123,3 +123,4 @@ Hello, world
Hello, world Hello, world
0 0
Part1 Part2 Part1 Part2
Part1 Part2

View File

@ -51,5 +51,6 @@ echo "'Hello, world'" | bzip2 -c | ${CLICKHOUSE_CURL} -sS --data-binary @- -H 'C
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&enable_http_compression=1" -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 0' | wc -c; ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&enable_http_compression=1" -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 0' | wc -c;
# POST multiple concatenated gzip streams. # POST multiple concatenated gzip and bzip2 streams.
(echo -n "SELECT 'Part1" | gzip -c; echo " Part2'" | gzip -c) | ${CLICKHOUSE_CURL} -sS -H 'Content-Encoding: gzip' "${CLICKHOUSE_URL}" --data-binary @- (echo -n "SELECT 'Part1" | gzip -c; echo " Part2'" | gzip -c) | ${CLICKHOUSE_CURL} -sS -H 'Content-Encoding: gzip' "${CLICKHOUSE_URL}" --data-binary @-
(echo -n "SELECT 'Part1" | bzip2 -c; echo " Part2'" | bzip2 -c) | ${CLICKHOUSE_CURL} -sS -H 'Content-Encoding: bz2' "${CLICKHOUSE_URL}" --data-binary @-

View File

@ -0,0 +1,4 @@
0
1
2
3

View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# Tag no-fasttest: depends on bzip2

# Checks that a file made of two concatenated bzip2 streams is read in full:
# numbers 0..1 go to the first archive, 2..3 to the second, and the
# concatenation is expected to yield all four rows.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Discover the directory used by the file() table function: query a file that does
# not exist, take the 9th field of the resulting Exception line and strip the
# "/nonexist.txt" suffix from it.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

# Work in a per-database subdirectory and start from a clean state.
WORKING_FOLDER_02457="${USER_FILES_PATH}/${CLICKHOUSE_DATABASE}"
rm -rf "${WORKING_FOLDER_02457}"
mkdir "${WORKING_FOLDER_02457}"

# Produce two independent .bz2 files.
${CLICKHOUSE_CLIENT} --query "SELECT * FROM numbers(0, 2) INTO OUTFILE '${WORKING_FOLDER_02457}/file_1.bz2'"
${CLICKHOUSE_CLIENT} --query "SELECT * FROM numbers(2, 2) INTO OUTFILE '${WORKING_FOLDER_02457}/file_2.bz2'"

# Glue the two files together byte-wise; paths are quoted so that whitespace
# or glob characters in the directory name cannot split or expand them.
cat "${WORKING_FOLDER_02457}/file_1.bz2" "${WORKING_FOLDER_02457}/file_2.bz2" > "${WORKING_FOLDER_02457}/concatenated.bz2"

# Read the concatenated file back.
${CLICKHOUSE_CLIENT} --query "SELECT * FROM file('${WORKING_FOLDER_02457}/concatenated.bz2', 'TabSeparated', 'col Int64')"

rm -rf "${WORKING_FOLDER_02457}"