mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-27 18:12:02 +00:00
Control memory usage in generateRandom
This commit is contained in:
parent
3f5ff27d67
commit
c62558f982
@ -29,6 +29,7 @@
|
||||
|
||||
#include <Common/SipHash.h>
|
||||
#include <Common/randomSeed.h>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <base/unaligned.h>
|
||||
|
||||
#include <Functions/FunctionFactory.h>
|
||||
@ -81,6 +82,66 @@ void fillBufferWithRandomData(char * __restrict data, size_t limit, size_t size_
|
||||
}
|
||||
|
||||
|
||||
size_t estimateValueSize(
|
||||
const DataTypePtr type,
|
||||
UInt64 max_array_length,
|
||||
UInt64 max_string_length)
|
||||
{
|
||||
if (type->haveMaximumSizeOfValue())
|
||||
return type->getMaximumSizeOfValueInMemory();
|
||||
|
||||
TypeIndex idx = type->getTypeId();
|
||||
|
||||
switch (idx)
|
||||
{
|
||||
case TypeIndex::String:
|
||||
{
|
||||
return max_string_length + sizeof(size_t) + 1;
|
||||
}
|
||||
|
||||
/// The logic in this function should reflect the logic of fillColumnWithRandomData.
|
||||
case TypeIndex::Array:
|
||||
{
|
||||
auto nested_type = typeid_cast<const DataTypeArray &>(*type).getNestedType();
|
||||
return sizeof(size_t) + estimateValueSize(nested_type, max_array_length / 2, max_string_length);
|
||||
}
|
||||
|
||||
case TypeIndex::Map:
|
||||
{
|
||||
const DataTypePtr & nested_type = typeid_cast<const DataTypeMap &>(*type).getNestedType();
|
||||
return sizeof(size_t) + estimateValueSize(nested_type, max_array_length / 2, max_string_length);
|
||||
}
|
||||
|
||||
case TypeIndex::Tuple:
|
||||
{
|
||||
auto elements = typeid_cast<const DataTypeTuple *>(type.get())->getElements();
|
||||
const size_t tuple_size = elements.size();
|
||||
size_t res = 0;
|
||||
|
||||
for (size_t i = 0; i < tuple_size; ++i)
|
||||
res += estimateValueSize(elements[i], max_array_length, max_string_length);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
case TypeIndex::Nullable:
|
||||
{
|
||||
auto nested_type = typeid_cast<const DataTypeNullable &>(*type).getNestedType();
|
||||
return 1 + estimateValueSize(nested_type, max_array_length, max_string_length);
|
||||
}
|
||||
|
||||
case TypeIndex::LowCardinality:
|
||||
{
|
||||
auto nested_type = typeid_cast<const DataTypeLowCardinality &>(*type).getDictionaryType();
|
||||
return sizeof(size_t) + estimateValueSize(nested_type, max_array_length, max_string_length);
|
||||
}
|
||||
|
||||
default:
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "The 'GenerateRandom' is not implemented for type {}", type->getName());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ColumnPtr fillColumnWithRandomData(
|
||||
const DataTypePtr type,
|
||||
UInt64 limit,
|
||||
@ -192,7 +253,8 @@ ColumnPtr fillColumnWithRandomData(
|
||||
offsets[i] = offset;
|
||||
}
|
||||
|
||||
auto data_column = fillColumnWithRandomData(nested_type, offset, max_array_length, max_string_length, rng, context);
|
||||
/// This division by two makes the size growth subexponential on depth.
|
||||
auto data_column = fillColumnWithRandomData(nested_type, offset, max_array_length / 2, max_string_length, rng, context);
|
||||
|
||||
return ColumnArray::create(data_column, std::move(offsets_column));
|
||||
}
|
||||
@ -200,7 +262,7 @@ ColumnPtr fillColumnWithRandomData(
|
||||
case TypeIndex::Map:
|
||||
{
|
||||
const DataTypePtr & nested_type = typeid_cast<const DataTypeMap &>(*type).getNestedType();
|
||||
auto nested_column = fillColumnWithRandomData(nested_type, limit, max_array_length, max_string_length, rng, context);
|
||||
auto nested_column = fillColumnWithRandomData(nested_type, limit, max_array_length / 2, max_string_length, rng, context);
|
||||
return ColumnMap::create(nested_column);
|
||||
}
|
||||
|
||||
@ -597,6 +659,25 @@ Pipe StorageGenerateRandom::read(
|
||||
block_header.insert({std::move(column), name_type.type, name_type.name});
|
||||
}
|
||||
|
||||
/// Correction of block size for wide tables.
|
||||
size_t preferred_block_size_bytes = context->getSettingsRef().preferred_block_size_bytes;
|
||||
if (preferred_block_size_bytes)
|
||||
{
|
||||
size_t estimated_row_size_bytes = estimateValueSize(std::make_shared<DataTypeTuple>(block_header.getDataTypes()), max_array_length, max_string_length);
|
||||
|
||||
size_t estimated_block_size_bytes = 0;
|
||||
if (common::mulOverflow(max_block_size, estimated_row_size_bytes, estimated_block_size_bytes))
|
||||
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large estimated block size in GenerateRandom table: its estimation leads to 64bit overflow");
|
||||
chassert(estimated_block_size_bytes != 0);
|
||||
|
||||
if (estimated_block_size_bytes > preferred_block_size_bytes)
|
||||
{
|
||||
max_block_size = static_cast<size_t>(max_block_size * (static_cast<double>(preferred_block_size_bytes) / estimated_block_size_bytes));
|
||||
if (max_block_size == 0)
|
||||
max_block_size = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Will create more seed values for each source from initial seed.
|
||||
pcg64 generate(random_seed);
|
||||
|
||||
|
@ -5,7 +5,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&max_block_size=5&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0" -d 'SELECT max(number) FROM numbers(10)' 2>&1 | grep -E 'Content-Encoding|X-ClickHouse-Progress|^[0-9]'
|
||||
# This test will fail with external poco (progress not supported)
|
||||
|
||||
${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&max_block_size=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&output_format_parallel_formatting=0" -d 'SELECT number FROM numbers(10)' 2>&1 | grep -E 'Content-Encoding|X-ClickHouse-Progress|^[0-9]'
|
||||
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_block_size=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&enable_http_compression=1" -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 10' | gzip -d
|
||||
|
Loading…
Reference in New Issue
Block a user