Control memory usage in generateRandom

This commit is contained in:
Alexey Milovidov 2023-06-04 04:44:51 +02:00
parent 3f5ff27d67
commit c62558f982
2 changed files with 83 additions and 3 deletions

View File

@ -29,6 +29,7 @@
#include <Common/SipHash.h>
#include <Common/randomSeed.h>
#include <Interpreters/Context.h>
#include <base/unaligned.h>
#include <Functions/FunctionFactory.h>
@ -81,6 +82,66 @@ void fillBufferWithRandomData(char * __restrict data, size_t limit, size_t size_
}
size_t estimateValueSize(
const DataTypePtr type,
UInt64 max_array_length,
UInt64 max_string_length)
{
if (type->haveMaximumSizeOfValue())
return type->getMaximumSizeOfValueInMemory();
TypeIndex idx = type->getTypeId();
switch (idx)
{
case TypeIndex::String:
{
return max_string_length + sizeof(size_t) + 1;
}
/// The logic in this function should reflect the logic of fillColumnWithRandomData.
case TypeIndex::Array:
{
auto nested_type = typeid_cast<const DataTypeArray &>(*type).getNestedType();
return sizeof(size_t) + estimateValueSize(nested_type, max_array_length / 2, max_string_length);
}
case TypeIndex::Map:
{
const DataTypePtr & nested_type = typeid_cast<const DataTypeMap &>(*type).getNestedType();
return sizeof(size_t) + estimateValueSize(nested_type, max_array_length / 2, max_string_length);
}
case TypeIndex::Tuple:
{
auto elements = typeid_cast<const DataTypeTuple *>(type.get())->getElements();
const size_t tuple_size = elements.size();
size_t res = 0;
for (size_t i = 0; i < tuple_size; ++i)
res += estimateValueSize(elements[i], max_array_length, max_string_length);
return res;
}
case TypeIndex::Nullable:
{
auto nested_type = typeid_cast<const DataTypeNullable &>(*type).getNestedType();
return 1 + estimateValueSize(nested_type, max_array_length, max_string_length);
}
case TypeIndex::LowCardinality:
{
auto nested_type = typeid_cast<const DataTypeLowCardinality &>(*type).getDictionaryType();
return sizeof(size_t) + estimateValueSize(nested_type, max_array_length, max_string_length);
}
default:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "The 'GenerateRandom' is not implemented for type {}", type->getName());
}
}
ColumnPtr fillColumnWithRandomData(
const DataTypePtr type,
UInt64 limit,
@ -192,7 +253,8 @@ ColumnPtr fillColumnWithRandomData(
offsets[i] = offset;
}
auto data_column = fillColumnWithRandomData(nested_type, offset, max_array_length, max_string_length, rng, context);
/// This division by two makes the size growth subexponential on depth.
auto data_column = fillColumnWithRandomData(nested_type, offset, max_array_length / 2, max_string_length, rng, context);
return ColumnArray::create(data_column, std::move(offsets_column));
}
@ -200,7 +262,7 @@ ColumnPtr fillColumnWithRandomData(
case TypeIndex::Map:
{
const DataTypePtr & nested_type = typeid_cast<const DataTypeMap &>(*type).getNestedType();
auto nested_column = fillColumnWithRandomData(nested_type, limit, max_array_length, max_string_length, rng, context);
auto nested_column = fillColumnWithRandomData(nested_type, limit, max_array_length / 2, max_string_length, rng, context);
return ColumnMap::create(nested_column);
}
@ -597,6 +659,25 @@ Pipe StorageGenerateRandom::read(
block_header.insert({std::move(column), name_type.type, name_type.name});
}
/// Correction of block size for wide tables.
size_t preferred_block_size_bytes = context->getSettingsRef().preferred_block_size_bytes;
if (preferred_block_size_bytes)
{
size_t estimated_row_size_bytes = estimateValueSize(std::make_shared<DataTypeTuple>(block_header.getDataTypes()), max_array_length, max_string_length);
size_t estimated_block_size_bytes = 0;
if (common::mulOverflow(max_block_size, estimated_row_size_bytes, estimated_block_size_bytes))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large estimated block size in GenerateRandom table: its estimation leads to 64bit overflow");
chassert(estimated_block_size_bytes != 0);
if (estimated_block_size_bytes > preferred_block_size_bytes)
{
max_block_size = static_cast<size_t>(max_block_size * (static_cast<double>(preferred_block_size_bytes) / estimated_block_size_bytes));
if (max_block_size == 0)
max_block_size = 1;
}
}
/// Will create more seed values for each source from initial seed.
pcg64 generate(random_seed);

View File

@ -5,7 +5,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# Source the shared shell configuration from the parent directory
# (presumably where CLICKHOUSE_CURL and CLICKHOUSE_URL are defined - confirm).
. "$CURDIR"/../shell_config.sh
# Aggregating query with max_block_size=5 and progress sent in HTTP headers.
# curl -v writes the headers to stderr, so stderr is merged into stdout and the output is
# filtered down to Content-Encoding / X-ClickHouse-Progress headers and lines starting with a digit.
${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&max_block_size=5&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0" -d 'SELECT max(number) FROM numbers(10)' 2>&1 | grep -E 'Content-Encoding|X-ClickHouse-Progress|^[0-9]'
# This test will fail with external poco (progress not supported)
# Same filtering, but with max_block_size=1 and output_format_parallel_formatting=0 on a plain SELECT.
${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&max_block_size=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&output_format_parallel_formatting=0" -d 'SELECT number FROM numbers(10)' 2>&1 | grep -E 'Content-Encoding|X-ClickHouse-Progress|^[0-9]'
# Requests a gzip-encoded response (enable_http_compression=1 plus Accept-Encoding: gzip)
# and decompresses the body with gzip -d; no -v here, so only the body is printed.
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_block_size=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&enable_http_compression=1" -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 10' | gzip -d