better sampling in ColumnSparse and fix alter rename column

Anton Popov 2021-11-08 21:24:38 +03:00
parent 84e914e05a
commit a7f219b3aa
7 changed files with 35 additions and 20 deletions

View File

@@ -149,22 +149,34 @@ double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio);
/// Randomize a little to avoid boundary effects.
std::uniform_int_distribution<size_t> dist(1, static_cast<size_t>(1.0 / sample_ratio));
size_t num_rows = size();
size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);
if (num_sampled_rows == 0)
return 0.0;
size_t step = num_rows / num_sampled_rows;
std::uniform_int_distribution<size_t> dist(1, step);
size_t num_checked_rows = dist(thread_local_rng);
num_sampled_rows = std::min(num_sampled_rows + dist(thread_local_rng), num_rows);
size_t res = 0;
for (size_t i = 0; i < num_rows; i += step)
if (num_sampled_rows == num_rows)
{
size_t idx = std::min(i + dist(thread_local_rng), num_rows - 1);
res += static_cast<const Derived &>(*this).isDefaultAt(idx);
for (size_t i = 0; i < num_rows; ++i)
res += static_cast<const Derived &>(*this).isDefaultAt(i);
num_checked_rows = num_rows;
}
else if (num_sampled_rows != 0)
{
for (size_t i = num_checked_rows; i < num_rows; ++i)
{
if (num_checked_rows * num_rows <= i * num_sampled_rows)
{
res += static_cast<const Derived &>(*this).isDefaultAt(i);
++num_checked_rows;
}
}
}
return static_cast<double>(res) / num_sampled_rows;
return static_cast<double>(res) / num_checked_rows;
}
template <typename Derived>
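Taken together, the new sampling in this hunk replaces the old fixed-stride walk (step = num_rows / num_sampled_rows, then divide by the requested sample size) with an even spread of checks driven by a Bresenham-style counter, divided by the number of rows that were actually inspected. Below is a hedged, self-contained sketch of the same scheme applied to a plain std::vector; the free-function name, the RNG, and the "is default" test are illustrative stand-ins, not ClickHouse identifiers.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

/// Same even-spread sampling idea as the new getRatioOfDefaultRowsImpl, but as a
/// free function over a vector so it compiles on its own. Assumes the caller
/// guarantees 0.0 < sample_ratio <= 1.0, as the exception in the hunk enforces.
double estimateDefaultRatio(const std::vector<int> & column, double sample_ratio)
{
    static thread_local std::mt19937_64 rng{std::random_device{}()};

    /// Randomize a little to avoid boundary effects.
    std::uniform_int_distribution<size_t> dist(1, static_cast<size_t>(1.0 / sample_ratio));

    size_t num_rows = column.size();
    size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);
    size_t num_checked_rows = dist(rng);
    num_sampled_rows = std::min(num_sampled_rows + dist(rng), num_rows);
    size_t res = 0;

    if (num_sampled_rows == num_rows)
    {
        /// Tiny column or sample_ratio close to 1: just check every row.
        for (size_t i = 0; i < num_rows; ++i)
            res += (column[i] == 0);
        num_checked_rows = num_rows;
    }
    else if (num_sampled_rows != 0)
    {
        for (size_t i = num_checked_rows; i < num_rows; ++i)
        {
            /// Check a row whenever the share of rows checked so far lags the
            /// share of the column scanned; this spreads the samples evenly
            /// instead of striding with a fixed step.
            if (num_checked_rows * num_rows <= i * num_sampled_rows)
            {
                res += (column[i] == 0);
                ++num_checked_rows;
            }
        }
    }

    /// Divide by what was actually inspected, not by the requested sample size.
    return static_cast<double>(res) / num_checked_rows;
}

int main()
{
    std::vector<int> column(100000, 0);
    for (size_t i = 0; i < 10000; ++i)
        column[i] = 1;                                        /// 10% non-default rows
    std::cout << estimateDefaultRatio(column, 0.01) << '\n';  /// prints roughly 0.9
}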

View File

@@ -684,10 +684,15 @@ MergeTreeDataMergerMutator::getColumnsForNewDataPart(
NameSet removed_columns;
NameToNameMap renamed_columns_to_from;
NameToNameMap renamed_columns_from_to;
ColumnsDescription part_columns(source_part->getColumns());
/// All commands are validated in AlterCommand so we don't care about order
for (const auto & command : commands_for_removes)
{
/// If we don't have this column in the source part, then we don't need to materialize it
if (!part_columns.has(command.column_name))
continue;
if (command.type == MutationCommand::DROP_COLUMN)
removed_columns.insert(command.column_name);
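The check added here skips any mutation command that references a column the source part does not physically contain; with ALTER ... RENAME COLUMN in the picture, an individual part can easily lack a column that the command set mentions (for example, a part written before the column existed under that name). A minimal sketch of the guard, with a std::set standing in for ColumnsDescription and plain strings for command types (names here are illustrative, not the ClickHouse API):

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct MutationCommand
{
    std::string type;          /// e.g. "DROP_COLUMN", "RENAME_COLUMN"
    std::string column_name;
};

int main()
{
    /// Columns that physically exist in the source part, e.g. an old part
    /// created before a later ALTER touched the schema.
    std::set<std::string> part_columns{"id", "value_old"};

    std::vector<MutationCommand> commands
    {
        {"DROP_COLUMN", "value_new"},   /// refers to a column this part never had
        {"DROP_COLUMN", "value_old"},
    };

    std::set<std::string> removed_columns;
    for (const auto & command : commands)
    {
        /// If the source part does not have this column, there is nothing to
        /// materialize or drop for it -- skip the command instead of acting
        /// on a column that does not exist in this part.
        if (!part_columns.count(command.column_name))
            continue;

        if (command.type == "DROP_COLUMN")
            removed_columns.insert(command.column_name);
    }

    for (const auto & name : removed_columns)
        std::cout << name << '\n';      /// prints only "value_old"
}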

View File

@@ -72,7 +72,7 @@ def drop_table(cluster):
# S3 request will be failed for an appropriate part file write.
FILES_PER_PART_BASE = 6 # partition.dat, default_compression_codec.txt, count.txt, columns.txt, checksums.txt, serialization.txt
FILES_PER_PART_BASE = 5 # partition.dat, default_compression_codec.txt, count.txt, columns.txt, checksums.txt
FILES_PER_PART_WIDE = FILES_PER_PART_BASE + 1 + 1 + 3 * 2 # Primary index, MinMax, Mark and data file for column(s)
# In debug build there are additional requests (from MergeTreeDataPartWriterWide.cpp:554 due to additional validation).
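Worked out with the new base count, and assuming the test table has three columns as the 3 * 2 term suggests, FILES_PER_PART_WIDE comes to 5 + 1 (primary index) + 1 (minmax index) + 3 * 2 (a data and a mark file per column) = 13 files per wide part, down from 14 while serialization.txt was still counted in the base.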

View File

@@ -61,8 +61,8 @@ def partition_complex_assert_checksums():
"13cae8e658e0ca4f75c56b1fc424e150\tshadow/1/data/test/partition_complex/19700102_2_2_0/minmax_p.idx\n" \
"25daad3d9e60b45043a70c4ab7d3b1c6\tshadow/1/data/test/partition_complex/19700102_2_2_0/partition.dat\n" \
"3726312af62aec86b64a7708d5751787\tshadow/1/data/test/partition_complex/19700201_1_1_0/partition.dat\n" \
"37855b06a39b79a67ea4e86e4a3299aa\tshadow/1/data/test/partition_complex/19700102_2_2_0/checksums.txt\n" \
"38e62ff37e1e5064e9a3f605dfe09d13\tshadow/1/data/test/partition_complex/19700102_2_2_0/v1.bin\n" \
"43d32c3316e810e0231ee4f93dbf2875\tshadow/1/data/test/partition_complex/19700102_2_2_0/checksums.txt\n" \
"4ae71336e44bf9bf79d2752e234818a5\tshadow/1/data/test/partition_complex/19700102_2_2_0/k.mrk\n" \
"4ae71336e44bf9bf79d2752e234818a5\tshadow/1/data/test/partition_complex/19700102_2_2_0/p.mrk\n" \
"4ae71336e44bf9bf79d2752e234818a5\tshadow/1/data/test/partition_complex/19700102_2_2_0/v1.mrk\n" \
@@ -70,19 +70,17 @@ def partition_complex_assert_checksums():
"4ae71336e44bf9bf79d2752e234818a5\tshadow/1/data/test/partition_complex/19700201_1_1_0/p.mrk\n" \
"4ae71336e44bf9bf79d2752e234818a5\tshadow/1/data/test/partition_complex/19700201_1_1_0/v1.mrk\n" \
"55a54008ad1ba589aa210d2629c1df41\tshadow/1/data/test/partition_complex/19700201_1_1_0/primary.idx\n" \
"5f087cb3e7071bf9407e095821e2af8f\tshadow/1/data/test/partition_complex/19700201_1_1_0/checksums.txt\n" \
"77d5af402ada101574f4da114f242e02\tshadow/1/data/test/partition_complex/19700102_2_2_0/columns.txt\n" \
"77d5af402ada101574f4da114f242e02\tshadow/1/data/test/partition_complex/19700201_1_1_0/columns.txt\n" \
"88cdc31ded355e7572d68d8cde525d3a\tshadow/1/data/test/partition_complex/19700201_1_1_0/p.bin\n" \
"9e688c58a5487b8eaf69c9e1005ad0bf\tshadow/1/data/test/partition_complex/19700102_2_2_0/primary.idx\n" \
"b0f1c38fe8a3e0b38d75f4d9c142bc45\tshadow/1/data/test/partition_complex/19700201_1_1_0/checksums.txt\n" \
"c0904274faa8f3f06f35666cc9c5bd2f\tshadow/1/data/test/partition_complex/19700102_2_2_0/default_compression_codec.txt\n" \
"c0904274faa8f3f06f35666cc9c5bd2f\tshadow/1/data/test/partition_complex/19700201_1_1_0/default_compression_codec.txt\n" \
"c4ca4238a0b923820dcc509a6f75849b\tshadow/1/data/test/partition_complex/19700102_2_2_0/count.txt\n" \
"c4ca4238a0b923820dcc509a6f75849b\tshadow/1/data/test/partition_complex/19700201_1_1_0/count.txt\n" \
"cfcb770c3ecd0990dcceb1bde129e6c6\tshadow/1/data/test/partition_complex/19700102_2_2_0/p.bin\n" \
"e2af3bef1fd129aea73a890ede1e7a30\tshadow/1/data/test/partition_complex/19700201_1_1_0/k.bin\n" \
"e8490b8552f8b9b774db2f9eb1c90349\tshadow/1/data/test/partition_complex/19700102_2_2_0/serialization.txt\n" \
"e8490b8552f8b9b774db2f9eb1c90349\tshadow/1/data/test/partition_complex/19700201_1_1_0/serialization.txt\n" \
"f2312862cc01adf34a93151377be2ddf\tshadow/1/data/test/partition_complex/19700201_1_1_0/minmax_p.idx\n"
assert TSV(instance.exec_in_container(cmd).replace(' ', '\t')) == TSV(checksums)

View File

@@ -9,7 +9,7 @@ from pyhdfs import HdfsClient
SHARDS = 2
FILES_OVERHEAD_PER_TABLE = 1 # format_version.txt
FILES_OVERHEAD_PER_PART_COMPACT = 8
FILES_OVERHEAD_PER_PART_COMPACT = 7
def wait_for_hdfs_objects(cluster, fp, expected, num_tries=30):

View File

@@ -29,8 +29,8 @@ def cluster():
FILES_OVERHEAD = 1
FILES_OVERHEAD_PER_COLUMN = 2 # Data and mark files
FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1 + 1
FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1 + 1
FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1
FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1
def random_string(length):

View File

@@ -32,8 +32,8 @@ def cluster():
FILES_OVERHEAD = 1
FILES_OVERHEAD_PER_COLUMN = 2 # Data and mark files
FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1 + 1
FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1 + 1
FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1
FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1
def random_string(length):