Fix assert error in quantileDeterministic #15683

This commit is contained in:
Alexey Milovidov 2020-10-27 00:07:24 +03:00
parent cc5f15da29
commit 73f3aac663

View File

@ -39,8 +39,8 @@ namespace ErrorCodes
namespace detail namespace detail
{ {
const size_t DEFAULT_SAMPLE_COUNT = 8192; const size_t DEFAULT_MAX_SAMPLE_SIZE = 8192;
const auto MAX_SKIP_DEGREE = sizeof(UInt32) * 8; const auto MAX_SKIP_DEGREE = sizeof(UInt32) * 8;
} }
/// What if there is not a single value - throw an exception, or return 0 or NaN in the case of double? /// What if there is not a single value - throw an exception, or return 0 or NaN in the case of double?
@ -50,6 +50,7 @@ enum class ReservoirSamplerDeterministicOnEmpty
RETURN_NAN_OR_ZERO, RETURN_NAN_OR_ZERO,
}; };
template <typename T, template <typename T,
ReservoirSamplerDeterministicOnEmpty OnEmpty = ReservoirSamplerDeterministicOnEmpty::THROW> ReservoirSamplerDeterministicOnEmpty OnEmpty = ReservoirSamplerDeterministicOnEmpty::THROW>
class ReservoirSamplerDeterministic class ReservoirSamplerDeterministic
@ -60,8 +61,8 @@ class ReservoirSamplerDeterministic
} }
public: public:
ReservoirSamplerDeterministic(const size_t sample_count_ = DEFAULT_SAMPLE_COUNT) ReservoirSamplerDeterministic(const size_t max_sample_size_ = detail::DEFAULT_MAX_SAMPLE_SIZE)
: sample_count{sample_count_} : max_sample_size{max_sample_size_}
{ {
} }
@ -131,8 +132,8 @@ public:
void merge(const ReservoirSamplerDeterministic & b) void merge(const ReservoirSamplerDeterministic & b)
{ {
if (sample_count != b.sample_count) if (max_sample_size != b.max_sample_size)
throw Poco::Exception("Cannot merge ReservoirSamplerDeterministic's with different sample_count"); throw Poco::Exception("Cannot merge ReservoirSamplerDeterministic's with different max sample count");
sorted = false; sorted = false;
if (b.skip_degree > skip_degree) if (b.skip_degree > skip_degree)
@ -150,11 +151,16 @@ public:
void read(DB::ReadBuffer & buf) void read(DB::ReadBuffer & buf)
{ {
DB::readIntBinary<size_t>(sample_count, buf); size_t size = 0;
DB::readIntBinary<size_t>(size, buf);
DB::readIntBinary<size_t>(total_values, buf); DB::readIntBinary<size_t>(total_values, buf);
samples.resize(std::min(total_values, sample_count));
for (size_t i = 0; i < samples.size(); ++i) /// Compatibility with old versions.
if (size > total_values)
size = total_values;
samples.resize(size);
for (size_t i = 0; i < size; ++i)
DB::readPODBinary(samples[i], buf); DB::readPODBinary(samples[i], buf);
sorted = false; sorted = false;
@ -162,10 +168,11 @@ public:
void write(DB::WriteBuffer & buf) const void write(DB::WriteBuffer & buf) const
{ {
DB::writeIntBinary<size_t>(sample_count, buf); size_t size = samples.size();
DB::writeIntBinary<size_t>(size, buf);
DB::writeIntBinary<size_t>(total_values, buf); DB::writeIntBinary<size_t>(total_values, buf);
for (size_t i = 0; i < std::min(sample_count, total_values); ++i) for (size_t i = 0; i < size; ++i)
DB::writePODBinary(samples[i], buf); DB::writePODBinary(samples[i], buf);
} }
@ -174,18 +181,19 @@ private:
using Element = std::pair<T, UInt32>; using Element = std::pair<T, UInt32>;
using Array = DB::PODArray<Element, 64>; using Array = DB::PODArray<Element, 64>;
size_t sample_count; const size_t max_sample_size; /// Maximum amount of stored values.
size_t total_values{}; size_t total_values = 0; /// How many values were inserted (regardless if they remain in sample or not).
bool sorted{}; bool sorted = false;
Array samples; Array samples;
UInt8 skip_degree{}; UInt8 skip_degree = 0; /// The number N determining that we save only one per 2^N elements in average.
void insertImpl(const T & v, const UInt32 hash) void insertImpl(const T & v, const UInt32 hash)
{ {
/// @todo why + 1? I don't quite recall /// Make a room for plus one element.
while (samples.size() + 1 >= sample_count) while (samples.size() >= max_sample_size)
{ {
if (++skip_degree > detail::MAX_SKIP_DEGREE) ++skip_degree;
if (skip_degree > detail::MAX_SKIP_DEGREE)
throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED}; throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED};
thinOut(); thinOut();
} }
@ -195,35 +203,17 @@ private:
void thinOut() void thinOut()
{ {
auto size = samples.size(); samples.resize(std::distance(samples.begin(),
for (size_t i = 0; i < size;) std::remove_if(samples.begin(), samples.end(), [this](const auto & elem){ return !good(elem.second); })));
{ sorted = false;
if (!good(samples[i].second))
{
/// swap current element with the last one
std::swap(samples[size - 1], samples[i]);
--size;
}
else
++i;
}
if (size != samples.size())
{
samples.resize(size);
sorted = false;
}
} }
void sortIfNeeded() void sortIfNeeded()
{ {
if (sorted) if (sorted)
return; return;
std::sort(samples.begin(), samples.end(), [](const auto & lhs, const auto & rhs) { return lhs.first < rhs.first; });
sorted = true; sorted = true;
std::sort(samples.begin(), samples.end(), [] (const std::pair<T, UInt32> & lhs, const std::pair<T, UInt32> & rhs)
{
return lhs.first < rhs.first;
});
} }
template <typename ResultType> template <typename ResultType>