ColumnSparse: unit tests and several fixes

This commit is contained in:
Anton Popov 2021-04-13 15:08:24 +03:00
parent 298251e55d
commit fac54e9ccc
4 changed files with 381 additions and 30 deletions

View File

@ -34,7 +34,11 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs
if (offsets->size() + 1 != values->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Values size is inconsistent with offsets size. Expected: {}, got {}", offsets->size() + 1, values->size());
"Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size());
if (_size < offsets->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());
}
MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const
@ -113,7 +117,7 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len
if (length == 0)
return;
if (start + length >= src.size())
if (start + length > src.size())
throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.",
ErrorCodes::LOGICAL_ERROR);
@ -125,13 +129,11 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len
const auto & src_offsets = src_sparse->getOffsetsData();
const auto & src_values = src_sparse->getValuesColumn();
if (!src_offsets.empty())
{
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
size_t offset_end = std::upper_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
if (offset_end != 0)
--offset_end;
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
size_t offset_end = std::upper_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
if (offset_start != offset_end)
{
insertManyDefaults(src_offsets[offset_start] - start);
offsets_data.push_back(_size);
++_size;
@ -144,8 +146,8 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len
++_size;
}
insertManyDefaults(end - src_offsets[offset_end]);
values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start + 1);
insertManyDefaults(end - src_offsets[offset_end - 1] - 1);
values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start);
}
else
{
@ -354,7 +356,7 @@ void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num,
nullptr, nested_result, direction, nan_direction_hint);
const auto & offsets_data = getOffsetsData();
compare_results.resize(_size, nested_result[0]);
compare_results.resize_fill(_size, nested_result[0]);
for (size_t i = 0; i < offsets_data.size(); ++i)
compare_results[offsets_data[i]] = nested_result[i + 1];
}
@ -379,21 +381,23 @@ void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direc
return;
res.resize(_size);
for (size_t i = 0; i < _size; ++i)
res[i] = i;
if (offsets->empty())
{
for (size_t i = 0; i < _size; ++i)
res[i] = i;
return;
Permutation perm;
if (collator)
values->getPermutationWithCollation(*collator, reverse, limit, null_direction_hint, perm);
else
values->getPermutation(reverse, limit, null_direction_hint, perm);
}
if (limit == 0 || limit > _size)
limit = _size;
Permutation perm;
/// limit + 1 for case when there is 0 default values
if (collator)
values->getPermutationWithCollation(*collator, reverse, limit + 1, null_direction_hint, perm);
else
values->getPermutation(reverse, limit + 1, null_direction_hint, perm);
size_t num_of_defaults = getNumberOfDefaults();
size_t row = 0;
size_t current_offset = 0;
@ -407,7 +411,7 @@ void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direc
if (!num_of_defaults)
continue;
while (row < limit && current_default_row < _size)
while (row < limit)
{
while (current_offset < offsets_data.size() && current_default_row == offsets_data[current_offset])
{
@ -415,6 +419,9 @@ void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direc
++current_default_row;
}
if (current_default_row == _size)
break;
res[row++] = current_default_row++;
}
}
@ -550,7 +557,7 @@ void ColumnSparse::getIndicesOfNonDefaultValues(IColumn::Offsets & indices, size
size_t ColumnSparse::getNumberOfDefaultRows(size_t step) const
{
return (_size - offsets->size()) / step;
return getNumberOfDefaults() / step;
}
MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const

View File

@ -35,7 +35,8 @@ public:
return Base::create(values_->assumeMutable(), offsets_->assumeMutable(), size_);
}
static MutablePtr create(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_)
template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
static MutablePtr create(TColumnPtr && values_, TColumnPtr && offsets_, size_t size_)
{
return Base::create(std::move(values_), std::move(offsets_), size_);
}
@ -45,10 +46,10 @@ public:
return Base::create(values_->assumeMutable());
}
template <typename Arg, typename = typename std::enable_if_t<std::is_rvalue_reference_v<Arg &&>>>
static MutablePtr create(Arg && arg)
template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
static MutablePtr create(TColumnPtr && values_)
{
return Base::create(std::forward<Arg>(arg));
return Base::create(std::forward<TColumnPtr>(values_));
}
bool isSparse() const override { return true; }

View File

@ -0,0 +1,341 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsNumber.h>
#include <Common/randomSeed.h>
#include <pcg_random.hpp>
#include <gtest/gtest.h>
#include <algorithm>
#include <numeric>
#include <Common/FieldVisitors.h>
using namespace DB;
/// Random generator shared by every test in this file, seeded with randomSeed().
/// NOTE(review): the seed is presumably different on each run, so a failure may not reproduce
/// deterministically - confirm against randomSeed().
/// Declared `static` (internal linkage) so that the generic name `rng` cannot clash with a
/// same-named global defined in another translation unit linked into the same test binary.
static pcg64 rng(randomSeed());
/// Generates `n` random UInt64 rows and returns them in two representations:
///   first  - a ColumnSparse built from the non-zero candidates and their row positions;
///   second - a plain ColumnVector<UInt64> holding every row.
/// A row is treated as non-default when `rng() % k == 0`; its value is `rng() % 1000000`
/// (which may itself happen to be 0). All other rows are 0.
std::pair<MutableColumnPtr, MutableColumnPtr> createColumns(size_t n, size_t k)
{
    auto sparse_values = ColumnVector<UInt64>::create();
    auto sparse_positions = ColumnVector<UInt64>::create();
    auto dense = ColumnVector<UInt64>::create();

    auto & sparse_values_data = sparse_values->getData();
    auto & sparse_positions_data = sparse_positions->getData();
    auto & dense_data = dense->getData();

    /// The values column starts with a 0 entry, so it always holds one element more than the offsets column.
    sparse_values_data.push_back(0);

    for (size_t row = 0; row < n; ++row)
    {
        if (rng() % k != 0)
        {
            /// Default row: only the full column gets an entry.
            const size_t default_value = 0;
            dense_data.push_back(default_value);
            continue;
        }

        const size_t value = rng() % 1000000;
        dense_data.push_back(value);
        sparse_values_data.push_back(value);
        sparse_positions_data.push_back(row);
    }

    auto sparse = ColumnSparse::create(std::move(sparse_values), std::move(sparse_positions), n);
    return std::make_pair(std::move(sparse), std::move(dense));
}
/// Returns true when both columns have the same number of rows and every row `i` of `lhs`
/// compares equal (compareAt(...) == 0, nan direction hint 0) to row `i` of `rhs`.
bool checkEquals(const IColumn & lhs, const IColumn & rhs)
{
    const size_t rows = lhs.size();
    if (rows != rhs.size())
        return false;

    /// Advance while rows match; stopping before the end means a mismatch was found.
    size_t row = 0;
    while (row < rows && lhs.compareAt(row, row, rhs, 0) == 0)
        ++row;

    return row == rows;
}
/// Number of randomized iterations executed by each test.
constexpr size_t T = 5000;

/// Generated columns have between 1 and MAX_ROWS rows (tests draw `rng() % MAX_ROWS + 1`).
constexpr size_t MAX_ROWS = 10000;

/// Candidate divisors `k` for random selection: something is picked when `rng() % k == 0`,
/// so larger entries select fewer rows.
constexpr size_t sparse_ratios[] = {
    1, 2, 5, 10, 32, 50, 64,
    100, 256, 500, 1000, 5000, 10000,
};

/// Number of entries in sparse_ratios.
constexpr size_t K = sizeof(sparse_ratios) / sizeof(*sparse_ratios);
/// Diagnostic helpers used when a test case fails; both write one line to std::cerr.
/// DUMP_COLUMN prints the argument's source text followed by its dumpStructure().
#define DUMP_COLUMN(col) \
    std::cerr << #col << ": " << (col)->dumpStructure() << "\n"

/// DUMP_NON_DEFAULTS prints size() minus getNumberOfDefaultRows(1) for the argument.
#define DUMP_NON_DEFAULTS(col) \
    std::cerr << "non-default values in " #col ": " \
              << (col)->size() - (col)->getNumberOfDefaultRows(1) << "\n"
/// Randomized check that insertRangeFrom on sparse columns matches the same operation on full columns.
TEST(ColumnSparse, InsertRangeFrom)
{
    /// Appends rows [from, from + len) of a freshly generated source to a freshly generated destination,
    /// once in sparse form and once in full form, and throws if the results differ.
    auto run_case = [&](size_t dst_rows, size_t dst_ratio, size_t src_rows, size_t src_ratio, size_t from, size_t len)
    {
        auto [sparse_dst, full_dst] = createColumns(dst_rows, dst_ratio);
        auto [sparse_src, full_src] = createColumns(src_rows, src_ratio);

        sparse_dst->insertRangeFrom(*sparse_src, from, len);
        full_dst->insertRangeFrom(*full_src, from, len);

        if (checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
            return;

        DUMP_COLUMN(sparse_src);
        DUMP_COLUMN(full_src);
        DUMP_COLUMN(sparse_dst);
        DUMP_COLUMN(full_dst);
        DUMP_NON_DEFAULTS(full_dst);
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal");
    };

    try
    {
        for (size_t iteration = 0; iteration != T; ++iteration)
        {
            const size_t dst_rows = rng() % MAX_ROWS + 1;
            const size_t dst_ratio = sparse_ratios[rng() % K];
            const size_t src_rows = rng() % MAX_ROWS + 1;
            const size_t src_ratio = sparse_ratios[rng() % K];

            /// Two positions inside the source; the smaller one starts the range,
            /// the distance between them is its length.
            const size_t first = rng() % src_rows;
            const size_t second = rng() % src_rows;
            const size_t from = std::min(first, second);
            const size_t len = std::max(first, second) - from;

            run_case(dst_rows, dst_ratio, src_rows, src_ratio, from, len);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}
/// Randomized check that popBack on a sparse column matches popBack on the equivalent full column.
TEST(ColumnSparse, PopBack)
{
    /// Generates a column of `rows` rows, removes the last `removed` rows from both
    /// representations and throws if they no longer match.
    auto run_case = [&](size_t rows, size_t ratio, size_t removed)
    {
        auto [sparse_dst, full_dst] = createColumns(rows, ratio);

        sparse_dst->popBack(removed);
        full_dst->popBack(removed);

        if (checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
            return;

        DUMP_COLUMN(sparse_dst);
        DUMP_COLUMN(full_dst);
        DUMP_NON_DEFAULTS(full_dst);
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal");
    };

    try
    {
        for (size_t iteration = 0; iteration != T; ++iteration)
        {
            const size_t rows = rng() % MAX_ROWS + 1;
            const size_t ratio = sparse_ratios[rng() % K];
            /// Strictly fewer rows than the column has, so at least one row always remains.
            const size_t removed = rng() % rows;

            run_case(rows, ratio, removed);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}
/// Randomized check that filter on a sparse column matches filter on the equivalent full column.
TEST(ColumnSparse, Filter)
{
    /// Generates a column of `rows` rows and a random mask that keeps a row when `rng() % keep_ratio == 0`,
    /// filters both representations with that mask and throws if the results differ.
    auto run_case = [&](size_t rows, size_t ratio, size_t keep_ratio)
    {
        auto [sparse_src, full_src] = createColumns(rows, ratio);

        PaddedPODArray<UInt8> filt(rows);
        for (auto & keep : filt)
            keep = rng() % keep_ratio == 0;

        auto sparse_dst = sparse_src->filter(filt, -1);
        auto full_dst = full_src->filter(filt, -1);

        if (checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
            return;

        DUMP_COLUMN(sparse_src);
        DUMP_COLUMN(full_src);
        DUMP_COLUMN(sparse_dst);
        DUMP_COLUMN(full_dst);
        DUMP_NON_DEFAULTS(full_dst);
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal");
    };

    try
    {
        for (size_t iteration = 0; iteration != T; ++iteration)
        {
            const size_t rows = rng() % MAX_ROWS + 1;
            const size_t ratio = sparse_ratios[rng() % K];
            const size_t keep_ratio = sparse_ratios[rng() % K];

            run_case(rows, ratio, keep_ratio);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}
/// Randomized check that permute on a sparse column matches permute on the equivalent full column.
TEST(ColumnSparse, Permute)
{
    /// Applies one random permutation of all row indices to both representations.
    /// With a non-zero `limit` both results are additionally cut to their first `limit` rows before comparing.
    auto run_case = [&](size_t rows, size_t ratio, size_t limit)
    {
        auto [sparse_src, full_src] = createColumns(rows, ratio);

        /// Identity permutation 0..rows-1, then shuffled with the shared generator.
        PaddedPODArray<UInt64> perm(rows);
        std::iota(perm.begin(), perm.end(), 0);
        std::shuffle(perm.begin(), perm.end(), rng);

        auto sparse_dst = sparse_src->permute(perm, limit);
        auto full_dst = full_src->permute(perm, limit);

        if (limit != 0)
        {
            sparse_dst = sparse_dst->cut(0, limit);
            full_dst = full_dst->cut(0, limit);
        }

        if (checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
            return;

        DUMP_COLUMN(sparse_src);
        DUMP_COLUMN(full_src);
        DUMP_COLUMN(sparse_dst);
        DUMP_COLUMN(full_dst);
        DUMP_NON_DEFAULTS(full_dst);
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal");
    };

    try
    {
        for (size_t iteration = 0; iteration != T; ++iteration)
        {
            const size_t rows = rng() % MAX_ROWS + 1;
            const size_t ratio = sparse_ratios[rng() % K];

            /// Roughly half of the cases run with limit 0, the rest with a random limit below `rows`.
            size_t limit = 0;
            if (rng() % 2 == 0)
                limit = rng() % rows;

            run_case(rows, ratio, limit);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}
/// Randomized check that compareColumn on sparse columns yields the same per-row results as on full columns.
TEST(ColumnSparse, CompareColumn)
{
    /// Compares every row of the first column against row `rhs_row` of the second column,
    /// in sparse and in full form, and throws if the two result arrays differ.
    auto run_case = [&](size_t lhs_rows, size_t lhs_ratio, size_t rhs_rows, size_t rhs_ratio, size_t rhs_row)
    {
        auto [sparse_src1, full_src1] = createColumns(lhs_rows, lhs_ratio);
        auto [sparse_src2, full_src2] = createColumns(rhs_rows, rhs_ratio);

        PaddedPODArray<Int8> comp_sparse;
        PaddedPODArray<Int8> comp_full;

        sparse_src1->compareColumn(*sparse_src2, rhs_row, nullptr, comp_sparse, 1, 1);
        full_src1->compareColumn(*full_src2, rhs_row, nullptr, comp_full, 1, 1);

        const bool mismatch = comp_sparse != comp_full;
        if (mismatch)
        {
            DUMP_COLUMN(sparse_src1);
            DUMP_COLUMN(full_src1);
            DUMP_COLUMN(sparse_src2);
            DUMP_COLUMN(full_src2);
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Compare results are unequal");
        }
    };

    try
    {
        for (size_t iteration = 0; iteration != T; ++iteration)
        {
            const size_t lhs_rows = rng() % MAX_ROWS + 1;
            const size_t lhs_ratio = sparse_ratios[rng() % K];
            const size_t rhs_rows = rng() % MAX_ROWS + 1;
            const size_t rhs_ratio = sparse_ratios[rng() % K];
            /// Any valid row index of the second column.
            const size_t rhs_row = rng() % rhs_rows;

            run_case(lhs_rows, lhs_ratio, rhs_rows, rhs_ratio, rhs_row);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}
/// Randomized check that sorting through getPermutation gives the same ordered values
/// for a sparse column and for the equivalent full column.
TEST(ColumnSparse, GetPermutation)
{
    /// Obtains a sorting permutation from each representation, applies it to that representation
    /// and compares the sorted columns (cut to `limit` rows when `limit` is non-zero).
    /// The permutations themselves are not compared, only the resulting values.
    auto run_case = [&](size_t rows, size_t ratio, size_t limit, bool reverse)
    {
        auto [sparse_src, full_src] = createColumns(rows, ratio);

        PaddedPODArray<UInt64> perm_sparse;
        PaddedPODArray<UInt64> perm_full;

        sparse_src->getPermutation(reverse, limit, 1, perm_sparse);
        full_src->getPermutation(reverse, limit, 1, perm_full);

        auto sparse_sorted = sparse_src->permute(perm_sparse, limit);
        auto full_sorted = full_src->permute(perm_full, limit);

        if (limit != 0)
        {
            sparse_sorted = sparse_sorted->cut(0, limit);
            full_sorted = full_sorted->cut(0, limit);
        }

        if (checkEquals(*sparse_sorted->convertToFullColumnIfSparse(), *full_sorted))
            return;

        /// Writes a label followed by every value of the column, space separated, on one line.
        auto print_values = [](const char * label, const auto & column)
        {
            std::cerr << label;
            for (size_t row = 0; row < column->size(); ++row)
                std::cerr << applyVisitor(FieldVisitorToString(), (*column)[row]) << " ";
            std::cerr << "\n";
        };

        DUMP_COLUMN(sparse_src);
        DUMP_COLUMN(full_src);
        DUMP_COLUMN(sparse_sorted);
        DUMP_COLUMN(full_sorted);
        DUMP_NON_DEFAULTS(full_sorted);

        print_values("sparse: ", sparse_sorted);
        print_values("full: ", full_sorted);

        throw Exception(ErrorCodes::LOGICAL_ERROR, "Sorted columns are unequal");
    };

    try
    {
        for (size_t iteration = 0; iteration != T; ++iteration)
        {
            const size_t rows = rng() % MAX_ROWS + 1;
            const size_t ratio = sparse_ratios[rng() % K];

            /// Roughly half of the cases run with limit 0, the rest with a random limit below `rows`.
            size_t limit = 0;
            if (rng() % 2 == 0)
                limit = rng() % rows;

            const bool reverse = rng() % 2;

            run_case(rows, ratio, limit, reverse);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}
#undef DUMP_COLUMN
#undef DUMP_NON_DEFAULTS

View File

@ -7,6 +7,8 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/FieldVisitors.h>
namespace DB
{
@ -38,7 +40,7 @@ void serializeOffsets(const IColumn::Offsets & offsets, WriteBuffer & ostr, size
}
size_t deserializeOffsets(IColumn::Offsets & offsets,
ReadBuffer & istr, size_t limit, DeserializeStateSparse & state)
ReadBuffer & istr, size_t start, size_t limit, DeserializeStateSparse & state)
{
if (limit && state.num_trailing_defaults >= limit)
{
@ -52,7 +54,7 @@ size_t deserializeOffsets(IColumn::Offsets & offsets,
size_t total_rows = state.num_trailing_defaults;
if (state.has_value_after_defaults)
{
size_t start_of_group = offsets.empty() ? 0 : offsets.back() + 1;
size_t start_of_group = offsets.empty() ? start : offsets.back() + 1;
offsets.push_back(start_of_group + state.num_trailing_defaults);
state.has_value_after_defaults = false;
@ -85,7 +87,7 @@ size_t deserializeOffsets(IColumn::Offsets & offsets,
}
else
{
size_t start_of_group = offsets.empty() ? 0 : offsets.back() + 1;
size_t start_of_group = offsets.empty() ? start : offsets.back() + 1;
offsets.push_back(start_of_group + group_size);
state.num_trailing_defaults = 0;
@ -204,7 +206,7 @@ void SerializationSparse::deserializeBinaryBulkWithMultipleStreams(
size_t read_rows = 0;
settings.path.push_back(Substream::SparseOffsets);
if (auto * stream = settings.getter(settings.path))
read_rows = deserializeOffsets(offsets_data, *stream, limit, *state_sparse);
read_rows = deserializeOffsets(offsets_data, *stream, column_sparse.size(), limit, *state_sparse);
auto & values_column = column_sparse.getValuesPtr();
size_t values_limit = offsets_data.size() - old_size;