diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 7c7befd52e9..29abd09ac6d 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -34,7 +34,11 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs if (offsets->size() + 1 != values->size()) throw Exception(ErrorCodes::LOGICAL_ERROR, - "Values size is inconsistent with offsets size. Expected: {}, got {}", offsets->size() + 1, values->size()); + "Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size()); + + if (_size < offsets->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size()); } MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const @@ -113,7 +117,7 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len if (length == 0) return; - if (start + length >= src.size()) + if (start + length > src.size()) throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.", ErrorCodes::LOGICAL_ERROR); @@ -125,13 +129,11 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len const auto & src_offsets = src_sparse->getOffsetsData(); const auto & src_values = src_sparse->getValuesColumn(); - if (!src_offsets.empty()) - { - size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin(); - size_t offset_end = std::upper_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin(); - if (offset_end != 0) - --offset_end; + size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin(); + size_t offset_end = std::upper_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin(); + if (offset_start != offset_end) + { insertManyDefaults(src_offsets[offset_start] - start); offsets_data.push_back(_size); ++_size; @@ -144,8 +146,8 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len ++_size; } - insertManyDefaults(end - src_offsets[offset_end]); - values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start + 1); + insertManyDefaults(end - src_offsets[offset_end - 1] - 1); + values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start); } else { @@ -354,7 +356,7 @@ void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num, nullptr, nested_result, direction, nan_direction_hint); const auto & offsets_data = getOffsetsData(); - compare_results.resize(_size, nested_result[0]); + compare_results.resize_fill(_size, nested_result[0]); for (size_t i = 0; i < offsets_data.size(); ++i) compare_results[offsets_data[i]] = nested_result[i + 1]; } @@ -379,21 +381,23 @@ void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direc return; res.resize(_size); - for (size_t i = 0; i < _size; ++i) - res[i] = i; - if (offsets->empty()) + { + for (size_t i = 0; i < _size; ++i) + res[i] = i; return; - - Permutation perm; - if (collator) - values->getPermutationWithCollation(*collator, reverse, limit, null_direction_hint, perm); - else - values->getPermutation(reverse, limit, null_direction_hint, perm); + } if (limit == 0 || limit > _size) limit = _size; + Permutation perm; + /// limit + 1 for case when there is 0 default values + if (collator) + values->getPermutationWithCollation(*collator, reverse, limit + 1, null_direction_hint, perm); + else + values->getPermutation(reverse, limit + 1, null_direction_hint, perm); + size_t num_of_defaults = getNumberOfDefaults(); size_t row = 0; size_t current_offset = 0; @@ -407,7 +411,7 @@ void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direc if (!num_of_defaults) continue; - while (row < limit && current_default_row < _size) + while (row < limit) { while (current_offset < offsets_data.size() && current_default_row == offsets_data[current_offset]) { @@ -415,6 +419,9 @@ void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direc ++current_default_row; } + if (current_default_row == _size) + break; + res[row++] = current_default_row++; } } @@ -550,7 +557,7 @@ void ColumnSparse::getIndicesOfNonDefaultValues(IColumn::Offsets & indices, size size_t ColumnSparse::getNumberOfDefaultRows(size_t step) const { - return (_size - offsets->size()) / step; + return getNumberOfDefaults() / step; } MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const diff --git a/src/Columns/ColumnSparse.h b/src/Columns/ColumnSparse.h index f0beb4cdefb..87123ff0057 100644 --- a/src/Columns/ColumnSparse.h +++ b/src/Columns/ColumnSparse.h @@ -35,7 +35,8 @@ public: return Base::create(values_->assumeMutable(), offsets_->assumeMutable(), size_); } - static MutablePtr create(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_) + template ::value>::type> + static MutablePtr create(TColumnPtr && values_, TColumnPtr && offsets_, size_t size_) { return Base::create(std::move(values_), std::move(offsets_), size_); } @@ -45,10 +46,10 @@ public: return Base::create(values_->assumeMutable()); } - template >> - static MutablePtr create(Arg && arg) + template ::value>::type> + static MutablePtr create(TColumnPtr && values_) { - return Base::create(std::forward(arg)); + return Base::create(std::forward(values_)); } bool isSparse() const override { return true; } diff --git a/src/Columns/tests/gtest_column_sparse.cpp b/src/Columns/tests/gtest_column_sparse.cpp new file mode 100644 index 00000000000..b0b461894d0 --- /dev/null +++ b/src/Columns/tests/gtest_column_sparse.cpp @@ -0,0 +1,341 @@ +#include +#include + +#include +#include +#include + +#include +#include + +#include + +using namespace DB; +pcg64 rng(randomSeed()); + +std::pair createColumns(size_t n, size_t k) +{ + auto values = ColumnVector::create(); + auto offsets = ColumnVector::create(); + auto full = ColumnVector::create(); + + auto & values_data = values->getData(); + auto & offsets_data = offsets->getData(); + auto & full_data = full->getData(); + + values_data.push_back(0); + + for (size_t i = 0; i < n; ++i) + { + bool not_zero = rng() % k == 0; + size_t value = not_zero ? rng() % 1000000 : 0; + full_data.push_back(value); + + if (not_zero) + { + values_data.push_back(value); + offsets_data.push_back(i); + } + } + + auto sparse = ColumnSparse::create(std::move(values), std::move(offsets), n); + return std::make_pair(std::move(sparse), std::move(full)); +} + +bool checkEquals(const IColumn & lhs, const IColumn & rhs) +{ + if (lhs.size() != rhs.size()) + return false; + + for (size_t i = 0; i < lhs.size(); ++i) + if (lhs.compareAt(i, i, rhs, 0) != 0) + return false; + + return true; +} + +constexpr size_t T = 5000; +constexpr size_t MAX_ROWS = 10000; +constexpr size_t sparse_ratios[] = {1, 2, 5, 10, 32, 50, 64, 100, 256, 500, 1000, 5000, 10000}; +constexpr size_t K = sizeof(sparse_ratios) / sizeof(sparse_ratios[0]); + +#define DUMP_COLUMN(column) std::cerr << #column << ": " << (column)->dumpStructure() << "\n" +#define DUMP_NON_DEFAULTS(column) std::cerr << "non-default values in " #column ": " << (column)->size() - (column)->getNumberOfDefaultRows(1) << "\n" + +TEST(ColumnSparse, InsertRangeFrom) +{ + auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t from, size_t len) + { + auto [sparse_dst, full_dst] = createColumns(n1, k1); + auto [sparse_src, full_src] = createColumns(n2, k2); + + sparse_dst->insertRangeFrom(*sparse_src, from, len); + full_dst->insertRangeFrom(*full_src, from, len); + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + DUMP_NON_DEFAULTS(full_dst); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n1 = rng() % MAX_ROWS + 1; + size_t k1 = sparse_ratios[rng() % K]; + + size_t n2 = rng() % MAX_ROWS + 1; + size_t k2 = sparse_ratios[rng() % K]; + + size_t from = rng() % n2; + size_t to = rng() % n2; + + if (from > to) + std::swap(from, to); + + test_case(n1, k1, n2, k2, from, to - from); + } + } + catch(const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, PopBack) +{ + auto test_case = [&](size_t n, size_t k, size_t m) + { + auto [sparse_dst, full_dst] = createColumns(n, k); + + sparse_dst->popBack(m); + full_dst->popBack(m); + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + DUMP_NON_DEFAULTS(full_dst); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + size_t m = rng() % n; + + test_case(n, k, m); + } + } + catch(const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, Filter) +{ + auto test_case = [&](size_t n, size_t k, size_t m) + { + auto [sparse_src, full_src] = createColumns(n, k); + + PaddedPODArray filt(n); + for (size_t i = 0; i < n; ++i) + filt[i] = rng() % m == 0; + + auto sparse_dst = sparse_src->filter(filt, -1); + auto full_dst = full_src->filter(filt, -1); + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + DUMP_NON_DEFAULTS(full_dst); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + size_t m = sparse_ratios[rng() % K]; + + test_case(n, k, m); + } + } + catch(const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, Permute) +{ + auto test_case = [&](size_t n, size_t k, size_t limit) + { + auto [sparse_src, full_src] = createColumns(n, k); + + PaddedPODArray perm(n); + std::iota(perm.begin(), perm.end(), 0); + std::shuffle(perm.begin(), perm.end(), rng); + + auto sparse_dst = sparse_src->permute(perm, limit); + auto full_dst = full_src->permute(perm, limit); + + if (limit) + { + sparse_dst = sparse_dst->cut(0, limit); + full_dst = full_dst->cut(0, limit); + } + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + DUMP_NON_DEFAULTS(full_dst); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + size_t limit = rng() % 2 ? 0 : rng() % n; + + test_case(n, k, limit); + } + } + catch(const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, CompareColumn) +{ + auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t row_num) + { + auto [sparse_src1, full_src1] = createColumns(n1, k1); + auto [sparse_src2, full_src2] = createColumns(n2, k2); + + PaddedPODArray comp_sparse; + PaddedPODArray comp_full; + + sparse_src1->compareColumn(*sparse_src2, row_num, nullptr, comp_sparse, 1, 1); + full_src1->compareColumn(*full_src2, row_num, nullptr, comp_full, 1, 1); + + if (comp_sparse != comp_full) + { + DUMP_COLUMN(sparse_src1); + DUMP_COLUMN(full_src1); + DUMP_COLUMN(sparse_src2); + DUMP_COLUMN(full_src2); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Compare results are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n1 = rng() % MAX_ROWS + 1; + size_t k1 = sparse_ratios[rng() % K]; + + size_t n2 = rng() % MAX_ROWS + 1; + size_t k2 = sparse_ratios[rng() % K]; + + size_t row_num = rng() % n2; + + test_case(n1, k1, n2, k2, row_num); + } + } + catch(const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, GetPermutation) +{ + auto test_case = [&](size_t n, size_t k, size_t limit, bool reverse) + { + auto [sparse_src, full_src] = createColumns(n, k); + + PaddedPODArray perm_sparse; + PaddedPODArray perm_full; + + sparse_src->getPermutation(reverse, limit, 1, perm_sparse); + full_src->getPermutation(reverse, limit, 1, perm_full); + + auto sparse_sorted = sparse_src->permute(perm_sparse, limit); + auto full_sorted = full_src->permute(perm_full, limit); + + if (limit) + { + sparse_sorted = sparse_sorted->cut(0, limit); + full_sorted = full_sorted->cut(0, limit); + } + + if (!checkEquals(*sparse_sorted->convertToFullColumnIfSparse(), *full_sorted)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_sorted); + DUMP_COLUMN(full_sorted); + DUMP_NON_DEFAULTS(full_sorted); + + std::cerr << "sparse: "; + for (size_t i = 0; i < sparse_sorted->size(); ++i) + std::cerr << applyVisitor(FieldVisitorToString(), (*sparse_sorted)[i]) << " "; + std::cerr << "\n"; + + std::cerr << "full: "; + for (size_t i = 0; i < full_sorted->size(); ++i) + std::cerr << applyVisitor(FieldVisitorToString(), (*full_sorted)[i]) << " "; + std::cerr << "\n"; + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Sorted columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + + size_t limit = rng() % 2 ? 0 : rng() % n; + bool reverse = rng() % 2; + + test_case(n, k, limit, reverse); + } + } + catch(const Exception & e) + { + FAIL() << e.displayText(); + } +} + +#undef DUMP_COLUMN +#undef DUMP_NON_DEFAULTS diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp index af4688de817..eb4bcb0f1c2 100644 --- a/src/DataTypes/Serializations/SerializationSparse.cpp +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace DB { @@ -38,7 +40,7 @@ void serializeOffsets(const IColumn::Offsets & offsets, WriteBuffer & ostr, size } size_t deserializeOffsets(IColumn::Offsets & offsets, - ReadBuffer & istr, size_t limit, DeserializeStateSparse & state) + ReadBuffer & istr, size_t start, size_t limit, DeserializeStateSparse & state) { if (limit && state.num_trailing_defaults >= limit) { @@ -52,7 +54,7 @@ size_t deserializeOffsets(IColumn::Offsets & offsets, size_t total_rows = state.num_trailing_defaults; if (state.has_value_after_defaults) { - size_t start_of_group = offsets.empty() ? 0 : offsets.back() + 1; + size_t start_of_group = offsets.empty() ? start : offsets.back() + 1; offsets.push_back(start_of_group + state.num_trailing_defaults); state.has_value_after_defaults = false; @@ -85,7 +87,7 @@ size_t deserializeOffsets(IColumn::Offsets & offsets, } else { - size_t start_of_group = offsets.empty() ? 0 : offsets.back() + 1; + size_t start_of_group = offsets.empty() ? start : offsets.back() + 1; offsets.push_back(start_of_group + group_size); state.num_trailing_defaults = 0; @@ -204,7 +206,7 @@ void SerializationSparse::deserializeBinaryBulkWithMultipleStreams( size_t read_rows = 0; settings.path.push_back(Substream::SparseOffsets); if (auto * stream = settings.getter(settings.path)) - read_rows = deserializeOffsets(offsets_data, *stream, limit, *state_sparse); + read_rows = deserializeOffsets(offsets_data, *stream, column_sparse.size(), limit, *state_sparse); auto & values_column = column_sparse.getValuesPtr(); size_t values_limit = offsets_data.size() - old_size;