mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 19:12:03 +00:00
801 lines
24 KiB
C++
801 lines
24 KiB
C++
#include <Columns/ColumnSparse.h>
|
|
#include <Columns/ColumnsCommon.h>
|
|
#include <Columns/ColumnCompressed.h>
|
|
#include <Columns/ColumnTuple.h>
|
|
#include <Common/WeakHash.h>
|
|
#include <Common/SipHash.h>
|
|
#include <Common/HashTable/Hash.h>
|
|
#include <Processors/Transforms/ColumnGathererTransform.h>
|
|
|
|
#include <algorithm>
|
|
#include <bit>
|
|
|
|
namespace DB
|
|
{
|
|
|
|
/// Error codes referenced by the exceptions thrown in this file.
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
|
|
|
|
/// Creates an empty sparse column on top of an empty nested column.
/// Throws LOGICAL_ERROR if the nested column already holds values,
/// because no offsets are supplied to describe them.
ColumnSparse::ColumnSparse(MutableColumnPtr && values_)
    : values(std::move(values_))
    , _size(0)
{
    const bool nested_has_values = !values->empty();
    if (nested_has_values)
        throw Exception("Not empty values passed to ColumnSparse, but no offsets passed", ErrorCodes::LOGICAL_ERROR);

    /// The first element of the nested column is the default value.
    values->insertDefault();
    offsets = ColumnUInt64::create();
}
|
|
|
|
/// Creates a sparse column from already prepared parts and validates them.
///  values_  - nested column: the default value followed by the non-default values;
///  offsets_ - positions of the non-default values, must be a ColumnUInt64;
///  size_    - total number of rows of the sparse column.
/// Throws LOGICAL_ERROR when the parts are inconsistent with each other.
ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_)
    : values(std::move(values_))
    , offsets(std::move(offsets_))
    , _size(size_)
{
    const auto * typed_offsets = typeid_cast<const ColumnUInt64 *>(offsets.get());
    if (typed_offsets == nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName());

    /// The nested column holds one extra element: the default value at position 0.
    if (values->size() != offsets->size() + 1)
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size());

    if (offsets->size() > _size)
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());

    /// The last non-default position must lie inside the column.
    if (!typed_offsets->empty())
    {
        if (_size <= typed_offsets->getData().back())
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Size of sparse column ({}) should be greater than last position of non-default value ({})",
                _size, typed_offsets->getData().back());
    }

#ifndef NDEBUG
    /// Debug-only check: each offset must be strictly greater than the previous one.
    const auto & positions = getOffsetsData();
    const auto * unsorted_pair = std::adjacent_find(positions.begin(), positions.end(), std::greater_equal<>());
    const bool strictly_sorted = unsorted_pair == positions.end();
    if (!strictly_sorted)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Offsets of ColumnSparse must be strictly sorted");
#endif
}
|
|
|
|
/// Returns a copy of this column resized to 'new_size' rows.
/// Growing only changes the row count of the copy; shrinking copies the first 'new_size' rows.
MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const
{
    const bool keeps_all_rows = new_size != 0 && new_size >= _size;
    if (keeps_all_rows)
        return ColumnSparse::create(IColumn::mutate(values), IColumn::mutate(offsets), new_size);

    auto resized = ColumnSparse::create(values->cloneEmpty());
    if (new_size != 0)
        resized->insertRangeFrom(*this, 0, new_size);

    return resized;
}
|
|
|
|
bool ColumnSparse::isDefaultAt(size_t n) const
|
|
{
|
|
return getValueIndex(n) == 0;
|
|
}
|
|
|
|
bool ColumnSparse::isNullAt(size_t n) const
|
|
{
|
|
return values->isNullAt(getValueIndex(n));
|
|
}
|
|
|
|
/// Returns the value of row 'n' as a Field, read from the nested column.
Field ColumnSparse::operator[](size_t n) const
{
    const auto & nested = *values;
    return nested[getValueIndex(n)];
}
|
|
|
|
void ColumnSparse::get(size_t n, Field & res) const
|
|
{
|
|
values->get(getValueIndex(n), res);
|
|
}
|
|
|
|
bool ColumnSparse::getBool(size_t n) const
|
|
{
|
|
return values->getBool(getValueIndex(n));
|
|
}
|
|
|
|
/// Forwards getFloat64 for row 'n' to the nested column.
Float64 ColumnSparse::getFloat64(size_t n) const
{
    const size_t value_index = getValueIndex(n);
    return values->getFloat64(value_index);
}
|
|
|
|
/// Forwards getFloat32 for row 'n' to the nested column.
Float32 ColumnSparse::getFloat32(size_t n) const
{
    const size_t value_index = getValueIndex(n);
    return values->getFloat32(value_index);
}
|
|
|
|
/// Forwards getUInt for row 'n' to the nested column.
UInt64 ColumnSparse::getUInt(size_t n) const
{
    const size_t value_index = getValueIndex(n);
    return values->getUInt(value_index);
}
|
|
|
|
/// Forwards getInt for row 'n' to the nested column.
Int64 ColumnSparse::getInt(size_t n) const
{
    const size_t value_index = getValueIndex(n);
    return values->getInt(value_index);
}
|
|
|
|
/// Forwards get64 for row 'n' to the nested column.
UInt64 ColumnSparse::get64(size_t n) const
{
    const size_t value_index = getValueIndex(n);
    return values->get64(value_index);
}
|
|
|
|
/// Forwards getDataAt for row 'n' to the nested column.
StringRef ColumnSparse::getDataAt(size_t n) const
{
    const size_t value_index = getValueIndex(n);
    return values->getDataAt(value_index);
}
|
|
|
|
/// Builds a full column of '_size' rows from the nested values and their offsets.
/// Rows that have no offset are filled with the default value (element 0 of the nested column).
ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
{
    const auto & non_default_positions = getOffsetsData();
    const Field default_value = (*values)[0];

    /// shift = 1: the nested column has the default value in front of the non-default ones.
    return values->createWithOffsets(non_default_positions, default_value, _size, /*shift=*/ 1);
}
|
|
|
|
void ColumnSparse::insertSingleValue(const Inserter & inserter)
|
|
{
|
|
inserter(*values);
|
|
|
|
size_t last_idx = values->size() - 1;
|
|
if (values->isDefaultAt(last_idx))
|
|
values->popBack(1);
|
|
else
|
|
getOffsetsData().push_back(_size);
|
|
|
|
++_size;
|
|
}
|
|
|
|
void ColumnSparse::insertData(const char * pos, size_t length)
|
|
{
|
|
insertSingleValue([&](IColumn & column) { column.insertData(pos, length); });
|
|
}
|
|
|
|
/// Serializes the value of row 'n' into 'arena' by delegating to the nested column.
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
    const size_t value_index = getValueIndex(n);
    return values->serializeValueIntoArena(value_index, arena, begin);
}
|
|
|
|
const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos)
|
|
{
|
|
const char * res = nullptr;
|
|
insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); });
|
|
return res;
|
|
}
|
|
|
|
const char * ColumnSparse::skipSerializedInArena(const char * pos) const
|
|
{
|
|
return values->skipSerializedInArena(pos);
|
|
}
|
|
|
|
void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t length)
|
|
{
|
|
if (length == 0)
|
|
return;
|
|
|
|
if (start + length > src.size())
|
|
throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.",
|
|
ErrorCodes::LOGICAL_ERROR);
|
|
|
|
auto & offsets_data = getOffsetsData();
|
|
|
|
size_t end = start + length;
|
|
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
|
|
{
|
|
const auto & src_offsets = src_sparse->getOffsetsData();
|
|
const auto & src_values = src_sparse->getValuesColumn();
|
|
|
|
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
|
|
size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
|
|
assert(offset_start <= offset_end);
|
|
|
|
if (offset_start != offset_end)
|
|
{
|
|
offsets_data.reserve(offsets_data.size() + offset_end - offset_start);
|
|
insertManyDefaults(src_offsets[offset_start] - start);
|
|
offsets_data.push_back(_size);
|
|
++_size;
|
|
|
|
for (size_t i = offset_start + 1; i < offset_end; ++i)
|
|
{
|
|
size_t current_diff = src_offsets[i] - src_offsets[i - 1];
|
|
insertManyDefaults(current_diff - 1);
|
|
offsets_data.push_back(_size);
|
|
++_size;
|
|
}
|
|
|
|
/// 'end' <= 'src_offsets[offsets_end]', but end is excluded, so index is 'offsets_end' - 1.
|
|
/// Since 'end' is excluded, need to subtract one more row from result.
|
|
insertManyDefaults(end - src_offsets[offset_end - 1] - 1);
|
|
values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start);
|
|
}
|
|
else
|
|
{
|
|
insertManyDefaults(length);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (size_t i = start; i < end; ++i)
|
|
{
|
|
if (!src.isDefaultAt(i))
|
|
{
|
|
values->insertFrom(src, i);
|
|
offsets_data.push_back(_size);
|
|
}
|
|
|
|
++_size;
|
|
}
|
|
}
|
|
}
|
|
|
|
void ColumnSparse::insert(const Field & x)
|
|
{
|
|
insertSingleValue([&](IColumn & column) { column.insert(x); });
|
|
}
|
|
|
|
void ColumnSparse::insertFrom(const IColumn & src, size_t n)
|
|
{
|
|
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
|
|
{
|
|
if (size_t value_index = src_sparse->getValueIndex(n))
|
|
{
|
|
getOffsetsData().push_back(_size);
|
|
values->insertFrom(src_sparse->getValuesColumn(), value_index);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (!src.isDefaultAt(n))
|
|
{
|
|
values->insertFrom(src, n);
|
|
getOffsetsData().push_back(_size);
|
|
}
|
|
}
|
|
|
|
++_size;
|
|
}
|
|
|
|
void ColumnSparse::insertDefault()
|
|
{
|
|
++_size;
|
|
}
|
|
|
|
void ColumnSparse::insertManyDefaults(size_t length)
|
|
{
|
|
_size += length;
|
|
}
|
|
|
|
void ColumnSparse::popBack(size_t n)
|
|
{
|
|
assert(n < _size);
|
|
|
|
auto & offsets_data = getOffsetsData();
|
|
size_t new_size = _size - n;
|
|
|
|
size_t removed_values = 0;
|
|
while (!offsets_data.empty() && offsets_data.back() >= new_size)
|
|
{
|
|
offsets_data.pop_back();
|
|
++removed_values;
|
|
}
|
|
|
|
if (removed_values)
|
|
values->popBack(removed_values);
|
|
|
|
_size = new_size;
|
|
}
|
|
|
|
/// Returns a new sparse column with the rows for which 'filt' is non-zero.
/// Throws SIZES_OF_COLUMNS_DOESNT_MATCH if the filter size differs from the column size.
ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const
{
    if (filt.size() != _size)
        throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), _size);

    /// Only defaults: the result is just a number of default rows.
    if (offsets->empty())
    {
        auto only_defaults = cloneEmpty();
        only_defaults->insertManyDefaults(countBytesInFilter(filt));
        return only_defaults;
    }

    auto kept_offsets = offsets->cloneEmpty();
    auto & kept_offsets_data = assert_cast<ColumnUInt64 &>(*kept_offsets).getData();

    /// Filter for the nested column. Its first element (the default value) is always kept.
    Filter nested_filter;
    nested_filter.reserve(values->size());
    nested_filter.push_back(1);
    size_t nested_size_hint = 1;

    size_t result_rows = 0;
    auto row_it = begin();
    for (size_t row = 0; row < _size; ++row, ++row_it)
    {
        const bool keep_row = filt[row] != 0;

        if (row_it.isDefault())
        {
            if (keep_row)
                ++result_rows;
            continue;
        }

        if (keep_row)
        {
            kept_offsets_data.push_back(result_rows);
            nested_filter.push_back(1);
            ++result_rows;
            ++nested_size_hint;
        }
        else
        {
            nested_filter.push_back(0);
        }
    }

    auto kept_values = values->filter(nested_filter, nested_size_hint);
    return this->create(kept_values, std::move(kept_offsets), result_rows);
}
|
|
|
|
void ColumnSparse::expand(const Filter & mask, bool inverted)
|
|
{
|
|
if (mask.size() < _size)
|
|
throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
auto res_offsets = offsets->cloneEmpty();
|
|
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
|
|
|
|
auto it = begin();
|
|
for (size_t i = 0; i < mask.size(); ++i)
|
|
{
|
|
if (!!mask[i] ^ inverted)
|
|
{
|
|
if (it.getCurrentRow() == _size)
|
|
throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
if (!it.isDefault())
|
|
res_offsets_data[it.getCurrentOffset()] = i;
|
|
|
|
++it;
|
|
}
|
|
}
|
|
|
|
_size = mask.size();
|
|
}
|
|
|
|
/// Applies permutation 'perm' (up to 'limit' rows) via the shared permuteImpl helper.
ColumnPtr ColumnSparse::permute(const Permutation & perm, size_t limit) const
{
    ColumnPtr permuted = permuteImpl(*this, perm, limit);
    return permuted;
}
|
|
|
|
/// Selects rows by the positions stored in 'indexes' (up to 'limit' rows)
/// via the shared selectIndexImpl helper.
ColumnPtr ColumnSparse::index(const IColumn & indexes, size_t limit) const
{
    ColumnPtr selected = selectIndexImpl(*this, indexes, limit);
    return selected;
}
|
|
|
|
template <typename Type>
|
|
ColumnPtr ColumnSparse::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
|
|
{
|
|
assert(limit <= indexes.size());
|
|
if (limit == 0)
|
|
return ColumnSparse::create(values->cloneEmpty());
|
|
|
|
if (offsets->empty())
|
|
{
|
|
auto res = cloneEmpty();
|
|
res->insertManyDefaults(limit);
|
|
return res;
|
|
}
|
|
|
|
auto res_offsets = offsets->cloneEmpty();
|
|
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
|
|
auto res_values = values->cloneEmpty();
|
|
res_values->insertDefault();
|
|
|
|
/// If we need to permute full column, or if limit is large enough,
|
|
/// it's better to save indexes of values in O(size)
|
|
/// and avoid binary search for obtaining every index.
|
|
/// 3 is just a guess for overhead on copying indexes.
|
|
bool execute_linear =
|
|
limit == _size || limit * std::bit_width(offsets->size()) > _size * 3;
|
|
|
|
if (execute_linear)
|
|
{
|
|
PaddedPODArray<UInt64> values_index(_size);
|
|
auto offset_it = begin();
|
|
for (size_t i = 0; i < _size; ++i, ++offset_it)
|
|
values_index[i] = offset_it.getValueIndex();
|
|
|
|
for (size_t i = 0; i < limit; ++i)
|
|
{
|
|
size_t index = values_index[indexes[i]];
|
|
if (index != 0)
|
|
{
|
|
res_values->insertFrom(*values, index);
|
|
res_offsets_data.push_back(i);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (size_t i = 0; i < limit; ++i)
|
|
{
|
|
size_t index = getValueIndex(indexes[i]);
|
|
if (index != 0)
|
|
{
|
|
res_values->insertFrom(*values, index);
|
|
res_offsets_data.push_back(i);
|
|
}
|
|
}
|
|
}
|
|
|
|
return ColumnSparse::create(std::move(res_values), std::move(res_offsets), limit);
|
|
}
|
|
|
|
int ColumnSparse::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
|
|
{
|
|
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs_))
|
|
return values->compareAt(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint);
|
|
|
|
return values->compareAt(getValueIndex(n), m, rhs_, null_direction_hint);
|
|
}
|
|
|
|
void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num,
|
|
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
|
|
int direction, int nan_direction_hint) const
|
|
{
|
|
if (row_indexes)
|
|
{
|
|
/// TODO: implement without conversion to full column.
|
|
auto this_full = convertToFullColumnIfSparse();
|
|
auto rhs_full = rhs.convertToFullColumnIfSparse();
|
|
this_full->compareColumn(*rhs_full, rhs_row_num, row_indexes, compare_results, direction, nan_direction_hint);
|
|
}
|
|
else
|
|
{
|
|
const auto & rhs_sparse = assert_cast<const ColumnSparse &>(rhs);
|
|
PaddedPODArray<Int8> nested_result;
|
|
values->compareColumn(rhs_sparse.getValuesColumn(), rhs_sparse.getValueIndex(rhs_row_num),
|
|
nullptr, nested_result, direction, nan_direction_hint);
|
|
|
|
const auto & offsets_data = getOffsetsData();
|
|
compare_results.resize_fill(_size, nested_result[0]);
|
|
for (size_t i = 0; i < offsets_data.size(); ++i)
|
|
compare_results[offsets_data[i]] = nested_result[i + 1];
|
|
}
|
|
}
|
|
|
|
int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const
|
|
{
|
|
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
|
|
return values->compareAtWithCollation(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint, collator);
|
|
|
|
return values->compareAtWithCollation(getValueIndex(n), m, rhs, null_direction_hint, collator);
|
|
}
|
|
|
|
bool ColumnSparse::hasEqualValues() const
|
|
{
|
|
size_t num_defaults = getNumberOfDefaults();
|
|
if (num_defaults == _size)
|
|
return true;
|
|
|
|
/// Have at least 1 default and 1 non-default values.
|
|
if (num_defaults != 0)
|
|
return false;
|
|
|
|
/// Check that probably all non-default values are equal.
|
|
/// It's suboptiomal, but it's a rare case.
|
|
for (size_t i = 2; i < values->size(); ++i)
|
|
if (values->compareAt(1, i, *values, 1) != 0)
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
/// Fills 'res' with a sorting permutation of the rows.
/// The nested column (default value + non-default values) is sorted first, then its
/// permutation is translated to row numbers; the single default entry stands for all default rows.
/// 'collator' may be nullptr, in which case the plain nested getPermutation is used.
void ColumnSparse::getPermutationImpl(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
    size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
{
    if (_size == 0)
        return;

    res.resize(_size);

    /// Only defaults: any order is sorted, use the identity permutation.
    if (offsets->empty())
    {
        for (size_t row = 0; row != _size; ++row)
            res[row] = row;
        return;
    }

    if (limit == 0 || limit > _size)
        limit = _size;

    /// Firstly we sort all values.
    /// limit + 1 for case when there are 0 default values.
    Permutation nested_perm;
    if (collator != nullptr)
        values->getPermutationWithCollation(*collator, direction, stability, limit + 1, null_direction_hint, nested_perm);
    else
        values->getPermutation(direction, stability, limit + 1, null_direction_hint, nested_perm);

    const size_t defaults_count = getNumberOfDefaults();
    const auto & positions = getOffsetsData();

    /// Fill the permutation.
    size_t filled = 0;
    for (size_t i = 0; i < nested_perm.size() && filled < limit; ++i)
    {
        const auto value_index = nested_perm[i];

        if (value_index != 0)
        {
            /// Non-default value: -1 because positions do not include the default entry.
            res[filled++] = positions[value_index - 1];
            continue;
        }

        /// The default entry with no default rows in the column contributes nothing.
        if (defaults_count == 0)
            continue;

        /// Fill the positions of default values in the required quantity.
        auto row_it = begin();
        while (filled < limit)
        {
            while (row_it.getCurrentRow() < _size && !row_it.isDefault())
                ++row_it;

            if (row_it.getCurrentRow() == _size)
                break;

            res[filled] = row_it.getCurrentRow();
            ++filled;
            ++row_it;
        }
    }

    assert(filled == limit);
}
|
|
|
|
/// Fills 'res' with a sorting permutation.
/// A stable sort is delegated to the full (non-sparse) column; otherwise getPermutationImpl is used.
void ColumnSparse::getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
    size_t limit, int null_direction_hint, Permutation & res) const
{
    const bool wants_stable_sort = stability == IColumn::PermutationSortStability::Stable;
    if (unlikely(wants_stable_sort))
    {
        auto full_column = convertToFullColumnIfSparse();
        full_column->getPermutation(direction, stability, limit, null_direction_hint, res);
    }
    else
    {
        getPermutationImpl(direction, stability, limit, null_direction_hint, res, nullptr);
    }
}
|
|
|
|
/// Refines an existing permutation by converting to a full column and delegating to it.
void ColumnSparse::updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
    size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_ranges) const
{
    const auto full_column = convertToFullColumnIfSparse();
    full_column->updatePermutation(direction, stability, limit, null_direction_hint, res, equal_ranges);
}
|
|
|
|
void ColumnSparse::getPermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
|
|
size_t limit, int null_direction_hint, Permutation & res) const
|
|
{
|
|
return getPermutationImpl(direction, stability, limit, null_direction_hint, res, &collator);
|
|
}
|
|
|
|
void ColumnSparse::updatePermutationWithCollation(
|
|
const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
|
|
size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_ranges) const
|
|
{
|
|
auto this_full = convertToFullColumnIfSparse();
|
|
this_full->updatePermutationWithCollation(collator, direction, stability, limit, null_direction_hint, res, equal_ranges);
|
|
}
|
|
|
|
size_t ColumnSparse::byteSize() const
|
|
{
|
|
return values->byteSize() + offsets->byteSize() + sizeof(_size);
|
|
}
|
|
|
|
size_t ColumnSparse::byteSizeAt(size_t n) const
|
|
{
|
|
size_t index = getValueIndex(n);
|
|
size_t res = values->byteSizeAt(index);
|
|
if (index)
|
|
res += sizeof(UInt64);
|
|
|
|
return res;
|
|
}
|
|
|
|
size_t ColumnSparse::allocatedBytes() const
|
|
{
|
|
return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size);
|
|
}
|
|
|
|
void ColumnSparse::protect()
|
|
{
|
|
values->protect();
|
|
offsets->protect();
|
|
}
|
|
|
|
/// Repeats every row so that row i ends up at result rows [replicate_offsets[i - 1], replicate_offsets[i]).
/// Throws SIZES_OF_COLUMNS_DOESNT_MATCH if 'replicate_offsets' has a different size than the column.
ColumnPtr ColumnSparse::replicate(const Offsets & replicate_offsets) const
{
    /// TODO: implement specializations.
    if (replicate_offsets.size() != _size)
        throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);

    if (_size == 0)
        return ColumnSparse::create(values->cloneEmpty());

    auto res_offsets = offsets->cloneEmpty();
    auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
    auto res_values = values->cloneEmpty();
    res_values->insertDefault();

    auto row_it = begin();
    for (size_t i = 0; i < _size; ++i, ++row_it)
    {
        /// Replicated defaults need no storage, they are covered by the result size.
        if (row_it.isDefault())
            continue;

        /// NOTE(review): for i == 0 this reads replicate_offsets[i - 1]; presumably the padded
        /// array yields 0 at index -1 - confirm against the Offsets container.
        const size_t first_result_row = replicate_offsets[i - 1];
        const size_t end_result_row = replicate_offsets[i];
        const size_t copies = end_result_row - first_result_row;
        const size_t value_index = row_it.getValueIndex();

        res_offsets_data.reserve(res_offsets_data.size() + copies);
        for (size_t result_row = first_result_row; result_row < end_result_row; ++result_row)
        {
            res_offsets_data.push_back(result_row);
            res_values->insertFrom(*values, value_index);
        }
    }

    return ColumnSparse::create(std::move(res_values), std::move(res_offsets), replicate_offsets.back());
}
|
|
|
|
void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const
|
|
{
|
|
values->updateHashWithValue(getValueIndex(n), hash);
|
|
}
|
|
|
|
/// Updates the per-row weak hashes with the raw data of every row.
/// Throws LOGICAL_ERROR if 'hash' does not have exactly one entry per row.
void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const
{
    auto & hash_data = hash.getData();

    if (hash_data.size() != _size)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
            "column size is {}, hash size is {}", _size, hash.getData().size());

    auto row_it = begin();
    for (size_t row = 0; row < _size; ++row, ++row_it)
    {
        /// Default rows use element 0 of the nested column.
        const auto row_data = values->getDataAt(row_it.getValueIndex());
        const auto * bytes = reinterpret_cast<const UInt8 *>(row_data.data);
        hash_data[row] = ::updateWeakHash32(bytes, row_data.size, hash_data[row]);
    }
}
|
|
|
|
/// Feeds the whole column into 'hash': nested values, then offsets, then the row count.
void ColumnSparse::updateHashFast(SipHash & hash) const
{
    const auto & nested_values = *values;
    const auto & nested_offsets = *offsets;

    nested_values.updateHashFast(hash);
    nested_offsets.updateHashFast(hash);
    hash.update(_size);
}
|
|
|
|
/// Writes the minimum and maximum values of the column into 'min' and 'max'.
void ColumnSparse::getExtremes(Field & min, Field & max) const
{
    /// Empty column: both extremes are the default value.
    if (_size == 0)
    {
        values->get(0, min);
        values->get(0, max);
        return;
    }

    /// Some rows are default, so the default value takes part: ask the whole nested column.
    if (getNumberOfDefaults() != 0)
    {
        values->getExtremes(min, max);
        return;
    }

    /// No default rows: element 0 of the nested column must be skipped.
    size_t smallest = 1;
    size_t largest = 1;

    const size_t nested_size = values->size();
    for (size_t candidate = 2; candidate < nested_size; ++candidate)
    {
        if (values->compareAt(candidate, smallest, *values, 1) < 0)
            smallest = candidate;
        else if (values->compareAt(candidate, largest, *values, 1) > 0)
            largest = candidate;
    }

    values->get(smallest, min);
    values->get(largest, max);
}
|
|
|
|
/// Appends to 'indices' the positions of non-default rows in [from, from + limit).
/// limit == 0 means "up to the end of the column".
void ColumnSparse::getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const
{
    const auto & positions = getOffsetsData();

    const auto * range_begin = positions.begin();
    if (from)
        range_begin = std::lower_bound(positions.begin(), positions.end(), from);

    const auto * range_end = positions.end();
    if (limit)
        range_end = std::lower_bound(positions.begin(), positions.end(), from + limit);

    indices.insert(range_begin, range_end);
}
|
|
|
|
double ColumnSparse::getRatioOfDefaultRows(double) const
|
|
{
|
|
return static_cast<double>(getNumberOfDefaults()) / _size;
|
|
}
|
|
|
|
/// Splits the rows into 'num_columns' columns according to 'selector',
/// using the generic scatterImpl helper.
MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const
{
    return this->scatterImpl<ColumnSparse>(num_columns, selector);
}
|
|
|
|
/// Hands this column over to the gatherer stream, which fills it.
void ColumnSparse::gather(ColumnGathererStream & gatherer_stream)
{
    auto & destination = *this;
    gatherer_stream.gather(destination);
}
|
|
|
|
/// Compresses both subcolumns and returns a ColumnCompressed that rebuilds
/// the sparse column from them on decompression.
ColumnPtr ColumnSparse::compress() const
{
    auto values_compressed = values->compress();
    auto offsets_compressed = offsets->compress();

    /// Must be computed before the compressed parts are moved into the closure.
    const size_t compressed_bytes = values_compressed->byteSize() + offsets_compressed->byteSize();
    const size_t num_rows = size();

    auto decompress_parts =
        [values_compressed = std::move(values_compressed), offsets_compressed = std::move(offsets_compressed), num_rows]
        {
            return ColumnSparse::create(values_compressed->decompress(), offsets_compressed->decompress(), num_rows);
        };

    return ColumnCompressed::create(num_rows, compressed_bytes, std::move(decompress_parts));
}
|
|
|
|
bool ColumnSparse::structureEquals(const IColumn & rhs) const
|
|
{
|
|
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
|
|
return values->structureEquals(*rhs_sparse->values);
|
|
return false;
|
|
}
|
|
|
|
/// Invokes 'callback' for each subcolumn: first the values, then the offsets.
void ColumnSparse::forEachSubcolumn(ColumnCallback callback)
{
    callback(values);   /// default value + non-default values
    callback(offsets);  /// positions of the non-default values
}
|
|
|
|
/// Read-only access to the positions of non-default values.
const IColumn::Offsets & ColumnSparse::getOffsetsData() const
{
    const auto & typed_offsets = assert_cast<const ColumnUInt64 &>(*offsets);
    return typed_offsets.getData();
}
|
|
|
|
/// Mutable access to the positions of non-default values.
IColumn::Offsets & ColumnSparse::getOffsetsData()
{
    auto & typed_offsets = assert_cast<ColumnUInt64 &>(*offsets);
    return typed_offsets.getData();
}
|
|
|
|
size_t ColumnSparse::getValueIndex(size_t n) const
|
|
{
|
|
assert(n < _size);
|
|
|
|
const auto & offsets_data = getOffsetsData();
|
|
const auto * it = std::lower_bound(offsets_data.begin(), offsets_data.end(), n);
|
|
if (it == offsets_data.end() || *it != n)
|
|
return 0;
|
|
|
|
return it - offsets_data.begin() + 1;
|
|
}
|
|
|
|
/// Creates an iterator positioned at row 'n'; its offset index is that of the
/// first stored position that is not less than 'n'.
ColumnSparse::Iterator ColumnSparse::getIterator(size_t n) const
{
    const auto & positions = getOffsetsData();
    const auto * first_not_less = std::lower_bound(positions.begin(), positions.end(), n);
    const size_t offset_index = first_not_less - positions.begin();

    return Iterator(positions, _size, offset_index, n);
}
|
|
|
|
/// Converts 'column' to a full column. Tuples are processed element by element, recursively.
/// A null pointer is returned unchanged.
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
{
    if (!column)
        return column;

    const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get());
    if (column_tuple == nullptr)
        return column->convertToFullColumnIfSparse();

    /// Work on a copy of the element list and rebuild the tuple from it.
    auto elements = column_tuple->getColumns();
    for (auto & element : elements)
        element = recursiveRemoveSparse(element);

    return ColumnTuple::create(elements);
}
|
|
|
|
}
|