better ColumnSparse

This commit is contained in:
Anton Popov 2021-09-29 17:18:41 +03:00
parent 6f9e53197c
commit dc94d2239e
8 changed files with 88 additions and 63 deletions

View File

@ -985,6 +985,16 @@ ColumnPtr ColumnArray::compress() const
});
}
/// Returns the ratio of default rows as computed by the shared
/// getRatioOfDefaultRowsImpl template, instantiated for ColumnArray.
/// `sample_ratio` is forwarded unchanged.
double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const
{
    const double ratio = getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
    return ratio;
}
/// Forwards to the shared getIndicesOfNonDefaultRowsImpl template, instantiated
/// for ColumnArray; `indices`, `from` and `limit` are passed through unchanged.
void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
    getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
}
ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
{

View File

@ -142,15 +142,9 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
}
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
/// Collation support is decided by the data column: the answer of getData() is returned as is.
bool isCollationSupported() const override
{
    return getData().isCollationSupported();
}

View File

@ -53,33 +53,8 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const
bool ColumnFixedString::isDefaultAt(size_t index) const
{
const UInt8 * pos = chars.data() + index * n;
const UInt8 * end = pos + n;
#ifdef __SSE2__
static constexpr size_t SIMD_BYTES = 16;
const UInt8 * end_sse = pos + n / SIMD_BYTES * SIMD_BYTES;
const __m128i zero16 = _mm_setzero_si128();
while (pos < end_sse)
{
if (0xFFFF != _mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos)), zero16)))
return false;
pos += SIMD_BYTES;
}
#endif
while (pos < end)
{
if (*pos != 0)
return false;
++pos;
}
return true;
assert(index < size());
return memoryIsZero(chars.data() + index * n, n);
}
void ColumnFixedString::insert(const Field & x)

View File

@ -183,12 +183,12 @@ public:
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getRatioOfDefaultRowsImpl<ColumnLowCardinality>(sample_ratio);
return getIndexes().getRatioOfDefaultRows(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndicesOfNonDefaultRowsImpl<ColumnLowCardinality>(indices, from, limit);
return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit);
}
/// Delegates to the dictionary: returns whatever getDictionary().valuesHaveFixedSize() reports.
bool valuesHaveFixedSize() const override
{
    return getDictionary().valuesHaveFixedSize();
}

View File

@ -34,7 +34,7 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs
const ColumnUInt64 * offsets_concrete = typeid_cast<const ColumnUInt64 *>(offsets.get());
if (!offsets_concrete)
throw Exception("offsets_column must be a ColumnUInt64", ErrorCodes::LOGICAL_ERROR);
throw Exception( ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName());
/// 'values' should contain one extra element: default value at 0 position.
if (offsets->size() + 1 != values->size())
@ -45,6 +45,11 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());
if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size sparse columns ({}) should be greater than last position of non-default value ({})",
_size, offsets_concrete->getData().back());
#ifndef NDEBUG
const auto & offsets_data = getOffsetsData();
const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal<UInt64>());
@ -126,13 +131,24 @@ ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1);
}
void ColumnSparse::insertData(const char * pos, size_t length)
void ColumnSparse::insertSingleValue(const Inserter & inserter)
{
_size += length;
return values->insertData(pos, length);
inserter(*values);
size_t last_idx = values->size() - 1;
if (values->isDefaultAt(last_idx))
values->popBack(1);
else
getOffsetsData().push_back(_size);
++_size;
}
/// Inserts one value described by `pos` and `length`.
/// The insertion into the underlying column is wrapped in a callback and routed
/// through insertSingleValue instead of being applied to 'values' directly.
void ColumnSparse::insertData(const char * pos, size_t length)
{
    const auto append_raw = [pos, length](IColumn & column)
    {
        column.insertData(pos, length);
    };

    insertSingleValue(append_raw);
}
/// TODO: maybe need to reimplement it.
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
return values->serializeValueIntoArena(getValueIndex(n), arena, begin);
@ -140,8 +156,9 @@ StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char co
const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos)
{
++_size;
return values->deserializeAndInsertFromArena(pos);
const char * res = nullptr;
insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); });
return res;
}
const char * ColumnSparse::skipSerializedInArena(const char * pos) const
@ -168,6 +185,7 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
assert(offset_start <= offset_end);
if (offset_start != offset_end)
{
@ -198,24 +216,24 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len
{
for (size_t i = start; i < end; ++i)
{
offsets_data.push_back(_size);
if (!src.isDefaultAt(i))
{
values->insertFrom(src, i);
offsets_data.push_back(_size);
}
++_size;
}
values->insertRangeFrom(src, start, length);
}
}
void ColumnSparse::insert(const Field & x)
{
getOffsetsData().push_back(_size);
values->insert(x);
++_size;
insertSingleValue([&](IColumn & column) { column.insert(x); });
}
void ColumnSparse::insertFrom(const IColumn & src, size_t n)
{
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
{
if (size_t value_index = src_sparse->getValueIndex(n))
@ -226,8 +244,11 @@ void ColumnSparse::insertFrom(const IColumn & src, size_t n)
}
else
{
getOffsetsData().push_back(_size);
values->insertFrom(src, n);
if (!src.isDefaultAt(n))
{
values->insertFrom(src, n);
getOffsetsData().push_back(_size);
}
}
++_size;
@ -467,7 +488,21 @@ int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs
/// Reports whether every row of the column holds the same value.
/// NOTE(review): this view shows `return offsets->empty();` directly followed by more
/// detailed logic; presumably the single-line return is the previous body and only the
/// logic after it remains in the file - confirm against the actual source.
bool ColumnSparse::hasEqualValues() const
{
return offsets->empty();
size_t num_defaults = getNumberOfDefaults();
/// The number of defaults equals the column size, so all values are trivially equal.
if (num_defaults == _size)
return true;
/// There is at least one default and one non-default value.
if (num_defaults != 0)
return false;
/// No defaults at all: check whether all non-default values are equal.
/// Every stored value from position 2 onwards is compared against position 1.
/// It's suboptimal, but it's a rare case.
for (size_t i = 2; i < values->size(); ++i)
if (values->compareAt(1, i, *values, 1) != 0)
return false;
return true;
}
void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
@ -555,7 +590,7 @@ void ColumnSparse::updatePermutationWithCollation(
size_t ColumnSparse::byteSize() const
{
return values->byteSize() + offsets->byteSize();
return values->byteSize() + offsets->byteSize() + sizeof(_size);
}
size_t ColumnSparse::byteSizeAt(size_t n) const
@ -570,7 +605,7 @@ size_t ColumnSparse::byteSizeAt(size_t n) const
size_t ColumnSparse::allocatedBytes() const
{
return values->allocatedBytes() + offsets->allocatedBytes();
return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size);
}
void ColumnSparse::protect()

View File

@ -149,7 +149,7 @@ public:
/// Return position of element in 'values' columns,
/// that corresponds to n-th element of full column.
/// O(log(size)) complexity,
/// O(log(offsets.size())) complexity,
size_t getValueIndex(size_t n) const;
/// Read-only access to the 'values' column (dereferences the stored pointer).
const IColumn & getValuesColumn() const
{
    return *values;
}
@ -209,7 +209,16 @@ public:
Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }
private:
using Inserter = std::function<void(IColumn &)>;
/// Inserts a single value into the 'values' column via the given callback.
/// Correctly handles the case when the inserted value turns out to be default.
/// Used when it is not known in advance whether the inserted value is default.
void insertSingleValue(const Inserter & inserter);
/// Contains the default value at position 0.
/// This is convenient because it allows executing, e.g., functions or sorting
/// on this column without handling different cases.
WrappedPtr values;
/// Sorted offsets of non-default values in the full column.

View File

@ -68,7 +68,7 @@ public:
/// Element access is served by the nested column: returns its n-th element as a Field.
Field operator[](size_t n) const override
{
    return (*getNestedColumn())[n];
}
/// Writes the n-th element into `res` by delegating to the nested column's get().
void get(size_t n, Field & res) const override
{
    getNestedColumn()->get(n, res);
}
bool isDefaultAt(size_t n) const override { return getNestedColumn()->isDefaultAt(n); }
bool isDefaultAt(size_t n) const override { return n == 0; }
/// Returns the StringRef that the nested column reports for position n.
StringRef getDataAt(size_t n) const override
{
    return getNestedColumn()->getDataAt(n);
}
StringRef getDataAtWithTerminatingZero(size_t n) const override
{
@ -123,14 +123,14 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
double getRatioOfDefaultRows(double) const override
{
return getNestedColumn()->getRatioOfDefaultRows(sample_ratio);
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemeted for ColumnUnique");
}
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
void getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override
{
return getNestedColumn()->getIndicesOfNonDefaultRows(indices, from, limit);
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getIndicesOfNonDefaultRows' not implemeted for ColumnUnique");
}
/// Returns the pointer reported by reverse_index.tryGetSavedHash(), without modification.
const UInt64 * tryGetSavedHash() const override
{
    return reverse_index.tryGetSavedHash();
}

View File

@ -143,7 +143,9 @@ bool IColumn::hasEqualValuesImpl() const
template <typename Derived>
double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const
{
assert(sample_ratio > 0 && sample_ratio <= 1.0);
if (sample_ratio <= 0.0 || sample_ratio > 1.0)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio);
size_t num_rows = size();
size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);