mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 00:52:02 +00:00
better ColumnSparse
This commit is contained in:
parent
6f9e53197c
commit
dc94d2239e
@ -985,6 +985,16 @@ ColumnPtr ColumnArray::compress() const
|
||||
});
|
||||
}
|
||||
|
||||
double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const
|
||||
{
|
||||
return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
|
||||
}
|
||||
|
||||
void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
|
||||
{
|
||||
return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
|
||||
}
|
||||
|
||||
|
||||
ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
|
||||
{
|
||||
|
@ -142,15 +142,9 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
double getRatioOfDefaultRows(double sample_ratio) const override
|
||||
{
|
||||
return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
|
||||
}
|
||||
double getRatioOfDefaultRows(double sample_ratio) const override;
|
||||
|
||||
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
|
||||
{
|
||||
return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
|
||||
}
|
||||
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
|
||||
|
||||
bool isCollationSupported() const override { return getData().isCollationSupported(); }
|
||||
|
||||
|
@ -53,33 +53,8 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const
|
||||
|
||||
bool ColumnFixedString::isDefaultAt(size_t index) const
|
||||
{
|
||||
const UInt8 * pos = chars.data() + index * n;
|
||||
const UInt8 * end = pos + n;
|
||||
|
||||
#ifdef __SSE2__
|
||||
static constexpr size_t SIMD_BYTES = 16;
|
||||
const UInt8 * end_sse = pos + n / SIMD_BYTES * SIMD_BYTES;
|
||||
const __m128i zero16 = _mm_setzero_si128();
|
||||
|
||||
while (pos < end_sse)
|
||||
{
|
||||
if (0xFFFF != _mm_movemask_epi8(_mm_cmpeq_epi8(
|
||||
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos)), zero16)))
|
||||
return false;
|
||||
|
||||
pos += SIMD_BYTES;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (pos < end)
|
||||
{
|
||||
if (*pos != 0)
|
||||
return false;
|
||||
|
||||
++pos;
|
||||
}
|
||||
|
||||
return true;
|
||||
assert(index < size());
|
||||
return memoryIsZero(chars.data() + index * n, n);
|
||||
}
|
||||
|
||||
void ColumnFixedString::insert(const Field & x)
|
||||
|
@ -183,12 +183,12 @@ public:
|
||||
|
||||
double getRatioOfDefaultRows(double sample_ratio) const override
|
||||
{
|
||||
return getRatioOfDefaultRowsImpl<ColumnLowCardinality>(sample_ratio);
|
||||
return getIndexes().getRatioOfDefaultRows(sample_ratio);
|
||||
}
|
||||
|
||||
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
|
||||
{
|
||||
return getIndicesOfNonDefaultRowsImpl<ColumnLowCardinality>(indices, from, limit);
|
||||
return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit);
|
||||
}
|
||||
|
||||
bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); }
|
||||
|
@ -34,7 +34,7 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs
|
||||
const ColumnUInt64 * offsets_concrete = typeid_cast<const ColumnUInt64 *>(offsets.get());
|
||||
|
||||
if (!offsets_concrete)
|
||||
throw Exception("offsets_column must be a ColumnUInt64", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception( ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName());
|
||||
|
||||
/// 'values' should contain one extra element: default value at 0 position.
|
||||
if (offsets->size() + 1 != values->size())
|
||||
@ -45,6 +45,11 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());
|
||||
|
||||
if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Size sparse columns ({}) should be greater than last position of non-default value ({})",
|
||||
_size, offsets_concrete->getData().back());
|
||||
|
||||
#ifndef NDEBUG
|
||||
const auto & offsets_data = getOffsetsData();
|
||||
const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal<UInt64>());
|
||||
@ -126,13 +131,24 @@ ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
|
||||
return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1);
|
||||
}
|
||||
|
||||
void ColumnSparse::insertData(const char * pos, size_t length)
|
||||
void ColumnSparse::insertSingleValue(const Inserter & inserter)
|
||||
{
|
||||
_size += length;
|
||||
return values->insertData(pos, length);
|
||||
inserter(*values);
|
||||
|
||||
size_t last_idx = values->size() - 1;
|
||||
if (values->isDefaultAt(last_idx))
|
||||
values->popBack(1);
|
||||
else
|
||||
getOffsetsData().push_back(_size);
|
||||
|
||||
++_size;
|
||||
}
|
||||
|
||||
void ColumnSparse::insertData(const char * pos, size_t length)
|
||||
{
|
||||
insertSingleValue([&](IColumn & column) { column.insertData(pos, length); });
|
||||
}
|
||||
|
||||
/// TODO: maybe need to reimplement it.
|
||||
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
|
||||
{
|
||||
return values->serializeValueIntoArena(getValueIndex(n), arena, begin);
|
||||
@ -140,8 +156,9 @@ StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char co
|
||||
|
||||
const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos)
|
||||
{
|
||||
++_size;
|
||||
return values->deserializeAndInsertFromArena(pos);
|
||||
const char * res = nullptr;
|
||||
insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); });
|
||||
return res;
|
||||
}
|
||||
|
||||
const char * ColumnSparse::skipSerializedInArena(const char * pos) const
|
||||
@ -168,6 +185,7 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len
|
||||
|
||||
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
|
||||
size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
|
||||
assert(offset_start <= offset_end);
|
||||
|
||||
if (offset_start != offset_end)
|
||||
{
|
||||
@ -198,24 +216,24 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len
|
||||
{
|
||||
for (size_t i = start; i < end; ++i)
|
||||
{
|
||||
offsets_data.push_back(_size);
|
||||
if (!src.isDefaultAt(i))
|
||||
{
|
||||
values->insertFrom(src, i);
|
||||
offsets_data.push_back(_size);
|
||||
}
|
||||
|
||||
++_size;
|
||||
}
|
||||
|
||||
values->insertRangeFrom(src, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
void ColumnSparse::insert(const Field & x)
|
||||
{
|
||||
getOffsetsData().push_back(_size);
|
||||
values->insert(x);
|
||||
++_size;
|
||||
insertSingleValue([&](IColumn & column) { column.insert(x); });
|
||||
}
|
||||
|
||||
void ColumnSparse::insertFrom(const IColumn & src, size_t n)
|
||||
{
|
||||
|
||||
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
|
||||
{
|
||||
if (size_t value_index = src_sparse->getValueIndex(n))
|
||||
@ -226,8 +244,11 @@ void ColumnSparse::insertFrom(const IColumn & src, size_t n)
|
||||
}
|
||||
else
|
||||
{
|
||||
getOffsetsData().push_back(_size);
|
||||
values->insertFrom(src, n);
|
||||
if (!src.isDefaultAt(n))
|
||||
{
|
||||
values->insertFrom(src, n);
|
||||
getOffsetsData().push_back(_size);
|
||||
}
|
||||
}
|
||||
|
||||
++_size;
|
||||
@ -467,7 +488,21 @@ int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs
|
||||
|
||||
bool ColumnSparse::hasEqualValues() const
|
||||
{
|
||||
return offsets->empty();
|
||||
size_t num_defaults = getNumberOfDefaults();
|
||||
if (num_defaults == _size)
|
||||
return true;
|
||||
|
||||
/// There is at least one default and one non-default value.
|
||||
if (num_defaults != 0)
|
||||
return false;
|
||||
|
||||
/// Check whether all non-default values happen to be equal.
|
||||
/// It's suboptimal, but it's a rare case.
|
||||
for (size_t i = 2; i < values->size(); ++i)
|
||||
if (values->compareAt(1, i, *values, 1) != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
|
||||
@ -555,7 +590,7 @@ void ColumnSparse::updatePermutationWithCollation(
|
||||
|
||||
size_t ColumnSparse::byteSize() const
|
||||
{
|
||||
return values->byteSize() + offsets->byteSize();
|
||||
return values->byteSize() + offsets->byteSize() + sizeof(_size);
|
||||
}
|
||||
|
||||
size_t ColumnSparse::byteSizeAt(size_t n) const
|
||||
@ -570,7 +605,7 @@ size_t ColumnSparse::byteSizeAt(size_t n) const
|
||||
|
||||
size_t ColumnSparse::allocatedBytes() const
|
||||
{
|
||||
return values->allocatedBytes() + offsets->allocatedBytes();
|
||||
return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size);
|
||||
}
|
||||
|
||||
void ColumnSparse::protect()
|
||||
|
@ -149,7 +149,7 @@ public:
|
||||
|
||||
/// Return position of element in 'values' columns,
|
||||
/// that corresponds to n-th element of full column.
|
||||
/// O(log(size)) complexity,
|
||||
/// O(log(offsets.size())) complexity,
|
||||
size_t getValueIndex(size_t n) const;
|
||||
|
||||
const IColumn & getValuesColumn() const { return *values; }
|
||||
@ -209,7 +209,16 @@ public:
|
||||
Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }
|
||||
|
||||
private:
|
||||
using Inserter = std::function<void(IColumn &)>;
|
||||
|
||||
/// Inserts a value into the 'values' column via a callback.
|
||||
/// Properly handles the case when the inserted value is default.
|
||||
/// Used when it is unknown in advance whether the inserted value is default.
|
||||
void insertSingleValue(const Inserter & inserter);
|
||||
|
||||
/// Contains default value at 0 position.
|
||||
/// It's convenient, because it allows executing, e.g., functions or sorting,
|
||||
/// for this column without handling different cases.
|
||||
WrappedPtr values;
|
||||
|
||||
/// Sorted offsets of non-default values in the full column.
|
||||
|
@ -68,7 +68,7 @@ public:
|
||||
|
||||
Field operator[](size_t n) const override { return (*getNestedColumn())[n]; }
|
||||
void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); }
|
||||
bool isDefaultAt(size_t n) const override { return getNestedColumn()->isDefaultAt(n); }
|
||||
bool isDefaultAt(size_t n) const override { return n == 0; }
|
||||
StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); }
|
||||
StringRef getDataAtWithTerminatingZero(size_t n) const override
|
||||
{
|
||||
@ -123,14 +123,14 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
double getRatioOfDefaultRows(double sample_ratio) const override
|
||||
double getRatioOfDefaultRows(double) const override
|
||||
{
|
||||
return getNestedColumn()->getRatioOfDefaultRows(sample_ratio);
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemeted for ColumnUnique");
|
||||
}
|
||||
|
||||
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
|
||||
void getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override
|
||||
{
|
||||
return getNestedColumn()->getIndicesOfNonDefaultRows(indices, from, limit);
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getIndicesOfNonDefaultRows' not implemeted for ColumnUnique");
|
||||
}
|
||||
|
||||
const UInt64 * tryGetSavedHash() const override { return reverse_index.tryGetSavedHash(); }
|
||||
|
@ -143,7 +143,9 @@ bool IColumn::hasEqualValuesImpl() const
|
||||
template <typename Derived>
|
||||
double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const
|
||||
{
|
||||
assert(sample_ratio > 0 && sample_ratio <= 1.0);
|
||||
if (sample_ratio <= 0.0 || sample_ratio > 1.0)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio);
|
||||
|
||||
size_t num_rows = size();
|
||||
size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);
|
||||
|
Loading…
Reference in New Issue
Block a user