diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index f33354748a1..867728d5c8b 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -985,6 +985,16 @@ ColumnPtr ColumnArray::compress() const }); } +double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const +{ + return getRatioOfDefaultRowsImpl(sample_ratio); +} + +void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const { diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index d8368f955aa..adfb4788b93 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -142,15 +142,9 @@ public: return false; } - double getRatioOfDefaultRows(double sample_ratio) const override - { - return getRatioOfDefaultRowsImpl(sample_ratio); - } + double getRatioOfDefaultRows(double sample_ratio) const override; - void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override - { - return getIndicesOfNonDefaultRowsImpl(indices, from, limit); - } + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override; bool isCollationSupported() const override { return getData().isCollationSupported(); } diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index ab581e87dab..89f93c83596 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -53,33 +53,8 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const bool ColumnFixedString::isDefaultAt(size_t index) const { - const UInt8 * pos = chars.data() + index * n; - const UInt8 * end = pos + n; - -#ifdef __SSE2__ - static constexpr size_t SIMD_BYTES = 16; - const UInt8 * end_sse = pos + n / SIMD_BYTES * SIMD_BYTES; - const __m128i zero16 = _mm_setzero_si128(); - - while (pos < end_sse) - { - 
if (0xFFFF != _mm_movemask_epi8(_mm_cmpeq_epi8( - _mm_loadu_si128(reinterpret_cast(pos)), zero16))) - return false; - - pos += SIMD_BYTES; - } -#endif - - while (pos < end) - { - if (*pos != 0) - return false; - - ++pos; - } - - return true; + assert(index < size()); + return memoryIsZero(chars.data() + index * n, n); } void ColumnFixedString::insert(const Field & x) diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index 3575a7f8877..d3ee54778e0 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -183,12 +183,12 @@ public: double getRatioOfDefaultRows(double sample_ratio) const override { - return getRatioOfDefaultRowsImpl(sample_ratio); + return getIndexes().getRatioOfDefaultRows(sample_ratio); } void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override { - return getIndicesOfNonDefaultRowsImpl(indices, from, limit); + return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit); } bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); } diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index d39f98671b4..6aecd4cecd5 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -34,7 +34,7 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs const ColumnUInt64 * offsets_concrete = typeid_cast(offsets.get()); if (!offsets_concrete) - throw Exception("offsets_column must be a ColumnUInt64", ErrorCodes::LOGICAL_ERROR); + throw Exception( ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName()); /// 'values' should contain one extra element: default value at 0 position. 
if (offsets->size() + 1 != values->size()) @@ -45,6 +45,11 @@ ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offs throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size()); + if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Size sparse columns ({}) should be greater than last position of non-default value ({})", + _size, offsets_concrete->getData().back()); + #ifndef NDEBUG const auto & offsets_data = getOffsetsData(); const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal()); @@ -126,13 +131,24 @@ ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1); } -void ColumnSparse::insertData(const char * pos, size_t length) +void ColumnSparse::insertSingleValue(const Inserter & inserter) { - _size += length; - return values->insertData(pos, length); + inserter(*values); + + size_t last_idx = values->size() - 1; + if (values->isDefaultAt(last_idx)) + values->popBack(1); + else + getOffsetsData().push_back(_size); + + ++_size; +} + +void ColumnSparse::insertData(const char * pos, size_t length) +{ + insertSingleValue([&](IColumn & column) { column.insertData(pos, length); }); } -/// TODO: maybe need to reimplement it. 
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const { return values->serializeValueIntoArena(getValueIndex(n), arena, begin); @@ -140,8 +156,9 @@ StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char co const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos) { - ++_size; - return values->deserializeAndInsertFromArena(pos); + const char * res = nullptr; + insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); }); + return res; } const char * ColumnSparse::skipSerializedInArena(const char * pos) const @@ -168,6 +185,7 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin(); size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin(); + assert(offset_start <= offset_end); if (offset_start != offset_end) { @@ -198,24 +216,24 @@ void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t len { for (size_t i = start; i < end; ++i) { - offsets_data.push_back(_size); + if (!src.isDefaultAt(i)) /// Default rows are not stored: they only advance _size. + { + values->insertFrom(src, i); + offsets_data.push_back(_size); + } + ++_size; } - - values->insertRangeFrom(src, start, length); } } void ColumnSparse::insert(const Field & x) { - getOffsetsData().push_back(_size); - values->insert(x); - ++_size; + insertSingleValue([&](IColumn & column) { column.insert(x); }); } void ColumnSparse::insertFrom(const IColumn & src, size_t n) { - if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src)) { if (size_t value_index = src_sparse->getValueIndex(n)) @@ -226,8 +244,11 @@ void ColumnSparse::insertFrom(const IColumn & src, size_t n) } else { - getOffsetsData().push_back(_size); - values->insertFrom(src, n); + if (!src.isDefaultAt(n)) /// Store only non-default values. + { + values->insertFrom(src, n); + getOffsetsData().push_back(_size); + } } ++_size; @@ -467,7
+488,21 @@ int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs bool ColumnSparse::hasEqualValues() const { - return offsets->empty(); + size_t num_defaults = getNumberOfDefaults(); + if (num_defaults == _size) + return true; + + /// There is at least one default and at least one non-default value. + if (num_defaults != 0) + return false; + + /// All rows are non-default here; check whether all of them are equal. + /// It's suboptimal, but it's a rare case. + for (size_t i = 2; i < values->size(); ++i) + if (values->compareAt(1, i, *values, 1) != 0) + return false; + + return true; } void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const @@ -555,7 +590,7 @@ void ColumnSparse::updatePermutationWithCollation( size_t ColumnSparse::byteSize() const { - return values->byteSize() + offsets->byteSize(); + return values->byteSize() + offsets->byteSize() + sizeof(_size); } size_t ColumnSparse::byteSizeAt(size_t n) const @@ -570,7 +605,7 @@ size_t ColumnSparse::byteSizeAt(size_t n) const size_t ColumnSparse::allocatedBytes() const { - return values->allocatedBytes() + offsets->allocatedBytes(); + return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size); } void ColumnSparse::protect() diff --git a/src/Columns/ColumnSparse.h b/src/Columns/ColumnSparse.h index 037b967e8dd..eb24664c7bb 100644 --- a/src/Columns/ColumnSparse.h +++ b/src/Columns/ColumnSparse.h @@ -149,7 +149,7 @@ public: /// Return position of element in 'values' columns, /// that corresponds to n-th element of full column.
- /// O(log(size)) complexity, + /// O(log(offsets.size())) complexity, size_t getValueIndex(size_t n) const; const IColumn & getValuesColumn() const { return *values; } @@ -209,7 +209,16 @@ public: Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); } private: + using Inserter = std::function<void(IColumn &)>; + + /// Inserts a value into the 'values' column via the callback. + /// Properly handles the case when the inserted value is default. + /// Used when it's unknown in advance whether the inserted value is default. + void insertSingleValue(const Inserter & inserter); + /// Contains default value at 0 position. + /// It's convenient, because it allows to execute, e.g. functions or sorting, + /// for this column without handling different cases. WrappedPtr values; /// Sorted offsets of non-default values in the full column. diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index 6f97acd9df1..6eef745a355 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -68,7 +68,7 @@ public: Field operator[](size_t n) const override { return (*getNestedColumn())[n]; } void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); } - bool isDefaultAt(size_t n) const override { return getNestedColumn()->isDefaultAt(n); } + bool isDefaultAt(size_t n) const override { return n == 0; } StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); } StringRef getDataAtWithTerminatingZero(size_t n) const override { @@ -123,14 +123,14 @@ public: return false; } - double getRatioOfDefaultRows(double sample_ratio) const override + double getRatioOfDefaultRows(double) const override { - return getNestedColumn()->getRatioOfDefaultRows(sample_ratio); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemented for ColumnUnique"); } - void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override + void
getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override { - return getNestedColumn()->getIndicesOfNonDefaultRows(indices, from, limit); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getIndicesOfNonDefaultRows' not implemented for ColumnUnique"); } const UInt64 * tryGetSavedHash() const override { return reverse_index.tryGetSavedHash(); } diff --git a/src/Columns/IColumnImpl.h b/src/Columns/IColumnImpl.h index fe9ad251111..7d9f338f8e2 100644 --- a/src/Columns/IColumnImpl.h +++ b/src/Columns/IColumnImpl.h @@ -143,7 +143,9 @@ bool IColumn::hasEqualValuesImpl() const template <typename Derived> double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const { - assert(sample_ratio > 0 && sample_ratio <= 1.0); + if (!(sample_ratio > 0.0 && sample_ratio <= 1.0)) /// Negated form, so that NaN is rejected as well. + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio); size_t num_rows = size(); size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);