Merge pull request #16273 from Avogar/collation-support

Support collate in LowCardinality, Nullable, Array and Tuple, where nested type is String
This commit is contained in:
alexey-milovidov 2020-11-06 21:52:52 +03:00 committed by GitHub
commit 4e85d6a4c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 783 additions and 285 deletions

View File

@ -240,6 +240,10 @@ TESTS_TO_SKIP=(
01354_order_by_tuple_collate_const
01355_ilike
01411_bayesian_ab_testing
01532_collate_in_low_cardinality
01533_collate_in_nullable
01542_collate_in_array
01543_collate_in_tuple
_orc_
arrow
avro

View File

@ -324,8 +324,7 @@ void ColumnArray::popBack(size_t n)
offsets_data.resize_assume_reserved(offsets_data.size() - n);
}
int ColumnArray::compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const
int ColumnArray::compareAtImpl(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator * collator) const
{
const ColumnArray & rhs = assert_cast<const ColumnArray &>(rhs_);
@ -334,8 +333,15 @@ int ColumnArray::compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_dir
size_t rhs_size = rhs.sizeAt(m);
size_t min_size = std::min(lhs_size, rhs_size);
for (size_t i = 0; i < min_size; ++i)
if (int res = getData().compareAt(offsetAt(n) + i, rhs.offsetAt(m) + i, *rhs.data.get(), nan_direction_hint))
{
int res;
if (collator)
res = getData().compareAtWithCollation(offsetAt(n) + i, rhs.offsetAt(m) + i, *rhs.data.get(), nan_direction_hint, *collator);
else
res = getData().compareAt(offsetAt(n) + i, rhs.offsetAt(m) + i, *rhs.data.get(), nan_direction_hint);
if (res)
return res;
}
return lhs_size < rhs_size
? -1
@ -344,6 +350,16 @@ int ColumnArray::compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_dir
: 1);
}
/// Compares row n of this column with row m of rhs_ without any collation.
int ColumnArray::compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const
{
    /// A null collator makes compareAtImpl compare elements with plain compareAt.
    const Collator * const no_collator = nullptr;
    return compareAtImpl(n, m, rhs_, nan_direction_hint, no_collator);
}
/// Compares row n of this column with row m of rhs_, comparing elements through the given collator.
int ColumnArray::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator & collator) const
{
    /// The shared implementation takes the collator as an optional pointer.
    const Collator * const collator_ptr = &collator;
    return compareAtImpl(n, m, rhs_, nan_direction_hint, collator_ptr);
}
void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const
@ -352,27 +368,26 @@ void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
namespace
template <bool positive>
struct ColumnArray::Cmp
{
template <bool positive>
struct Less
const ColumnArray & parent;
int nan_direction_hint;
const Collator * collator;
Cmp(const ColumnArray & parent_, int nan_direction_hint_, const Collator * collator_=nullptr)
: parent(parent_), nan_direction_hint(nan_direction_hint_), collator(collator_) {}
int operator()(size_t lhs, size_t rhs) const
{
const ColumnArray & parent;
int nan_direction_hint;
Less(const ColumnArray & parent_, int nan_direction_hint_)
: parent(parent_), nan_direction_hint(nan_direction_hint_) {}
bool operator()(size_t lhs, size_t rhs) const
{
if (positive)
return parent.compareAt(lhs, rhs, parent, nan_direction_hint) < 0;
else
return parent.compareAt(lhs, rhs, parent, nan_direction_hint) > 0;
}
};
}
int res;
if (collator)
res = parent.compareAtWithCollation(lhs, rhs, parent, nan_direction_hint, *collator);
else
res = parent.compareAt(lhs, rhs, parent, nan_direction_hint);
return positive ? res : -res;
}
};
void ColumnArray::reserve(size_t n)
{
@ -753,7 +768,8 @@ ColumnPtr ColumnArray::indexImpl(const PaddedPODArray<T> & indexes, size_t limit
INSTANTIATE_INDEX_IMPL(ColumnArray)
void ColumnArray::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
template <typename Comparator>
void ColumnArray::getPermutationImpl(size_t limit, Permutation & res, Comparator cmp) const
{
size_t s = size();
if (limit >= s)
@ -763,23 +779,16 @@ void ColumnArray::getPermutation(bool reverse, size_t limit, int nan_direction_h
for (size_t i = 0; i < s; ++i)
res[i] = i;
auto less = [&cmp](size_t lhs, size_t rhs){ return cmp(lhs, rhs) < 0; };
if (limit)
{
if (reverse)
std::partial_sort(res.begin(), res.begin() + limit, res.end(), Less<false>(*this, nan_direction_hint));
else
std::partial_sort(res.begin(), res.begin() + limit, res.end(), Less<true>(*this, nan_direction_hint));
}
std::partial_sort(res.begin(), res.begin() + limit, res.end(), less);
else
{
if (reverse)
std::sort(res.begin(), res.end(), Less<false>(*this, nan_direction_hint));
else
std::sort(res.begin(), res.end(), Less<true>(*this, nan_direction_hint));
}
std::sort(res.begin(), res.end(), less);
}
void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const
template <typename Comparator>
void ColumnArray::updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_range, Comparator cmp) const
{
if (equal_range.empty())
return;
@ -792,20 +801,19 @@ void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_directio
if (limit)
--number_of_ranges;
auto less = [&cmp](size_t lhs, size_t rhs){ return cmp(lhs, rhs) < 0; };
EqualRanges new_ranges;
for (size_t i = 0; i < number_of_ranges; ++i)
{
const auto & [first, last] = equal_range[i];
if (reverse)
std::sort(res.begin() + first, res.begin() + last, Less<false>(*this, nan_direction_hint));
else
std::sort(res.begin() + first, res.begin() + last, Less<true>(*this, nan_direction_hint));
std::sort(res.begin() + first, res.begin() + last, less);
auto new_first = first;
for (auto j = first + 1; j < last; ++j)
{
if (compareAt(res[new_first], res[j], *this, nan_direction_hint) != 0)
if (cmp(res[new_first], res[j]) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
@ -827,14 +835,11 @@ void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_directio
/// Since then we are working inside the interval.
if (reverse)
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, Less<false>(*this, nan_direction_hint));
else
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, Less<true>(*this, nan_direction_hint));
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less);
auto new_first = first;
for (auto j = first + 1; j < limit; ++j)
{
if (compareAt(res[new_first], res[j], *this, nan_direction_hint) != 0)
if (cmp(res[new_first], res[j]) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
@ -845,7 +850,7 @@ void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_directio
auto new_last = limit;
for (auto j = limit; j < last; ++j)
{
if (compareAt(res[new_first], res[j], *this, nan_direction_hint) == 0)
if (cmp(res[new_first], res[j]) == 0)
{
std::swap(res[new_last], res[j]);
++new_last;
@ -859,6 +864,39 @@ void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_directio
equal_range = std::move(new_ranges);
}
/// Builds the permutation for this column without collation.
/// Cmp<true> keeps the sign of the comparison result, Cmp<false> negates it (reverse order).
void ColumnArray::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
    if (!reverse)
    {
        getPermutationImpl(limit, res, Cmp<true>(*this, nan_direction_hint));
        return;
    }

    getPermutationImpl(limit, res, Cmp<false>(*this, nan_direction_hint));
}
/// Updates the permutation inside the given equal ranges without collation.
/// The comparator direction is selected at compile time from the `reverse` flag.
void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
    if (!reverse)
    {
        updatePermutationImpl(limit, res, equal_range, Cmp<true>(*this, nan_direction_hint));
        return;
    }

    updatePermutationImpl(limit, res, equal_range, Cmp<false>(*this, nan_direction_hint));
}
void ColumnArray::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
if (reverse)
getPermutationImpl(limit, res, Cmp<false>(*this, nan_direction_hint, &collator));
else
getPermutationImpl(limit, res, Cmp<true>(*this, nan_direction_hint, &collator));
}
void ColumnArray::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
if (reverse)
updatePermutationImpl(limit, res, equal_range, Cmp<false>(*this, nan_direction_hint, &collator));
else
updatePermutationImpl(limit, res, equal_range, Cmp<true>(*this, nan_direction_hint, &collator));
}
ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
{
if (replicate_offsets.empty())

View File

@ -77,8 +77,11 @@ public:
void compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator & collator) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_range) const override;
void reserve(size_t n) override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
@ -132,6 +135,8 @@ public:
return false;
}
/// An array supports collation exactly when its nested data column does.
bool isCollationSupported() const override
{
    return getData().isCollationSupported();
}
private:
WrappedPtr data;
WrappedPtr offsets;
@ -169,6 +174,17 @@ private:
ColumnPtr filterTuple(const Filter & filt, ssize_t result_size_hint) const;
ColumnPtr filterNullable(const Filter & filt, ssize_t result_size_hint) const;
ColumnPtr filterGeneric(const Filter & filt, ssize_t result_size_hint) const;
int compareAtImpl(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator * collator=nullptr) const;
template <typename Comparator>
void getPermutationImpl(size_t limit, Permutation & res, Comparator cmp) const;
template <typename Comparator>
void updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_range, Comparator cmp) const;
template <bool positive>
struct Cmp;
};

View File

@ -248,6 +248,8 @@ public:
/// The constant value. It is valid even if the size of the column is 0.
template <typename T>
T getValue() const { return getField().safeGet<NearestFieldType<T>>(); }
/// Forwards the question to the wrapped data column.
bool isCollationSupported() const override
{
    return data->isCollationSupported();
}
};
}

View File

@ -1,5 +1,6 @@
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnString.h>
#include <DataStreams/ColumnGathererStream.h>
#include <DataTypes/NumberTraits.h>
#include <Common/HashTable/HashMap.h>
@ -278,14 +279,26 @@ MutableColumnPtr ColumnLowCardinality::cloneResized(size_t size) const
return ColumnLowCardinality::create(IColumn::mutate(std::move(unique_ptr)), getIndexes().cloneResized(size));
}
int ColumnLowCardinality::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
/// Shared implementation of compareAt / compareAtWithCollation.
/// A null `collator` compares through the dictionaries; otherwise the dictionaries' nested columns
/// are compared with the collator.
int ColumnLowCardinality::compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator) const
{
    const auto & other = assert_cast<const ColumnLowCardinality &>(rhs);

    /// Translate both rows into positions inside their respective dictionaries.
    const size_t lhs_position = getIndexes().getUInt(n);
    const size_t rhs_position = other.getIndexes().getUInt(m);

    if (!collator)
        return getDictionary().compareAt(lhs_position, rhs_position, other.getDictionary(), nan_direction_hint);

    return getDictionary().getNestedColumn()->compareAtWithCollation(
        lhs_position, rhs_position, *other.getDictionary().getNestedColumn(), nan_direction_hint, *collator);
}
/// Compares row n of this column with row m of rhs without any collation.
int ColumnLowCardinality::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
{
    /// A null collator selects the plain dictionary comparison in compareAtImpl.
    const Collator * const no_collator = nullptr;
    return compareAtImpl(n, m, rhs, nan_direction_hint, no_collator);
}
/// Compares row n of this column with row m of rhs through the given collator.
int ColumnLowCardinality::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator & collator) const
{
    /// The shared implementation takes the collator as an optional pointer.
    const Collator * const collator_ptr = &collator;
    return compareAtImpl(n, m, rhs, nan_direction_hint, collator_ptr);
}
void ColumnLowCardinality::compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const
@ -295,14 +308,17 @@ void ColumnLowCardinality::compareColumn(const IColumn & rhs, size_t rhs_row_num
compare_results, direction, nan_direction_hint);
}
void ColumnLowCardinality::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
void ColumnLowCardinality::getPermutationImpl(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, const Collator * collator) const
{
if (limit == 0)
limit = size();
size_t unique_limit = getDictionary().size();
Permutation unique_perm;
getDictionary().getNestedColumn()->getPermutation(reverse, unique_limit, nan_direction_hint, unique_perm);
if (collator)
getDictionary().getNestedColumn()->getPermutationWithCollation(*collator, reverse, unique_limit, nan_direction_hint, unique_perm);
else
getDictionary().getNestedColumn()->getPermutation(reverse, unique_limit, nan_direction_hint, unique_perm);
/// TODO: optimize with sse.
@ -330,7 +346,8 @@ void ColumnLowCardinality::getPermutation(bool reverse, size_t limit, int nan_di
}
}
void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
template <typename Cmp>
void ColumnLowCardinality::updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_ranges, Cmp comparator) const
{
if (equal_ranges.empty())
return;
@ -345,20 +362,17 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
EqualRanges new_ranges;
SCOPE_EXIT({equal_ranges = std::move(new_ranges);});
auto less = [&comparator](size_t lhs, size_t rhs){ return comparator(lhs, rhs) < 0; };
for (size_t i = 0; i < number_of_ranges; ++i)
{
const auto& [first, last] = equal_ranges[i];
if (reverse)
std::sort(res.begin() + first, res.begin() + last, [this, nan_direction_hint](size_t a, size_t b)
{return getDictionary().compareAt(getIndexes().getUInt(a), getIndexes().getUInt(b), getDictionary(), nan_direction_hint) > 0; });
else
std::sort(res.begin() + first, res.begin() + last, [this, nan_direction_hint](size_t a, size_t b)
{return getDictionary().compareAt(getIndexes().getUInt(a), getIndexes().getUInt(b), getDictionary(), nan_direction_hint) < 0; });
std::sort(res.begin() + first, res.begin() + last, less);
auto new_first = first;
for (auto j = first + 1; j < last; ++j)
{
if (compareAt(res[new_first], res[j], *this, nan_direction_hint) != 0)
if (comparator(res[new_first], res[j]) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
@ -379,17 +393,12 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
/// Since then we are working inside the interval.
if (reverse)
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, [this, nan_direction_hint](size_t a, size_t b)
{return getDictionary().compareAt(getIndexes().getUInt(a), getIndexes().getUInt(b), getDictionary(), nan_direction_hint) > 0; });
else
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, [this, nan_direction_hint](size_t a, size_t b)
{return getDictionary().compareAt(getIndexes().getUInt(a), getIndexes().getUInt(b), getDictionary(), nan_direction_hint) < 0; });
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less);
auto new_first = first;
for (auto j = first + 1; j < limit; ++j)
{
if (getDictionary().compareAt(getIndexes().getUInt(res[new_first]), getIndexes().getUInt(res[j]), getDictionary(), nan_direction_hint) != 0)
if (comparator(res[new_first],res[j]) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
@ -401,7 +410,7 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
auto new_last = limit;
for (auto j = limit; j < last; ++j)
{
if (getDictionary().compareAt(getIndexes().getUInt(res[new_first]), getIndexes().getUInt(res[j]), getDictionary(), nan_direction_hint) == 0)
if (comparator(res[new_first], res[j]) == 0)
{
std::swap(res[new_last], res[j]);
++new_last;
@ -412,6 +421,38 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
}
}
void ColumnLowCardinality::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
getPermutationImpl(reverse, limit, nan_direction_hint, res);
}
/// Updates the permutation inside the given equal ranges without collation.
void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
{
    /// Three-way comparison of two rows through the dictionary; the sign is flipped for reverse order.
    auto compare_rows = [this, nan_direction_hint, reverse](size_t lhs, size_t rhs)
    {
        const int order = getDictionary().compareAt(
            getIndexes().getUInt(lhs), getIndexes().getUInt(rhs), getDictionary(), nan_direction_hint);

        if (reverse)
            return -order;
        return order;
    };

    updatePermutationImpl(limit, res, equal_ranges, compare_rows);
}
void ColumnLowCardinality::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
getPermutationImpl(reverse, limit, nan_direction_hint, res, &collator);
}
void ColumnLowCardinality::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_ranges) const
{
auto comparator = [this, &collator, reverse, nan_direction_hint](size_t lhs, size_t rhs)
{
int ret = getDictionary().getNestedColumn()->compareAtWithCollation(getIndexes().getUInt(lhs), getIndexes().getUInt(rhs), *getDictionary().getNestedColumn(), nan_direction_hint, collator);
return reverse ? -ret : ret;
};
updatePermutationImpl(limit, res, equal_ranges, comparator);
}
std::vector<MutableColumnPtr> ColumnLowCardinality::scatter(ColumnIndex num_columns, const Selector & selector) const
{
auto columns = getIndexes().scatter(num_columns, selector);

View File

@ -125,10 +125,16 @@ public:
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator &) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges & equal_range) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_range) const override;
ColumnPtr replicate(const Offsets & offsets) const override
{
return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().replicate(offsets));
@ -170,6 +176,7 @@ public:
size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); }
bool isNumeric() const override { return getDictionary().isNumeric(); }
bool lowCardinality() const override { return true; }
/// Collation is supported when the dictionary's nested column supports it.
bool isCollationSupported() const override
{
    return getDictionary().getNestedColumn()->isCollationSupported();
}
/**
* Checks if the dictionary column is Nullable(T).
@ -309,6 +316,13 @@ private:
void compactInplace();
void compactIfSharedDictionary();
int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const;
void getPermutationImpl(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, const Collator * collator = nullptr) const;
template <typename Cmp>
void updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_ranges, Cmp comparator) const;
};

View File

@ -6,6 +6,7 @@
#include <Common/WeakHash.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <DataStreams/ColumnGathererStream.h>
@ -223,7 +224,7 @@ ColumnPtr ColumnNullable::index(const IColumn & indexes, size_t limit) const
return ColumnNullable::create(indexed_data, indexed_null_map);
}
int ColumnNullable::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
int ColumnNullable::compareAtImpl(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint, const Collator * collator) const
{
/// NULL values share the properties of NaN values.
/// Here the last parameter of compareAt is called null_direction_hint
@ -245,9 +246,22 @@ int ColumnNullable::compareAt(size_t n, size_t m, const IColumn & rhs_, int null
}
const IColumn & nested_rhs = nullable_rhs.getNestedColumn();
if (collator)
return getNestedColumn().compareAtWithCollation(n, m, nested_rhs, null_direction_hint, *collator);
return getNestedColumn().compareAt(n, m, nested_rhs, null_direction_hint);
}
/// Compares row n of this column with row m of rhs_ without any collation.
int ColumnNullable::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
{
    /// A null collator makes compareAtImpl use the nested column's plain compareAt.
    const Collator * const no_collator = nullptr;
    return compareAtImpl(n, m, rhs_, null_direction_hint, no_collator);
}
/// Compares row n of this column with row m of rhs_, comparing nested values through the given collator.
int ColumnNullable::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint, const Collator & collator) const
{
    /// The shared implementation takes the collator as an optional pointer.
    const Collator * const collator_ptr = &collator;
    return compareAtImpl(n, m, rhs_, null_direction_hint, collator_ptr);
}
void ColumnNullable::compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const
@ -256,10 +270,14 @@ void ColumnNullable::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
void ColumnNullable::getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
void ColumnNullable::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
{
/// Cannot pass limit because of unknown amount of NULLs.
getNestedColumn().getPermutation(reverse, 0, null_direction_hint, res);
if (collator)
getNestedColumn().getPermutationWithCollation(*collator, reverse, 0, null_direction_hint, res);
else
getNestedColumn().getPermutation(reverse, 0, null_direction_hint, res);
if ((null_direction_hint > 0) != reverse)
{
@ -329,7 +347,7 @@ void ColumnNullable::getPermutation(bool reverse, size_t limit, int null_directi
}
}
void ColumnNullable::updatePermutation(bool reverse, size_t limit, int null_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
void ColumnNullable::updatePermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_ranges, const Collator * collator) const
{
if (equal_ranges.empty())
return;
@ -432,12 +450,35 @@ void ColumnNullable::updatePermutation(bool reverse, size_t limit, int null_dire
}
}
getNestedColumn().updatePermutation(reverse, limit, null_direction_hint, res, new_ranges);
if (collator)
getNestedColumn().updatePermutationWithCollation(*collator, reverse, limit, null_direction_hint, res, new_ranges);
else
getNestedColumn().updatePermutation(reverse, limit, null_direction_hint, res, new_ranges);
equal_ranges = std::move(new_ranges);
std::move(null_ranges.begin(), null_ranges.end(), std::back_inserter(equal_ranges));
}
void ColumnNullable::getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
getPermutationImpl(reverse, limit, null_direction_hint, res);
}
void ColumnNullable::updatePermutation(bool reverse, size_t limit, int null_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
{
updatePermutationImpl(reverse, limit, null_direction_hint, res, equal_ranges);
}
void ColumnNullable::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
getPermutationImpl(reverse, limit, null_direction_hint, res, &collator);
}
void ColumnNullable::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
updatePermutationImpl(reverse, limit, null_direction_hint, res, equal_range, &collator);
}
void ColumnNullable::gather(ColumnGathererStream & gatherer)
{
gatherer.gather(*this);

View File

@ -6,6 +6,7 @@
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
class Collator;
namespace DB
{
@ -92,8 +93,12 @@ public:
void compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator &) const override;
void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;
void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutationWithCollation(
const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const override;
void reserve(size_t n) override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
@ -129,6 +134,7 @@ public:
bool valuesHaveFixedSize() const override { return nested_column->valuesHaveFixedSize(); }
size_t sizeOfValueIfFixed() const override { return null_map->sizeOfValueIfFixed() + nested_column->sizeOfValueIfFixed(); }
bool onlyNull() const override { return nested_column->isDummy(); }
/// Collation is supported when the nested column supports it.
bool isCollationSupported() const override
{
    return nested_column->isCollationSupported();
}
/// Return the column that represents values.
@ -164,6 +170,13 @@ private:
template <bool negative>
void applyNullMapImpl(const ColumnUInt8 & map);
int compareAtImpl(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint, const Collator * collator=nullptr) const;
void getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator = nullptr) const;
void updatePermutationImpl(
bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_ranges, const Collator * collator = nullptr) const;
};
ColumnPtr makeNullable(const ColumnPtr & column);

View File

@ -285,21 +285,22 @@ void ColumnString::compareColumn(
}
template <bool positive>
struct ColumnString::less
struct ColumnString::Cmp
{
const ColumnString & parent;
explicit less(const ColumnString & parent_) : parent(parent_) {}
bool operator()(size_t lhs, size_t rhs) const
explicit Cmp(const ColumnString & parent_) : parent(parent_) {}
int operator()(size_t lhs, size_t rhs) const
{
int res = memcmpSmallAllowOverflow15(
parent.chars.data() + parent.offsetAt(lhs), parent.sizeAt(lhs) - 1,
parent.chars.data() + parent.offsetAt(rhs), parent.sizeAt(rhs) - 1);
return positive ? (res < 0) : (res > 0);
return positive ? res : -res;
}
};
void ColumnString::getPermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res) const
template <typename Comparator>
void ColumnString::getPermutationImpl(size_t limit, Permutation & res, Comparator cmp) const
{
size_t s = offsets.size();
res.resize(s);
@ -309,23 +310,16 @@ void ColumnString::getPermutation(bool reverse, size_t limit, int /*nan_directio
if (limit >= s)
limit = 0;
auto less = [&cmp](size_t lhs, size_t rhs){ return cmp(lhs, rhs) < 0; };
if (limit)
{
if (reverse)
std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this));
else
std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this));
}
std::partial_sort(res.begin(), res.begin() + limit, res.end(), less);
else
{
if (reverse)
std::sort(res.begin(), res.end(), less<false>(*this));
else
std::sort(res.begin(), res.end(), less<true>(*this));
}
std::sort(res.begin(), res.end(), less);
}
void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res, EqualRanges & equal_ranges) const
template <typename Comparator>
void ColumnString::updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_ranges, Comparator cmp) const
{
if (equal_ranges.empty())
return;
@ -340,21 +334,17 @@ void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direc
if (limit)
--number_of_ranges;
auto less = [&cmp](size_t lhs, size_t rhs){ return cmp(lhs, rhs) < 0; };
for (size_t i = 0; i < number_of_ranges; ++i)
{
const auto & [first, last] = equal_ranges[i];
if (reverse)
std::sort(res.begin() + first, res.begin() + last, less<false>(*this));
else
std::sort(res.begin() + first, res.begin() + last, less<true>(*this));
std::sort(res.begin() + first, res.begin() + last, less);
size_t new_first = first;
for (size_t j = first + 1; j < last; ++j)
{
if (memcmpSmallAllowOverflow15(
chars.data() + offsetAt(res[j]), sizeAt(res[j]) - 1,
chars.data() + offsetAt(res[new_first]), sizeAt(res[new_first]) - 1) != 0)
if (cmp(res[j], res[new_first]) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
@ -375,17 +365,12 @@ void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direc
/// Since then we are working inside the interval.
if (reverse)
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less<false>(*this));
else
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less<true>(*this));
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less);
size_t new_first = first;
for (size_t j = first + 1; j < limit; ++j)
{
if (memcmpSmallAllowOverflow15(
chars.data() + offsetAt(res[j]), sizeAt(res[j]) - 1,
chars.data() + offsetAt(res[new_first]), sizeAt(res[new_first]) - 1) != 0)
if (cmp(res[j], res[new_first]) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
@ -395,9 +380,7 @@ void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direc
size_t new_last = limit;
for (size_t j = limit; j < last; ++j)
{
if (memcmpSmallAllowOverflow15(
chars.data() + offsetAt(res[j]), sizeAt(res[j]) - 1,
chars.data() + offsetAt(res[new_first]), sizeAt(res[new_first]) - 1) == 0)
if (cmp(res[j], res[new_first]) == 0)
{
std::swap(res[j], res[new_last]);
++new_last;
@ -408,6 +391,56 @@ void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direc
}
}
/// Builds the permutation for this column using the byte-wise Cmp comparator.
/// Cmp<true> keeps the sign of the comparison result, Cmp<false> negates it (reverse order).
void ColumnString::getPermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res) const
{
    if (!reverse)
    {
        getPermutationImpl(limit, res, Cmp<true>(*this));
        return;
    }

    getPermutationImpl(limit, res, Cmp<false>(*this));
}
/// Updates the permutation inside the given equal ranges using the byte-wise Cmp comparator.
void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res, EqualRanges & equal_ranges) const
{
    if (!reverse)
    {
        updatePermutationImpl(limit, res, equal_ranges, Cmp<true>(*this));
        return;
    }

    updatePermutationImpl(limit, res, equal_ranges, Cmp<false>(*this));
}
/// Three-way comparator over rows of a ColumnString that delegates to a Collator.
/// With positive == true the collator's result is returned as is; otherwise it is negated.
template <bool positive>
struct ColumnString::CmpWithCollation
{
    const ColumnString & parent;
    const Collator & collator;

    CmpWithCollation(const ColumnString & parent_, const Collator & collator_)
        : parent(parent_)
        , collator(collator_)
    {
    }

    int operator()(size_t lhs, size_t rhs) const
    {
        const char * lhs_data = reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(lhs)]);
        const char * rhs_data = reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(rhs)]);

        /// Sizes are passed exactly as reported by sizeAt().
        const int order = collator.compare(lhs_data, parent.sizeAt(lhs), rhs_data, parent.sizeAt(rhs));

        if constexpr (positive)
            return order;
        else
            return -order;
    }
};
/// Builds a sorting permutation for the column, comparing rows through `collator`.
/// `reverse` selects the negated comparator; the unnamed int (NaN direction hint) is unused.
void ColumnString::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation & res) const
{
    if (!reverse)
    {
        getPermutationImpl(limit, res, CmpWithCollation<true>(*this, collator));
        return;
    }

    getPermutationImpl(limit, res, CmpWithCollation<false>(*this, collator));
}
/// Refines an existing permutation inside `equal_ranges`, comparing rows through `collator`.
/// `reverse` selects the negated comparator; the unnamed int (NaN direction hint) is unused.
void ColumnString::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_ranges) const
{
    if (!reverse)
    {
        updatePermutationImpl(limit, res, equal_ranges, CmpWithCollation<true>(*this, collator));
        return;
    }

    updatePermutationImpl(limit, res, equal_ranges, CmpWithCollation<false>(*this, collator));
}
ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const
{
size_t col_size = size();
@ -476,13 +509,13 @@ void ColumnString::getExtremes(Field & min, Field & max) const
size_t min_idx = 0;
size_t max_idx = 0;
less<true> less_op(*this);
Cmp<true> cmp_op(*this);
for (size_t i = 1; i < col_size; ++i)
{
if (less_op(i, min_idx))
if (cmp_op(i, min_idx) < 0)
min_idx = i;
else if (less_op(max_idx, i))
else if (cmp_op(max_idx, i) < 0)
max_idx = i;
}
@ -491,7 +524,7 @@ void ColumnString::getExtremes(Field & min, Field & max) const
}
int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const
int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const
{
const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_);
@ -500,134 +533,6 @@ int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs
reinterpret_cast<const char *>(&rhs.chars[rhs.offsetAt(m)]), rhs.sizeAt(m));
}
/// Strict-ordering predicate over row indices of a ColumnString that delegates to a Collator.
/// With positive == true it answers "lhs sorts before rhs"; with positive == false the order is inverted.
template <bool positive>
struct ColumnString::lessWithCollation
{
    const ColumnString & parent;
    const Collator & collator;

    lessWithCollation(const ColumnString & parent_, const Collator & collator_)
        : parent(parent_)
        , collator(collator_)
    {
    }

    bool operator()(size_t lhs, size_t rhs) const
    {
        const char * lhs_data = reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(lhs)]);
        const char * rhs_data = reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(rhs)]);

        const int order = collator.compare(lhs_data, parent.sizeAt(lhs), rhs_data, parent.sizeAt(rhs));

        if (positive)
            return order < 0;
        return order > 0;
    }
};
/// Fills `res` with the identity permutation and sorts it with the collation-aware predicate.
/// A non-zero `limit` smaller than the number of rows requests only the first `limit` positions
/// to be sorted (partial sort); otherwise the whole permutation is sorted.
void ColumnString::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const
{
    const size_t rows = offsets.size();
    res.resize(rows);
    for (size_t row = 0; row < rows; ++row)
        res[row] = row;

    /// A limit that covers every row is the same as having no limit.
    const bool partial = limit != 0 && limit < rows;

    if (partial && reverse)
        std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<false>(*this, collator));
    else if (partial)
        std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<true>(*this, collator));
    else if (reverse)
        std::sort(res.begin(), res.end(), lessWithCollation<false>(*this, collator));
    else
        std::sort(res.begin(), res.end(), lessWithCollation<true>(*this, collator));
}
/// Refines the permutation `res` inside every range of `equal_ranges` using collation-aware
/// comparison, and replaces `equal_ranges` with the sub-ranges whose rows still compare equal
/// under `collator`. A non-zero `limit` means only the first `limit` positions need to be ordered.
/// The unnamed int parameter is not used.
void ColumnString::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_ranges) const
{
/// Nothing to refine.
if (equal_ranges.empty())
return;
/// A limit at or beyond the column size / the end of the last range is treated as "no limit".
if (limit >= size() || limit >= equal_ranges.back().second)
limit = 0;
size_t number_of_ranges = equal_ranges.size();
/// With a limit, the last range is handled separately below (partial sort).
if (limit)
--number_of_ranges;
EqualRanges new_ranges;
/// NOTE(review): SCOPE_EXIT presumably runs on every exit from this function (including the
/// early return below), so equal_ranges always ends up holding new_ranges — confirm macro semantics.
SCOPE_EXIT({equal_ranges = std::move(new_ranges);});
for (size_t i = 0; i < number_of_ranges; ++i)
{
const auto& [first, last] = equal_ranges[i];
/// Fully sort the range with the collation-aware predicate.
if (reverse)
std::sort(res.begin() + first, res.begin() + last, lessWithCollation<false>(*this, collator));
else
std::sort(res.begin() + first, res.begin() + last, lessWithCollation<true>(*this, collator));
/// Scan the sorted range and record every run of more than one collation-equal rows.
auto new_first = first;
for (auto j = first + 1; j < last; ++j)
{
if (collator.compare(
reinterpret_cast<const char *>(&chars[offsetAt(res[new_first])]), sizeAt(res[new_first]),
reinterpret_cast<const char *>(&chars[offsetAt(res[j])]), sizeAt(res[j])) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
new_first = j;
}
}
/// Trailing run of the range.
if (last - new_first > 1)
new_ranges.emplace_back(new_first, last);
}
if (limit)
{
const auto & [first, last] = equal_ranges.back();
/// The limit does not fall inside the last range: nothing more to do.
if (limit < first || limit > last)
return;
/// Since then we are working inside the interval.
if (reverse)
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, lessWithCollation<false>(*this, collator));
else
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, lessWithCollation<true>(*this, collator));
/// Record runs of collation-equal rows within the sorted prefix [first, limit).
auto new_first = first;
for (auto j = first + 1; j < limit; ++j)
{
if (collator.compare(
reinterpret_cast<const char *>(&chars[offsetAt(res[new_first])]), sizeAt(res[new_first]),
reinterpret_cast<const char *>(&chars[offsetAt(res[j])]), sizeAt(res[j])) != 0)
{
if (j - new_first > 1)
new_ranges.emplace_back(new_first, j);
new_first = j;
}
}
/// Rows in the unsorted tail [limit, last) that are equal to the head of the last run
/// are swapped to positions starting at `limit`, so that they extend that run.
auto new_last = limit;
for (auto j = limit; j < last; ++j)
{
if (collator.compare(
reinterpret_cast<const char *>(&chars[offsetAt(res[new_first])]), sizeAt(res[new_first]),
reinterpret_cast<const char *>(&chars[offsetAt(res[j])]), sizeAt(res[j])) == 0)
{
std::swap(res[new_last], res[j]);
++new_last;
}
}
if (new_last - new_first > 1)
new_ranges.emplace_back(new_first, new_last);
}
}
void ColumnString::protect()
{
getChars().protect();

View File

@ -43,14 +43,20 @@ private:
/// Size of the i-th element, computed as the difference between its offset and the previous one.
/// NOTE(review): raw memcmp-based callers pass sizeAt(...) - 1, so the value presumably includes
/// one trailing terminator byte — confirm.
size_t ALWAYS_INLINE sizeAt(ssize_t i) const { return offsets[i] - offsets[i - 1]; }
template <bool positive>
struct less;
struct Cmp;
template <bool positive>
struct lessWithCollation;
struct CmpWithCollation;
ColumnString() = default;
ColumnString(const ColumnString & src);
template <typename Comparator>
void getPermutationImpl(size_t limit, Permutation & res, Comparator cmp) const;
template <typename Comparator>
void updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_ranges, Comparator cmp) const;
public:
const char * getFamilyName() const override { return "String"; }
TypeIndex getDataType() const override { return TypeIndex::String; }
@ -229,16 +235,16 @@ public:
int direction, int nan_direction_hint) const override;
/// Variant of compareAt for string comparison with respect to collation.
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;
void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_ranges) const override;
/// Sorting with respect to collation.
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation & res) const override;
void updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation & res, EqualRanges& equal_range) const;
void updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_ranges) const override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
@ -270,6 +276,8 @@ public:
// Throws an exception if offsets/chars are messed up
void validate() const;
/// String columns can be compared and sorted through a Collator (see the *WithCollation methods).
bool isCollationSupported() const override { return true; }
};

View File

@ -275,16 +275,27 @@ MutableColumns ColumnTuple::scatter(ColumnIndex num_columns, const Selector & se
return res;
}
int ColumnTuple::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
int ColumnTuple::compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator) const
{
const size_t tuple_size = columns.size();
for (size_t i = 0; i < tuple_size; ++i)
if (int res = columns[i]->compareAt(n, m, *assert_cast<const ColumnTuple &>(rhs).columns[i], nan_direction_hint))
{
int res;
if (collator && columns[i]->isCollationSupported())
res = columns[i]->compareAtWithCollation(n, m, *assert_cast<const ColumnTuple &>(rhs).columns[i], nan_direction_hint, *collator);
else
res = columns[i]->compareAt(n, m, *assert_cast<const ColumnTuple &>(rhs).columns[i], nan_direction_hint);
if (res)
return res;
}
return 0;
}
/// Plain element-wise comparison of row n of this tuple with row m of `rhs`; no collator is used.
int ColumnTuple::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
{
    const Collator * no_collator = nullptr;
    return compareAtImpl(n, m, rhs, nan_direction_hint, no_collator);
}
void ColumnTuple::compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const
@ -293,14 +304,20 @@ void ColumnTuple::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
/// Element-wise comparison of row n of this tuple with row m of `rhs`;
/// the collator is forwarded to the shared implementation.
int ColumnTuple::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator & collator) const
{
    const Collator * collator_ptr = &collator;
    return compareAtImpl(n, m, rhs, nan_direction_hint, collator_ptr);
}
template <bool positive>
struct ColumnTuple::Less
{
TupleColumns columns;
int nan_direction_hint;
const Collator * collator;
Less(const TupleColumns & columns_, int nan_direction_hint_)
: columns(columns_), nan_direction_hint(nan_direction_hint_)
Less(const TupleColumns & columns_, int nan_direction_hint_, const Collator * collator_=nullptr)
: columns(columns_), nan_direction_hint(nan_direction_hint_), collator(collator_)
{
}
@ -308,7 +325,11 @@ struct ColumnTuple::Less
{
for (const auto & column : columns)
{
int res = column->compareAt(a, b, *column, nan_direction_hint);
int res;
if (collator && column->isCollationSupported())
res = column->compareAtWithCollation(a, b, *column, nan_direction_hint, *collator);
else
res = column->compareAt(a, b, *column, nan_direction_hint);
if (res < 0)
return positive;
else if (res > 0)
@ -318,7 +339,8 @@ struct ColumnTuple::Less
}
};
void ColumnTuple::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
template <typename LessOperator>
void ColumnTuple::getPermutationImpl(size_t limit, Permutation & res, LessOperator less) const
{
size_t rows = size();
res.resize(rows);
@ -330,28 +352,25 @@ void ColumnTuple::getPermutation(bool reverse, size_t limit, int nan_direction_h
if (limit)
{
if (reverse)
std::partial_sort(res.begin(), res.begin() + limit, res.end(), Less<false>(columns, nan_direction_hint));
else
std::partial_sort(res.begin(), res.begin() + limit, res.end(), Less<true>(columns, nan_direction_hint));
std::partial_sort(res.begin(), res.begin() + limit, res.end(), less);
}
else
{
if (reverse)
std::sort(res.begin(), res.end(), Less<false>(columns, nan_direction_hint));
else
std::sort(res.begin(), res.end(), Less<true>(columns, nan_direction_hint));
std::sort(res.begin(), res.end(), less);
}
}
void ColumnTuple::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
void ColumnTuple::updatePermutationImpl(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges, const Collator * collator) const
{
if (equal_ranges.empty())
return;
for (const auto & column : columns)
{
column->updatePermutation(reverse, limit, nan_direction_hint, res, equal_ranges);
if (collator && column->isCollationSupported())
column->updatePermutationWithCollation(*collator, reverse, limit, nan_direction_hint, res, equal_ranges);
else
column->updatePermutation(reverse, limit, nan_direction_hint, res, equal_ranges);
while (limit && !equal_ranges.empty() && limit <= equal_ranges.back().first)
equal_ranges.pop_back();
@ -361,6 +380,32 @@ void ColumnTuple::updatePermutation(bool reverse, size_t limit, int nan_directio
}
}
/// Builds a sorting permutation for the tuple column without collation.
/// `reverse` selects the inverted Less predicate.
void ColumnTuple::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
    if (!reverse)
    {
        getPermutationImpl(limit, res, Less<true>(columns, nan_direction_hint));
        return;
    }

    getPermutationImpl(limit, res, Less<false>(columns, nan_direction_hint));
}
void ColumnTuple::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
{
updatePermutationImpl(reverse, limit, nan_direction_hint, res, equal_ranges);
}
void ColumnTuple::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
if (reverse)
getPermutationImpl(limit, res, Less<false>(columns, nan_direction_hint, &collator));
else
getPermutationImpl(limit, res, Less<true>(columns, nan_direction_hint, &collator));
}
void ColumnTuple::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_ranges) const
{
updatePermutationImpl(reverse, limit, nan_direction_hint, res, equal_ranges, &collator);
}
void ColumnTuple::gather(ColumnGathererStream & gatherer)
{
gatherer.gather(*this);
@ -433,5 +478,15 @@ bool ColumnTuple::structureEquals(const IColumn & rhs) const
return false;
}
bool ColumnTuple::isCollationSupported() const
{
for (const auto& column : columns)
{
if (column->isCollationSupported())
return true;
}
return false;
}
}

View File

@ -75,15 +75,19 @@ public:
void compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator & collator) const override;
void getExtremes(Field & min, Field & max) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
void reserve(size_t n) override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void protect() override;
void forEachSubcolumn(ColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
bool isCollationSupported() const override;
/// Number of element columns in the tuple.
size_t tupleSize() const { return columns.size(); }
@ -94,6 +98,15 @@ public:
/// Returns a new Columns container built from the tuple's element columns.
Columns getColumnsCopy() const { return {columns.begin(), columns.end()}; }
/// Returns a reference to the element column at position idx (no bounds check is visible here).
const ColumnPtr & getColumnPtr(size_t idx) const { return columns[idx]; }
private:
int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const;
template <typename LessOperator>
void getPermutationImpl(size_t limit, Permutation & res, LessOperator less) const;
void updatePermutationImpl(
bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges, const Collator * collator=nullptr) const;
};

View File

@ -9,7 +9,7 @@
class SipHash;
class Collator;
namespace DB
{
@ -18,6 +18,7 @@ namespace ErrorCodes
{
extern const int CANNOT_GET_SIZE_OF_FIELD;
extern const int NOT_IMPLEMENTED;
extern const int BAD_COLLATION;
}
class Arena;
@ -250,6 +251,12 @@ public:
*/
virtual int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const = 0;
/// Equivalent to compareAt, but a collator is used to compare values.
/// The base implementation always throws BAD_COLLATION; column types that support
/// collation are expected to override it (see isCollationSupported).
virtual int compareAtWithCollation(size_t, size_t, const IColumn &, int, const Collator &) const
{
    /// Message kept identical to the other collation defaults ("containing them").
    throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION);
}
/// Compare the whole column with single value from rhs column.
/// If row_indexes is nullptr, it's ignored. Otherwise, it is a set of rows to compare.
/// compare_results[i] will be equal to compareAt(row_indexes[i], rhs_row_num, rhs, nan_direction_hint) * direction
@ -277,6 +284,18 @@ public:
*/
virtual void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_ranges) const = 0;
/** Equivalent to getPermutation and updatePermutation, but a collator is used to compare values.
* Supported for String, LowCardinality(String), Nullable(String) and for Array and Tuple, containing them.
* The base implementation always throws BAD_COLLATION.
*/
virtual void getPermutationWithCollation(const Collator &, bool, size_t, int, Permutation &) const
{
throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION);
}
/// Collation-aware counterpart of updatePermutation.
/// The base implementation always throws BAD_COLLATION.
virtual void updatePermutationWithCollation(const Collator &, bool, size_t, int, Permutation &, EqualRanges&) const
{
throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION);
}
/** Copies each element according offsets parameter.
* (i-th element should be copied offsets[i] - offsets[i - 1] times.)
* It is necessary in ARRAY JOIN operation.
@ -402,6 +421,8 @@ public:
virtual bool lowCardinality() const { return false; }
/// Whether the *WithCollation comparison and sorting methods may be used on this column.
/// The base implementation reports no support.
virtual bool isCollationSupported() const { return false; }
virtual ~IColumn() = default;
IColumn() = default;
IColumn(const IColumn &) = default;

View File

@ -96,7 +96,7 @@ struct SortCursorImpl
: column_desc.column_number;
sort_columns.push_back(columns[column_number].get());
need_collation[j] = desc[j].collator != nullptr && typeid_cast<const ColumnString *>(sort_columns.back()); /// TODO Nullable(String)
need_collation[j] = desc[j].collator != nullptr && sort_columns.back()->isCollationSupported(); /// TODO Nullable(String)
has_collation |= need_collation[j];
}
@ -201,10 +201,7 @@ struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
int nulls_direction = desc.nulls_direction;
int res;
if (impl->need_collation[i])
{
const ColumnString & column_string = assert_cast<const ColumnString &>(*impl->sort_columns[i]);
res = column_string.compareAtWithCollation(lhs_pos, rhs_pos, *(rhs.impl->sort_columns[i]), *impl->desc[i].collator);
}
res = impl->sort_columns[i]->compareAtWithCollation(lhs_pos, rhs_pos, *(rhs.impl->sort_columns[i]), nulls_direction, *impl->desc[i].collator);
else
res = impl->sort_columns[i]->compareAt(lhs_pos, rhs_pos, *(rhs.impl->sort_columns[i]), nulls_direction);

View File

@ -2,6 +2,8 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnLowCardinality.h>
#include <Common/typeid_cast.h>
#include <Functions/FunctionHelpers.h>
@ -86,8 +88,7 @@ struct PartialSortingLessWithCollation
}
else if (isCollationRequired(elem.description))
{
const ColumnString & column_string = assert_cast<const ColumnString &>(*elem.column);
res = column_string.compareAtWithCollation(a, b, *elem.column, *elem.description.collator);
res = elem.column->compareAtWithCollation(a, b, *elem.column, elem.description.nulls_direction, *elem.description.collator);
}
else
res = elem.column->compareAt(a, b, *elem.column, elem.description.nulls_direction);
@ -101,7 +102,6 @@ struct PartialSortingLessWithCollation
}
};
void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
{
if (!block)
@ -120,14 +120,13 @@ void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
bool is_column_const = false;
if (isCollationRequired(description[0]))
{
/// If it's a real string column, then we need to sort it
if (const ColumnString * column_string = checkAndGetColumn<ColumnString>(column))
column_string->getPermutationWithCollation(*description[0].collator, reverse, limit, perm);
else if (checkAndGetColumnConstData<ColumnString>(column))
if (!column->isCollationSupported())
throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION);
if (isColumnConst(*column))
is_column_const = true;
else
throw Exception("Collations could be specified only for String columns.", ErrorCodes::BAD_COLLATION);
column->getPermutationWithCollation(*description[0].collator, reverse, limit, description[0].nulls_direction, perm);
}
else if (!isColumnConst(*column))
{
@ -163,8 +162,8 @@ void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
const IColumn * column = columns_with_sort_desc[i].column;
if (isCollationRequired(description[i]))
{
if (!checkAndGetColumn<ColumnString>(column) && !checkAndGetColumnConstData<ColumnString>(column))
throw Exception("Collations could be specified only for String columns.", ErrorCodes::BAD_COLLATION);
if (!column->isCollationSupported())
throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION);
need_collation = true;
}
@ -187,10 +186,8 @@ void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
if (isCollationRequired(column.description))
{
const ColumnString & column_string = assert_cast<const ColumnString &>(*column.column);
column_string.updatePermutationWithCollation(
*column.description.collator,
column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges);
column.column->updatePermutationWithCollation(
*column.description.collator, column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges);
}
else
{

View File

@ -0,0 +1,64 @@
Order by without collate
1 Ё
2 А
2 Я
1 а
2 я
1 ё
Order by with collate
1 а
2 А
1 ё
1 Ё
2 я
2 Я
Order by tuple without collate
1 Ё
1 а
1 ё
2 А
2 Я
2 я
Order by tuple with collate
1 а
1 ё
1 Ё
2 А
2 я
2 Я
Order by without collate
1 Ё
2 А
2 Я
1 а
2 я
1 ё
1 \N
2 \N
Order by with collate
1 а
2 А
1 ё
1 Ё
2 я
2 Я
1 \N
2 \N
Order by tuple without collate
1 Ё
1 а
1 ё
1 \N
2 А
2 Я
2 я
2 \N
Order by tuple with collate
1 а
1 ё
1 Ё
1 \N
2 А
2 я
2 Я
2 \N

View File

@ -0,0 +1,33 @@
-- Checks ORDER BY ... COLLATE 'ru' on LowCardinality(String) and LowCardinality(Nullable(String)) columns.
DROP TABLE IF EXISTS test_collate;
DROP TABLE IF EXISTS test_collate_null;
CREATE TABLE test_collate (x UInt32, s LowCardinality(String)) ENGINE=Memory();
CREATE TABLE test_collate_null (x UInt32, s LowCardinality(Nullable(String))) ENGINE=Memory();
-- Mixed-case Cyrillic values; the second table additionally contains NULLs.
INSERT INTO test_collate VALUES (1, 'Ё'), (1, 'ё'), (1, 'а'), (2, 'А'), (2, 'я'), (2, 'Я');
INSERT INTO test_collate_null VALUES (1, 'Ё'), (1, 'ё'), (1, 'а'), (2, 'А'), (2, 'я'), (2, 'Я'), (1, null), (2, null);
-- Non-nullable column: single-key and two-key ordering, each without and with collation.
SELECT 'Order by without collate';
SELECT * FROM test_collate ORDER BY s;
SELECT 'Order by with collate';
SELECT * FROM test_collate ORDER BY s COLLATE 'ru';
SELECT 'Order by tuple without collate';
SELECT * FROM test_collate ORDER BY x, s;
SELECT 'Order by tuple with collate';
SELECT * FROM test_collate ORDER BY x, s COLLATE 'ru';
-- Same four queries against the nullable column.
SELECT 'Order by without collate';
SELECT * FROM test_collate_null ORDER BY s;
SELECT 'Order by with collate';
SELECT * FROM test_collate_null ORDER BY s COLLATE 'ru';
SELECT 'Order by tuple without collate';
SELECT * FROM test_collate_null ORDER BY x, s;
SELECT 'Order by tuple with collate';
SELECT * FROM test_collate_null ORDER BY x, s COLLATE 'ru';
DROP TABLE test_collate;
DROP TABLE test_collate_null;

View File

@ -0,0 +1,36 @@
Order by without collate
1 Ё
2 А
2 Я
1 а
2 я
1 ё
1 \N
2 \N
Order by with collate
1 а
2 А
1 ё
1 Ё
2 я
2 Я
1 \N
2 \N
Order by tuple without collate
1 Ё
1 а
1 ё
1 \N
2 А
2 Я
2 я
2 \N
Order by tuple with collate
1 а
1 ё
1 Ё
1 \N
2 А
2 я
2 Я
2 \N

View File

@ -0,0 +1,18 @@
-- Checks ORDER BY ... COLLATE 'ru' on a Nullable(String) column.
DROP TABLE IF EXISTS test_collate;
CREATE TABLE test_collate (x UInt32, s Nullable(String)) ENGINE=Memory();
-- Mixed-case Cyrillic values plus one NULL per value of x.
INSERT INTO test_collate VALUES (1, 'Ё'), (1, 'ё'), (1, 'а'), (1, null), (2, 'А'), (2, 'я'), (2, 'Я'), (2, null);
-- Single-key and two-key ordering, each without and with collation.
SELECT 'Order by without collate';
SELECT * FROM test_collate ORDER BY s;
SELECT 'Order by with collate';
SELECT * FROM test_collate ORDER BY s COLLATE 'ru';
SELECT 'Order by tuple without collate';
SELECT * FROM test_collate ORDER BY x, s;
SELECT 'Order by tuple with collate';
SELECT * FROM test_collate ORDER BY x, s COLLATE 'ru';
DROP TABLE test_collate;

View File

@ -0,0 +1,50 @@
1 ['а']
2 ['А']
1 ['ё']
1 ['ё','а']
2 ['ё','а','а']
1 ['ё','я']
1 ['Ё']
2 ['я','а']
2 ['Я']
1 ['а']
1 ['ё']
1 ['ё','а']
1 ['ё','я']
1 ['Ё']
2 ['А']
2 ['ё','а','а']
2 ['я','а']
2 ['Я']
1 ['а']
2 ['А']
1 ['ё']
1 ['ё','а']
2 ['ё','а','а',NULL]
1 ['ё',NULL,'я']
1 ['Ё']
2 ['я']
2 [NULL,'Я']
1 ['а']
1 ['ё']
1 ['ё','а']
1 ['ё',NULL,'я']
1 ['Ё']
2 ['А']
2 ['ё','а','а',NULL]
2 ['я']
2 [NULL,'Я']
2 [['а','а'],['я','ё']]
1 [['а','Ё'],['ё','я']]
1 [['а','я'],['а','ё']]
2 [['ё']]
1 [['а','Ё'],['ё','я']]
1 [['а','я'],['а','ё']]
2 [['а','а'],['я','ё']]
2 [['ё']]

View File

@ -0,0 +1,34 @@
-- Checks ORDER BY ... COLLATE 'ru' on Array columns:
-- Array(String), Array(LowCardinality(Nullable(String))) and Array(Array(String)).
DROP TABLE IF EXISTS collate_test1;
DROP TABLE IF EXISTS collate_test2;
DROP TABLE IF EXISTS collate_test3;
CREATE TABLE collate_test1 (x UInt32, s Array(String)) ENGINE=Memory();
CREATE TABLE collate_test2 (x UInt32, s Array(LowCardinality(Nullable(String)))) ENGINE=Memory();
CREATE TABLE collate_test3 (x UInt32, s Array(Array(String))) ENGINE=Memory();
-- Arrays of different lengths with mixed-case Cyrillic values (and NULL elements in collate_test2).
INSERT INTO collate_test1 VALUES (1, ['Ё']), (1, ['ё']), (1, ['а']), (2, ['А']), (2, ['я', 'а']), (2, ['Я']), (1, ['ё','а']), (1, ['ё', 'я']), (2, ['ё', 'а', 'а']);
INSERT INTO collate_test2 VALUES (1, ['Ё']), (1, ['ё']), (1, ['а']), (2, ['А']), (2, ['я']), (2, [null, 'Я']), (1, ['ё','а']), (1, ['ё', null, 'я']), (2, ['ё', 'а', 'а', null]);
INSERT INTO collate_test3 VALUES (1, [['а', 'я'], ['а', 'ё']]), (1, [['а', 'Ё'], ['ё', 'я']]), (2, [['ё']]), (2, [['а', 'а'], ['я', 'ё']]);
-- For every table: collated single-key ordering, then ordering by x followed by the collated key.
-- The empty-string SELECTs emit a blank line between result sets.
SELECT * FROM collate_test1 ORDER BY s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test1 ORDER BY x, s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test2 ORDER BY s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test2 ORDER BY x, s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test3 ORDER BY s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test3 ORDER BY x, s COLLATE 'ru';
SELECT '';
DROP TABLE collate_test1;
DROP TABLE collate_test2;
DROP TABLE collate_test3;

View File

@ -0,0 +1,60 @@
1 (1,'а')
1 (1,'ё')
1 (1,'Ё')
2 (1,'я')
1 (2,'а')
2 (2,'А')
2 (2,'Я')
1 (3,'я')
1 (1,'а')
1 (1,'ё')
1 (1,'Ё')
1 (2,'а')
1 (3,'я')
2 (1,'я')
2 (2,'А')
2 (2,'Я')
1 (1,'а')
1 (1,'ё')
1 (1,'Ё')
2 (1,'я')
1 (1,NULL)
2 (2,'А')
2 (2,'Я')
1 (2,NULL)
2 (2,NULL)
1 (3,'я')
1 (1,'а')
1 (1,'ё')
1 (1,'Ё')
1 (1,NULL)
1 (2,NULL)
1 (3,'я')
2 (1,'я')
2 (2,'А')
2 (2,'Я')
2 (2,NULL)
2 (1,(1,['А']))
2 (1,(1,['ё','а','а']))
1 (1,(1,['Ё']))
2 (1,(1,['Я']))
1 (1,(2,['а']))
1 (1,(2,['ё','я']))
1 (2,(1,['ё']))
1 (2,(1,['ё','а']))
2 (2,(1,['я']))
1 (1,(1,['Ё']))
1 (1,(2,['а']))
1 (1,(2,['ё','я']))
1 (2,(1,['ё']))
1 (2,(1,['ё','а']))
2 (1,(1,['А']))
2 (1,(1,['ё','а','а']))
2 (1,(1,['Я']))
2 (2,(1,['я']))

View File

@ -0,0 +1,34 @@
-- Checks ORDER BY ... COLLATE 'ru' on Tuple columns: Tuple(UInt32, String),
-- Tuple(UInt32, LowCardinality(Nullable(String))) and Tuple(UInt32, Tuple(UInt32, Array(String))).
DROP TABLE IF EXISTS collate_test1;
DROP TABLE IF EXISTS collate_test2;
DROP TABLE IF EXISTS collate_test3;
CREATE TABLE collate_test1 (x UInt32, s Tuple(UInt32, String)) ENGINE=Memory();
CREATE TABLE collate_test2 (x UInt32, s Tuple(UInt32, LowCardinality(Nullable(String)))) ENGINE=Memory();
CREATE TABLE collate_test3 (x UInt32, s Tuple(UInt32, Tuple(UInt32, Array(String)))) ENGINE=Memory();
-- Tuples mixing numeric elements with mixed-case Cyrillic strings (and NULLs in collate_test2).
INSERT INTO collate_test1 VALUES (1, (1, 'Ё')), (1, (1, 'ё')), (1, (1, 'а')), (2, (2, 'А')), (2, (1, 'я')), (2, (2, 'Я')), (1, (2,'а')), (1, (3, 'я'));
INSERT INTO collate_test2 VALUES (1, (1, 'Ё')), (1, (1, 'ё')), (1, (1, 'а')), (2, (2, 'А')), (2, (1, 'я')), (2, (2, 'Я')), (1, (2, null)), (1, (3, 'я')), (1, (1, null)), (2, (2, null));
INSERT INTO collate_test3 VALUES (1, (1, (1, ['Ё']))), (1, (2, (1, ['ё']))), (1, (1, (2, ['а']))), (2, (1, (1, ['А']))), (2, (2, (1, ['я']))), (2, (1, (1, ['Я']))), (1, (2, (1, ['ё','а']))), (1, (1, (2, ['ё', 'я']))), (2, (1, (1, ['ё', 'а', 'а'])));
-- For every table: collated single-key ordering, then ordering by x followed by the collated key.
-- The empty-string SELECTs emit a blank line between result sets.
SELECT * FROM collate_test1 ORDER BY s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test1 ORDER BY x, s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test2 ORDER BY s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test2 ORDER BY x, s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test3 ORDER BY s COLLATE 'ru';
SELECT '';
SELECT * FROM collate_test3 ORDER BY x, s COLLATE 'ru';
SELECT '';
DROP TABLE collate_test1;
DROP TABLE collate_test2;
DROP TABLE collate_test3;

View File

@ -155,6 +155,10 @@
01509_dictionary_preallocate
01526_max_untracked_memory
01530_drop_database_atomic_sync
01532_collate_in_low_cardinality
01533_collate_in_nullable
01542_collate_in_array
01543_collate_in_tuple
01546_log_queries_min_query_duration_ms
01547_query_log_current_database
01548_query_log_query_execution_ms