ClickHouse/src/Common/RadixSort.h

483 lines
17 KiB
C++
Raw Normal View History

#pragma once
2019-04-02 20:15:02 +00:00
#include <string.h>
#if !defined(__APPLE__) && !defined(__FreeBSD__)
#include <malloc.h>
#endif
2019-04-02 20:15:02 +00:00
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstdint>
#include <type_traits>
2017-06-06 17:18:32 +00:00
#include <ext/bit_cast.h>
#include <Core/Types.h>
#include <Core/Defines.h>
/** Radix sort, has the following functionality:
  * Can sort unsigned, signed numbers, and floats.
  * Can sort an array of fixed length elements that contain something else besides the key.
  * Customizable radix size.
  *
  * Two variants are provided:
  * - LSD (least significant digit first), which is stable;
  * - MSD (most significant digit first), which is not stable but allows partial sorting.
  *
  * NOTE For some applications it makes sense to add
  *  radix-select and radix-get-permutation algorithms based on it.
  */
2017-05-07 20:25:26 +00:00
/** Default allocator for the temporary buffer of the radix sort.
  * Used as the 'Allocator' member of the traits structures below.
  * It is a thin wrapper over malloc/free and keeps no state.
  */
struct RadixSortMallocAllocator
{
    /// Returns a block of 'size' bytes obtained from malloc (nullptr if malloc fails).
    void * allocate(size_t size)
    {
        return malloc(size);
    }

    /// Releases a block previously returned by 'allocate'. The size is not needed by free.
    void deallocate(void * ptr, size_t /*size*/)
    {
        free(ptr);
    }
};
2019-05-02 22:51:39 +00:00
/** A transformation that transforms the bit representation of a key into an unsigned integer number,
  *  so that the order relation over the keys matches the order relation over the obtained unsigned numbers.
  * For floats this conversion does the following:
  *  if the sign bit is set, it flips all bits (including the sign bit);
  *  otherwise it flips only the sign bit.
  * In this case, NaN-s with the sign bit clear are bigger than all normal numbers
  *  (and NaN-s with the sign bit set are smaller than all of them).
  */
template <typename KeyBits>
struct RadixSortFloatTransform
{
    /// Is it worth writing the result in memory, or is it better to do calculation every time again?
    static constexpr bool transform_is_simple = false;

    /// Maps the raw bits of a float to an unsigned number with the same ordering.
    static KeyBits forward(KeyBits x)
    {
        constexpr size_t sign_shift = sizeof(KeyBits) * 8 - 1;
        const KeyBits sign_bit = KeyBits(1) << sign_shift;
        /// All ones if the sign bit of x is set, zero otherwise.
        const KeyBits negative_mask = KeyBits(0) - (x >> sign_shift);
        return x ^ (negative_mask | sign_bit);
    }

    /// Inverse of 'forward': restores the raw bits of the float.
    static KeyBits backward(KeyBits x)
    {
        constexpr size_t sign_shift = sizeof(KeyBits) * 8 - 1;
        const KeyBits sign_bit = KeyBits(1) << sign_shift;
        /// After 'forward', originally negative numbers have the highest bit clear.
        /// All ones in that case, zero otherwise.
        const KeyBits negative_mask = (x >> sign_shift) - KeyBits(1);
        return x ^ (negative_mask | sign_bit);
    }
};
2019-04-26 22:07:32 +00:00
2019-04-25 01:16:26 +00:00
template <typename TElement>
2019-05-02 22:51:39 +00:00
struct RadixSortFloatTraits
{
2020-05-23 14:28:05 +00:00
/// The type of the element. It can be a structure with a key and some other payload. Or just a key.
using Element = TElement;
/// The key to sort by.
using Key = Element;
/// Part of the element that you need in the result array.
/// There are cases when elements are sorted by one part but you need other parts in array of results.
using Result = Element;
/// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t.
using CountType = uint32_t;
2019-05-02 22:51:39 +00:00
/// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
using KeyBits = std::conditional_t<sizeof(Key) == 8, uint64_t, uint32_t>;
2019-05-02 22:51:39 +00:00
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
using Transform = RadixSortFloatTransform<KeyBits>;
/// An object with the functions allocate and deallocate.
/// Can be used, for example, to allocate memory for a temporary array on the stack.
/// To do this, the allocator itself is created on the stack.
using Allocator = RadixSortMallocAllocator;
2019-05-02 22:51:39 +00:00
/// The function to get the key from an array element.
2019-04-25 01:16:26 +00:00
static Key & extractKey(Element & elem) { return elem; }
2020-05-23 14:28:05 +00:00
/// The function to get the result part from an array element.
static Result & extractResult(Element & elem) { return elem; }
2019-05-02 22:51:39 +00:00
/// Used when fallback to comparison based sorting is needed.
/// TODO: Correct handling of NaNs, NULLs, etc
2019-04-27 17:52:43 +00:00
static bool less(Key x, Key y)
2019-04-26 22:07:32 +00:00
{
return x < y;
}
};
/** Transformation for keys whose bit representation is already ordered as an unsigned integer
  *  (that is, for unsigned integers): both directions return the argument unchanged.
  */
template <typename KeyBits>
struct RadixSortIdentityTransform
{
    /// The transformation is free, so it is recalculated every time instead of being stored in memory.
    static constexpr bool transform_is_simple = true;

    static KeyBits forward(KeyBits x) { return x; }
    static KeyBits backward(KeyBits x) { return x; }
};
2019-05-02 22:51:39 +00:00
2019-04-25 01:16:26 +00:00
template <typename TElement>
2019-05-02 22:51:39 +00:00
struct RadixSortUIntTraits
{
2019-04-25 01:16:26 +00:00
using Element = TElement;
2020-05-23 14:28:05 +00:00
using Result = Element;
2019-04-25 01:16:26 +00:00
using Key = Element;
using CountType = uint32_t;
2019-05-02 22:51:39 +00:00
using KeyBits = Key;
static constexpr size_t PART_SIZE_BITS = 8;
2019-05-02 22:51:39 +00:00
using Transform = RadixSortIdentityTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
2019-04-25 01:16:26 +00:00
static Key & extractKey(Element & elem) { return elem; }
2020-05-23 14:28:05 +00:00
static Result & extractResult(Element & elem) { return elem; }
2019-04-26 22:07:32 +00:00
2019-04-27 17:52:43 +00:00
static bool less(Key x, Key y)
2019-04-26 22:07:32 +00:00
{
return x < y;
}
};
/** Transformation for two's complement signed integers reinterpreted as unsigned KeyBits:
  *  flipping the highest (sign) bit makes negative numbers compare less than non-negative ones.
  * The transformation is its own inverse.
  */
template <typename KeyBits>
struct RadixSortSignedTransform
{
    /// A single XOR, so it is recalculated every time instead of being stored in memory.
    static constexpr bool transform_is_simple = true;

    static KeyBits forward(KeyBits x) { return flipSignBit(x); }
    static KeyBits backward(KeyBits x) { return flipSignBit(x); }

private:
    static KeyBits flipSignBit(KeyBits x)
    {
        return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1));
    }
};
2019-04-02 20:15:02 +00:00
2019-04-25 01:16:26 +00:00
template <typename TElement>
2019-05-02 22:51:39 +00:00
struct RadixSortIntTraits
{
2019-05-02 22:51:39 +00:00
using Element = TElement;
2020-05-23 14:28:05 +00:00
using Result = Element;
2019-05-02 22:51:39 +00:00
using Key = Element;
using CountType = uint32_t;
using KeyBits = std::make_unsigned_t<Key>;
2019-04-26 22:07:32 +00:00
2019-05-02 22:51:39 +00:00
static constexpr size_t PART_SIZE_BITS = 8;
2019-04-26 22:07:32 +00:00
2019-05-02 22:51:39 +00:00
using Transform = RadixSortSignedTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
2019-04-25 01:16:26 +00:00
static Key & extractKey(Element & elem) { return elem; }
2020-05-23 14:28:05 +00:00
static Result & extractResult(Element & elem) { return elem; }
2019-04-26 22:07:32 +00:00
2019-04-27 17:52:43 +00:00
static bool less(Key x, Key y)
2019-04-26 22:07:32 +00:00
{
return x < y;
}
};
2019-02-18 16:29:37 +00:00
2019-04-25 01:16:26 +00:00
template <typename T>
2019-11-02 05:55:06 +00:00
using RadixSortNumTraits = std::conditional_t<
is_integral_v<T>,
std::conditional_t<is_unsigned_v<T>, RadixSortUIntTraits<T>, RadixSortIntTraits<T>>,
RadixSortFloatTraits<T>>;
2019-02-18 16:29:37 +00:00
/** Radix sort of an array of Traits::Element by Traits::Key.
  *
  * Traits must provide: the types Element, Result, Key, CountType, KeyBits, Transform, Allocator;
  *  the constant PART_SIZE_BITS; the functions extractKey, extractResult and less.
  * See RadixSortFloatTraits for the description of every member.
  *
  * Two entry points are available:
  *  executeLSD - stable sort of the whole array, needs a temporary buffer of the same size;
  *  executeMSD - in-place, not stable, can sort only the first 'limit' elements.
  */
template <typename Traits>
struct RadixSort
{
private:
    using Element = typename Traits::Element;
    using Result = typename Traits::Result;
    using Key = typename Traits::Key;
    using CountType = typename Traits::CountType;
    using KeyBits = typename Traits::KeyBits;

    /// Use insertion sort if the size of the array is less than or equal to this threshold.
    static constexpr size_t INSERTION_SORT_THRESHOLD = 64;

    static constexpr size_t HISTOGRAM_SIZE = size_t(1) << Traits::PART_SIZE_BITS;
    static constexpr size_t PART_BITMASK = HISTOGRAM_SIZE - 1;
    static constexpr size_t KEY_BITS = sizeof(Key) * 8;
    static constexpr size_t NUM_PASSES = (KEY_BITS + (Traits::PART_SIZE_BITS - 1)) / Traits::PART_SIZE_BITS;

    static KeyBits keyToBits(Key x) { return ext::bit_cast<KeyBits>(x); }
    static Key bitsToKey(KeyBits x) { return ext::bit_cast<Key>(x); }

    /// The N-th digit of the key bits, for the LSD sort.
    /// A simple transformation is applied on the fly.
    /// A non-simple one is expected to be already applied to the keys in memory (executeLSD does that).
    static ALWAYS_INLINE KeyBits getPart(size_t N, KeyBits x)
    {
        if constexpr (Traits::Transform::transform_is_simple)
            x = Traits::Transform::forward(x);

        return (x >> (N * Traits::PART_SIZE_BITS)) & PART_BITMASK;
    }

    /// The N-th digit of the key of the element, for the MSD sort.
    /// The MSD sort never modifies keys in memory, so the transformation is always applied here.
    static ALWAYS_INLINE size_t extractPartMSD(size_t N, Element & elem)
    {
        const KeyBits bits = Traits::Transform::forward(keyToBits(Traits::extractKey(elem)));
        return (bits >> (N * Traits::PART_SIZE_BITS)) & PART_BITMASK;
    }

    /// Writes the Result part of 'elem' to 'slot'.
    /// If the key of 'elem' is stored in transformed form (non-simple transformation),
    ///  the key is restored in a copy first, so that a Result that includes the key gets the original value.
    static ALWAYS_INLINE void storeResult(Result & slot, Element & elem)
    {
        if constexpr (Traits::Transform::transform_is_simple)
        {
            slot = Traits::extractResult(elem);
        }
        else
        {
            Element restored = elem;
            Traits::extractKey(restored) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(restored))));
            slot = Traits::extractResult(restored);
        }
    }

    /// Plain insertion sort by Traits::less. Used for small ranges.
    static void insertionSortInternal(Element * arr, size_t size)
    {
        /// Nothing to do; also avoids forming 'arr + 1' for an empty range.
        if (size < 2)
            return;

        Element * end = arr + size;
        for (Element * i = arr + 1; i < end; ++i)
        {
            if (Traits::less(Traits::extractKey(*i), Traits::extractKey(*(i - 1))))
            {
                Element * j;
                Element tmp = *i;
                *i = *(i - 1);
                for (j = i - 1; j > arr && Traits::less(Traits::extractKey(tmp), Traits::extractKey(*(j - 1))); --j)
                    *j = *(j - 1);
                *j = tmp;
            }
        }
    }

    /* Main MSD radix sort subroutine.
     * Puts elements to buckets based on PASS-th digit, then recursively calls insertion sort or itself on the buckets.
     * Only the buckets that intersect the first 'limit' positions are filled and sorted.
     * Requires limit <= size.
     */
    template <size_t PASS>
    static inline void radixSortMSDInternal(Element * arr, size_t size, size_t limit)
    {
        /// cursors[i] is the position where the next element of the i-th bucket will be placed.
        /// prev_cursors is the same array shifted by one: prev_cursors[i] is cursors[i - 1], prev_cursors[0] is 'arr'.
        /// When the (i-1)-th bucket is full, its cursor points to the beginning of the i-th bucket.
        Element * prev_cursors[HISTOGRAM_SIZE + 1];
        Element ** cursors = prev_cursors + 1;

        size_t count[HISTOGRAM_SIZE] = {0};
        for (Element * it = arr; it < arr + size; ++it)
            ++count[extractPartMSD(PASS, *it)];

        prev_cursors[0] = arr;
        cursors[0] = arr;

        /// How many leading buckets intersect the first 'limit' positions, and where they end.
        size_t buckets_for_recursion = HISTOGRAM_SIZE;
        Element * finish = arr + size;
        bool finish_found = false;

        for (size_t i = 1; i < HISTOGRAM_SIZE; ++i)
        {
            cursors[i] = cursors[i - 1] + count[i - 1];

            /// The first bucket that starts at or after the limit is not needed, as well as all the following.
            /// We cannot break here, because the scatter loop needs correct positions of all buckets.
            if (!finish_found && cursors[i] >= arr + limit)
            {
                buckets_for_recursion = i;
                finish = cursors[i];
                finish_found = true;
            }
        }

        /* At this point, we have the following variables:
         * count[i] is the size of i-th bucket
         * cursors[i] is a pointer to the beginning of i-th bucket
         * buckets_for_recursion is the number of buckets that should be sorted, the last of them only partially
         * finish is a pointer to the end of the first buckets_for_recursion buckets
         */

        /// Scatter array elements to buckets until the first buckets_for_recursion buckets are full.
        for (size_t i = 0; i < HISTOGRAM_SIZE; ++i)
        {
            /// All previous buckets are full, so the cursor of the previous bucket is the beginning of this one.
            Element * bucket_end = prev_cursors[i] + count[i];

            while (cursors[i] != bucket_end)
            {
                Element swapper = *cursors[i];
                size_t tag = extractPartMSD(PASS, swapper);

                if (tag != i)
                {
                    /// Move the element to its own bucket and take the element that was there;
                    ///  repeat until we get an element that belongs to the bucket we are filling.
                    do
                    {
                        std::swap(swapper, *cursors[tag]);
                        ++cursors[tag];
                        tag = extractPartMSD(PASS, swapper);
                    } while (tag != i);

                    *cursors[i] = swapper;
                }

                ++cursors[i];
            }

            /// The bucket must be filled before we stop: the elements after 'finish' are left unordered.
            if (bucket_end == finish)
                break;
        }

        if constexpr (PASS > 0)
        {
            /// Now prev_cursors[i] is the beginning of the i-th bucket for every i < buckets_for_recursion.

            /// Recursively sort buckets, except the last one.
            for (size_t i = 0; i + 1 < buckets_for_recursion; ++i)
                radixSortMSDInternalHelper<PASS - 1>(prev_cursors[i], count[i], count[i]);

            /// Sort the last necessary bucket with the remaining part of the limit.
            const size_t last_bucket = buckets_for_recursion - 1;
            Element * start = prev_cursors[last_bucket];
            const size_t sublimit = limit - static_cast<size_t>(start - arr);
            radixSortMSDInternalHelper<PASS - 1>(start, count[last_bucket], sublimit);
        }
    }

    /// A helper to choose sorting algorithm based on array length.
    template <size_t PASS>
    static inline void radixSortMSDInternalHelper(Element * arr, size_t size, size_t limit)
    {
        if (size <= INSERTION_SORT_THRESHOLD)
            insertionSortInternal(arr, size);
        else
            radixSortMSDInternal<PASS>(arr, size, limit);
    }

public:
    /** Least significant digit radix sort (stable).
      *
      * This function will sort inplace (modify 'arr')
      *  but if 'destination' is provided, it will write result directly to destination
      *  instead of finishing sorting 'arr' at the last step.
      * In this case it will fill only Result parts of the Element into destination
      *  and the content of 'arr' is left in an intermediate state.
      * It is handy to avoid unnecessary data movements.
      *
      * If 'reverse' is true, the result is in descending order.
      *
      * The positions are calculated in CountType, so 'size' must fit into it.
      * NOTE(review): 'destination' is presumably expected to have room for 'size' elements
      *  and not to overlap with 'arr' - confirm with callers.
      */
    static void executeLSD(Element * arr, size_t size, bool reverse = false, Result * destination = nullptr)
    {
        if (size == 0)
            return;

        /// If the array is smaller than 256, then it is better to use another algorithm.

        /// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time.

        /// For each of the NUM_PASSES bit ranges of the key, consider how many times each value of this bit range met.
        CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0};

        typename Traits::Allocator allocator;

        /// We will do several passes through the array. On each pass, the data is transferred to another array. Let's allocate this temporary array.
        Element * swap_buffer = reinterpret_cast<Element *>(allocator.allocate(size * sizeof(Element)));

        /// Transform the array and calculate the histogram.
        /// NOTE This is slightly suboptimal. Look at https://github.com/powturbo/TurboHist
        for (size_t i = 0; i < size; ++i)
        {
            if (!Traits::Transform::transform_is_simple)
                Traits::extractKey(arr[i]) = bitsToKey(Traits::Transform::forward(keyToBits(Traits::extractKey(arr[i]))));

            for (size_t pass = 0; pass < NUM_PASSES; ++pass)
                ++histograms[pass * HISTOGRAM_SIZE + getPart(pass, keyToBits(Traits::extractKey(arr[i])))];
        }

        {
            /// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one.
            /// (Minus one, because the position is incremented before use.)
            size_t sums[NUM_PASSES] = {0};

            for (size_t i = 0; i < HISTOGRAM_SIZE; ++i)
            {
                for (size_t pass = 0; pass < NUM_PASSES; ++pass)
                {
                    size_t tmp = histograms[pass * HISTOGRAM_SIZE + i] + sums[pass];
                    histograms[pass * HISTOGRAM_SIZE + i] = static_cast<CountType>(sums[pass] - 1);
                    sums[pass] = tmp;
                }
            }
        }

        const bool direct_copy_to_destination = (destination != nullptr);

        /// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces.
        /// If the result goes to 'destination', the last pass is done separately below.
        for (size_t pass = 0; pass < NUM_PASSES - direct_copy_to_destination; ++pass)
        {
            Element * writer = pass % 2 ? arr : swap_buffer;
            Element * reader = pass % 2 ? swap_buffer : arr;

            for (size_t i = 0; i < size; ++i)
            {
                size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));

                /// Place the element on the next free position.
                auto & dest = writer[++histograms[pass * HISTOGRAM_SIZE + pos]];
                dest = reader[i];

                /// On the last pass, we do the reverse transformation.
                if (!Traits::Transform::transform_is_simple && pass == NUM_PASSES - 1)
                    Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i]))));
            }
        }

        if (direct_copy_to_destination)
        {
            /// The last pass: write only the Result parts directly to the destination.
            constexpr size_t pass = NUM_PASSES - 1;
            Result * writer = destination;
            Element * reader = pass % 2 ? swap_buffer : arr;

            if (reverse)
            {
                for (size_t i = 0; i < size; ++i)
                {
                    size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));

                    /// Place the element on the next free position, counting from the end.
                    storeResult(writer[size - 1 - (++histograms[pass * HISTOGRAM_SIZE + pos])], reader[i]);
                }
            }
            else
            {
                for (size_t i = 0; i < size; ++i)
                {
                    size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));

                    /// Place the element on the next free position.
                    storeResult(writer[++histograms[pass * HISTOGRAM_SIZE + pos]], reader[i]);
                }
            }
        }
        else
        {
            /// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
            /// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array.
            if (NUM_PASSES % 2)
                memcpy(arr, swap_buffer, size * sizeof(Element));

            /// The reversal is independent of where the result was: it must be done for any number of passes.
            if (reverse)
                std::reverse(arr, arr + size);
        }

        allocator.deallocate(swap_buffer, size * sizeof(Element));
    }

    /* Most significant digit radix sort
     * Usually slower than LSD and is not stable, but allows partial sorting:
     *  after the call the first min(limit, size) elements of 'arr' are the smallest ones, in sorted order;
     *  the order of the rest is unspecified.
     *
     * Based on https://github.com/voutcn/kxsort, license:
     * The MIT License
     * Copyright (c) 2016 Dinghua Li <voutcn@gmail.com>
     *
     * Permission is hereby granted, free of charge, to any person obtaining
     * a copy of this software and associated documentation files (the
     * "Software"), to deal in the Software without restriction, including
     * without limitation the rights to use, copy, modify, merge, publish,
     * distribute, sublicense, and/or sell copies of the Software, and to
     * permit persons to whom the Software is furnished to do so, subject to
     * the following conditions:
     *
     * The above copyright notice and this permission notice shall be
     * included in all copies or substantial portions of the Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     * SOFTWARE.
     */
    static void executeMSD(Element * arr, size_t size, size_t limit)
    {
        limit = std::min(limit, size);
        radixSortMSDInternalHelper<NUM_PASSES - 1>(arr, size, limit);
    }
};
2019-04-25 01:16:26 +00:00
/// Helper functions for numeric types.
/// Use RadixSort with custom traits for complex types instead.
2019-04-02 20:15:02 +00:00
template <typename T>
2019-04-27 17:52:43 +00:00
void radixSortLSD(T *arr, size_t size)
2019-04-26 22:07:32 +00:00
{
2019-04-27 17:52:43 +00:00
RadixSort<RadixSortNumTraits<T>>::executeLSD(arr, size);
2019-04-26 22:07:32 +00:00
}
template <typename T>
2019-04-27 17:52:43 +00:00
void radixSortMSD(T *arr, size_t size, size_t limit)
2019-04-02 20:15:02 +00:00
{
2019-04-27 17:52:43 +00:00
RadixSort<RadixSortNumTraits<T>>::executeMSD(arr, size, limit);
2019-04-02 20:15:02 +00:00
}