2015-11-21 12:47:41 +00:00
|
|
|
#pragma once
|
|
|
|
|
2019-04-02 20:15:02 +00:00
|
|
|
|
2015-11-21 12:47:41 +00:00
|
|
|
#include <string.h>
|
2016-12-24 01:03:10 +00:00
|
|
|
#if !defined(__APPLE__) && !defined(__FreeBSD__)
|
2015-11-21 12:47:41 +00:00
|
|
|
#include <malloc.h>
|
2016-10-26 22:27:38 +00:00
|
|
|
#endif
|
2019-04-02 20:15:02 +00:00
|
|
|
#include <algorithm>
|
|
|
|
#include <cmath>
|
2016-10-26 22:27:38 +00:00
|
|
|
#include <cstdlib>
|
2015-11-21 12:47:41 +00:00
|
|
|
#include <cstdint>
|
2020-05-24 14:42:20 +00:00
|
|
|
#include <cassert>
|
2015-11-21 12:47:41 +00:00
|
|
|
#include <type_traits>
|
2020-11-15 20:08:18 +00:00
|
|
|
#include <memory>
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2017-06-06 17:18:32 +00:00
|
|
|
#include <ext/bit_cast.h>
|
2020-09-15 09:55:57 +00:00
|
|
|
#include <common/extended_types.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Core/Defines.h>
|
2015-11-21 12:47:41 +00:00
|
|
|
|
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/** Radix sort, has the following functionality:
  *
  * Can sort unsigned, signed numbers, and floats.
  * Can sort an array of fixed length elements that contain something else besides the key.
  * Can sort an array and form sorted result containing some transformation of elements.
  * Can do partial sort.
  * Customizable radix size.
  *
  * Two flavours of radix sort are implemented:
  *
  * 1. LSD (least significant digit first), stable.
  * 2. MSD (most significant digit first), unstable, with support for partial sort.
  */
|
|
|
|
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/** Default allocator for the temporary buffer of the radix sort.
  * It is used as the 'Allocator' member type of the traits structures below;
  * the sort creates an instance of it on the stack and calls allocate / deallocate on it.
  */
struct RadixSortAllocator
{
    /// Allocates 'size' bytes with the global operator new (throws std::bad_alloc on failure).
    void * allocate(size_t size)
    {
        return ::operator new(size);
    }

    /// Releases memory obtained from allocate. The size is accepted for interface compatibility
    /// with allocators that need it, but is not used here.
    void deallocate(void * ptr, size_t /*size*/)
    {
        ::operator delete(ptr);
    }
};
|
|
|
|
|
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
/** A transformation that maps the bit representation of a key to an unsigned integer number
  * in such a way that the order relation over the keys matches the order relation over the obtained unsigned numbers.
  * For floats this conversion does the following:
  *  - if the sign bit is set, all bits are flipped (so that bigger magnitudes of negative numbers become smaller);
  *  - if the sign bit is not set, only the sign bit is flipped (so that non-negative numbers go after negative ones).
  * As a consequence, NaNs with a clear sign bit become bigger than all numbers,
  *  and NaNs with the sign bit set become smaller than all numbers.
  *
  * KeyBits is an unsigned integer type of the same size as the key.
  */
template <typename KeyBits>
struct RadixSortFloatTransform
{
private:
    /// Position and mask of the most significant (sign) bit of KeyBits.
    static constexpr size_t SIGN_SHIFT = sizeof(KeyBits) * 8 - 1;
    static constexpr KeyBits SIGN_BIT = KeyBits(1) << SIGN_SHIFT;

public:
    /// Is it worth writing the result in memory, or is it better to do calculation every time again?
    /// 'false' means that the transform is not cheap: the LSD sort stores the transformed key in the array once
    /// (and restores it afterwards) instead of recomputing the transform for every extracted digit.
    static constexpr bool transform_is_simple = false;

    /// Bits of the key -> unsigned number with the same ordering.
    static KeyBits forward(KeyBits x)
    {
        /// (0 - sign) is all ones when the sign bit is set and zero otherwise;
        /// OR-ing the sign bit makes the mask flip at least the sign bit.
        return x ^ (static_cast<KeyBits>(KeyBits(0) - (x >> SIGN_SHIFT)) | SIGN_BIT);
    }

    /// Inverse of 'forward'.
    static KeyBits backward(KeyBits x)
    {
        /// After 'forward' the top bit is set for originally non-negative values:
        /// (top - 1) is zero for them (only the sign bit is flipped back) and all ones otherwise.
        return x ^ (static_cast<KeyBits>((x >> SIGN_SHIFT) - 1) | SIGN_BIT);
    }
};
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-04-26 22:07:32 +00:00
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
template <typename TElement>
|
2019-05-02 22:51:39 +00:00
|
|
|
struct RadixSortFloatTraits
|
2015-11-21 12:47:41 +00:00
|
|
|
{
|
2020-05-23 14:28:05 +00:00
|
|
|
/// The type of the element. It can be a structure with a key and some other payload. Or just a key.
|
|
|
|
using Element = TElement;
|
|
|
|
|
|
|
|
/// The key to sort by.
|
|
|
|
using Key = Element;
|
|
|
|
|
|
|
|
/// Part of the element that you need in the result array.
|
|
|
|
/// There are cases when elements are sorted by one part but you need other parts in array of results.
|
|
|
|
using Result = Element;
|
|
|
|
|
|
|
|
/// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t.
|
|
|
|
using CountType = uint32_t;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
/// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
|
|
|
|
using KeyBits = std::conditional_t<sizeof(Key) == 8, uint64_t, uint32_t>;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
|
|
|
|
|
|
|
|
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
|
|
|
|
using Transform = RadixSortFloatTransform<KeyBits>;
|
|
|
|
|
|
|
|
/// An object with the functions allocate and deallocate.
|
|
|
|
/// Can be used, for example, to allocate memory for a temporary array on the stack.
|
|
|
|
/// To do this, the allocator itself is created on the stack.
|
2020-11-25 17:12:14 +00:00
|
|
|
using Allocator = RadixSortAllocator;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
/// The function to get the key from an array element.
|
2019-04-25 01:16:26 +00:00
|
|
|
static Key & extractKey(Element & elem) { return elem; }
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2020-05-23 14:28:05 +00:00
|
|
|
/// The function to get the result part from an array element.
|
|
|
|
static Result & extractResult(Element & elem) { return elem; }
|
2020-05-17 14:22:17 +00:00
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
/// Used when fallback to comparison based sorting is needed.
|
|
|
|
/// TODO: Correct handling of NaNs, NULLs, etc
|
2019-04-27 17:52:43 +00:00
|
|
|
static bool less(Key x, Key y)
|
2019-04-26 22:07:32 +00:00
|
|
|
{
|
|
|
|
return x < y;
|
|
|
|
}
|
2015-11-21 12:47:41 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/** Transformation for keys whose bit representation is already ordered as an unsigned number
  * (that is, for unsigned integers): both directions return the argument unchanged.
  */
template <typename KeyBits>
struct RadixSortIdentityTransform
{
    /// The transform is cheap, so it is recalculated on every digit extraction instead of being stored in the array.
    static constexpr bool transform_is_simple = true;

    static KeyBits forward(KeyBits x) { return x; }
    static KeyBits backward(KeyBits x) { return x; }
};
|
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
template <typename TElement>
|
2019-05-02 22:51:39 +00:00
|
|
|
struct RadixSortUIntTraits
|
2015-11-21 12:47:41 +00:00
|
|
|
{
|
2019-04-25 01:16:26 +00:00
|
|
|
using Element = TElement;
|
2020-05-23 14:28:05 +00:00
|
|
|
using Result = Element;
|
2019-04-25 01:16:26 +00:00
|
|
|
using Key = Element;
|
2017-04-01 07:20:54 +00:00
|
|
|
using CountType = uint32_t;
|
2019-05-02 22:51:39 +00:00
|
|
|
using KeyBits = Key;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr size_t PART_SIZE_BITS = 8;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
using Transform = RadixSortIdentityTransform<KeyBits>;
|
2020-11-25 17:12:14 +00:00
|
|
|
using Allocator = RadixSortAllocator;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
static Key & extractKey(Element & elem) { return elem; }
|
2020-05-23 14:28:05 +00:00
|
|
|
static Result & extractResult(Element & elem) { return elem; }
|
2019-04-26 22:07:32 +00:00
|
|
|
|
2019-04-27 17:52:43 +00:00
|
|
|
static bool less(Key x, Key y)
|
2019-04-26 22:07:32 +00:00
|
|
|
{
|
|
|
|
return x < y;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/** Transformation for signed integers in two's complement representation:
  * flipping the sign bit makes the unsigned order of the bits equal to the signed order of the keys
  * (the most negative value maps to zero, the biggest positive value maps to all ones).
  * The transformation is its own inverse.
  *
  * KeyBits is the unsigned integer type of the same size as the key.
  */
template <typename KeyBits>
struct RadixSortSignedTransform
{
private:
    /// Mask with only the most significant (sign) bit set.
    static constexpr KeyBits SIGN_BIT = KeyBits(1) << (sizeof(KeyBits) * 8 - 1);

public:
    /// The transform is cheap, so it is recalculated on every digit extraction instead of being stored in the array.
    static constexpr bool transform_is_simple = true;

    static KeyBits forward(KeyBits x) { return x ^ SIGN_BIT; }
    static KeyBits backward(KeyBits x) { return x ^ SIGN_BIT; }
};
|
|
|
|
|
2019-04-02 20:15:02 +00:00
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
template <typename TElement>
|
2019-05-02 22:51:39 +00:00
|
|
|
struct RadixSortIntTraits
|
2015-11-21 12:47:41 +00:00
|
|
|
{
|
2019-05-02 22:51:39 +00:00
|
|
|
using Element = TElement;
|
2020-05-23 14:28:05 +00:00
|
|
|
using Result = Element;
|
2019-05-02 22:51:39 +00:00
|
|
|
using Key = Element;
|
|
|
|
using CountType = uint32_t;
|
2020-08-19 11:52:17 +00:00
|
|
|
using KeyBits = make_unsigned_t<Key>;
|
2019-04-26 22:07:32 +00:00
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
static constexpr size_t PART_SIZE_BITS = 8;
|
2019-04-26 22:07:32 +00:00
|
|
|
|
2019-05-02 22:51:39 +00:00
|
|
|
using Transform = RadixSortSignedTransform<KeyBits>;
|
2020-11-25 17:12:14 +00:00
|
|
|
using Allocator = RadixSortAllocator;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
static Key & extractKey(Element & elem) { return elem; }
|
2020-05-23 14:28:05 +00:00
|
|
|
static Result & extractResult(Element & elem) { return elem; }
|
2019-04-26 22:07:32 +00:00
|
|
|
|
2019-04-27 17:52:43 +00:00
|
|
|
static bool less(Key x, Key y)
|
2019-04-26 22:07:32 +00:00
|
|
|
{
|
|
|
|
return x < y;
|
|
|
|
}
|
2015-11-21 12:47:41 +00:00
|
|
|
};
|
|
|
|
|
2019-02-18 16:29:37 +00:00
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
template <typename T>
|
2019-11-02 05:55:06 +00:00
|
|
|
using RadixSortNumTraits = std::conditional_t<
|
2020-08-19 11:52:17 +00:00
|
|
|
is_integer_v<T>,
|
2019-11-02 05:55:06 +00:00
|
|
|
std::conditional_t<is_unsigned_v<T>, RadixSortUIntTraits<T>, RadixSortIntTraits<T>>,
|
|
|
|
RadixSortFloatTraits<T>>;
|
2019-02-18 16:29:37 +00:00
|
|
|
|
2015-11-21 12:47:41 +00:00
|
|
|
|
|
|
|
template <typename Traits>
|
|
|
|
struct RadixSort
|
|
|
|
{
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
using Element = typename Traits::Element;
|
2020-05-23 14:28:05 +00:00
|
|
|
using Result = typename Traits::Result;
|
2017-04-01 07:20:54 +00:00
|
|
|
using Key = typename Traits::Key;
|
2017-05-10 04:00:19 +00:00
|
|
|
using CountType = typename Traits::CountType;
|
2017-04-01 07:20:54 +00:00
|
|
|
using KeyBits = typename Traits::KeyBits;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2019-04-27 17:52:43 +00:00
|
|
|
// Use insertion sort if the size of the array is less than equal to this threshold
|
|
|
|
static constexpr size_t INSERTION_SORT_THRESHOLD = 64;
|
2019-04-26 22:07:32 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS;
|
|
|
|
static constexpr size_t PART_BITMASK = HISTOGRAM_SIZE - 1;
|
|
|
|
static constexpr size_t KEY_BITS = sizeof(Key) * 8;
|
|
|
|
static constexpr size_t NUM_PASSES = (KEY_BITS + (Traits::PART_SIZE_BITS - 1)) / Traits::PART_SIZE_BITS;
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2020-05-24 14:42:20 +00:00
|
|
|
|
|
|
|
static KeyBits keyToBits(Key x) { return ext::bit_cast<KeyBits>(x); }
|
|
|
|
static Key bitsToKey(KeyBits x) { return ext::bit_cast<Key>(x); }
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static ALWAYS_INLINE KeyBits getPart(size_t N, KeyBits x)
|
|
|
|
{
|
|
|
|
if (Traits::Transform::transform_is_simple)
|
|
|
|
x = Traits::Transform::forward(x);
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return (x >> (N * Traits::PART_SIZE_BITS)) & PART_BITMASK;
|
|
|
|
}
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2020-05-24 14:42:20 +00:00
|
|
|
static ALWAYS_INLINE KeyBits extractPart(size_t N, Element & elem)
|
|
|
|
{
|
|
|
|
return getPart(N, keyToBits(Traits::extractKey(elem)));
|
|
|
|
}
|
2015-11-21 12:47:41 +00:00
|
|
|
|
2020-05-24 14:42:20 +00:00
|
|
|
static void insertionSortInternal(Element * arr, size_t size)
|
2019-04-26 22:07:32 +00:00
|
|
|
{
|
2019-04-27 17:52:43 +00:00
|
|
|
Element * end = arr + size;
|
|
|
|
for (Element * i = arr + 1; i < end; ++i)
|
2019-04-29 01:50:18 +00:00
|
|
|
{
|
2019-04-27 17:52:43 +00:00
|
|
|
if (Traits::less(Traits::extractKey(*i), Traits::extractKey(*(i - 1))))
|
2019-04-26 22:07:32 +00:00
|
|
|
{
|
|
|
|
Element * j;
|
|
|
|
Element tmp = *i;
|
|
|
|
*i = *(i - 1);
|
2019-04-27 17:52:43 +00:00
|
|
|
for (j = i - 1; j > arr && Traits::less(Traits::extractKey(tmp), Traits::extractKey(*(j - 1))); --j)
|
2019-04-26 22:07:32 +00:00
|
|
|
*j = *(j - 1);
|
|
|
|
*j = tmp;
|
|
|
|
}
|
2019-04-29 01:50:18 +00:00
|
|
|
}
|
2019-04-26 22:07:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-05-23 15:29:43 +00:00
|
|
|
template <bool DIRECT_WRITE_TO_DESTINATION>
|
|
|
|
static NO_INLINE void radixSortLSDInternal(Element * arr, size_t size, bool reverse, Result * destination)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// If the array is smaller than 256, then it is better to use another algorithm.
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time.
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/// For each of the NUM_PASSES bit ranges of the key, consider how many times each value of this bit range met.
|
2020-06-08 22:37:35 +00:00
|
|
|
std::unique_ptr<CountType[]> histograms{new CountType[HISTOGRAM_SIZE * NUM_PASSES]{}};
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
typename Traits::Allocator allocator;
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// We will do several passes through the array. On each pass, the data is transferred to another array. Let's allocate this temporary array.
|
2017-04-01 07:20:54 +00:00
|
|
|
Element * swap_buffer = reinterpret_cast<Element *>(allocator.allocate(size * sizeof(Element)));
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Transform the array and calculate the histogram.
|
2017-09-12 21:06:11 +00:00
|
|
|
/// NOTE This is slightly suboptimal. Look at https://github.com/powturbo/TurboHist
|
2017-04-01 07:20:54 +00:00
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
if (!Traits::Transform::transform_is_simple)
|
|
|
|
Traits::extractKey(arr[i]) = bitsToKey(Traits::Transform::forward(keyToBits(Traits::extractKey(arr[i]))));
|
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
for (size_t pass = 0; pass < NUM_PASSES; ++pass)
|
2020-05-24 14:42:20 +00:00
|
|
|
++histograms[pass * HISTOGRAM_SIZE + extractPart(pass, arr[i])];
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one.
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t sums[NUM_PASSES] = {0};
|
|
|
|
|
|
|
|
for (size_t i = 0; i < HISTOGRAM_SIZE; ++i)
|
|
|
|
{
|
2019-04-25 01:16:26 +00:00
|
|
|
for (size_t pass = 0; pass < NUM_PASSES; ++pass)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-04-25 01:16:26 +00:00
|
|
|
size_t tmp = histograms[pass * HISTOGRAM_SIZE + i] + sums[pass];
|
|
|
|
histograms[pass * HISTOGRAM_SIZE + i] = sums[pass] - 1;
|
|
|
|
sums[pass] = tmp;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces.
|
2020-05-23 15:29:43 +00:00
|
|
|
for (size_t pass = 0; pass < NUM_PASSES - DIRECT_WRITE_TO_DESTINATION; ++pass)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-04-25 01:16:26 +00:00
|
|
|
Element * writer = pass % 2 ? arr : swap_buffer;
|
|
|
|
Element * reader = pass % 2 ? swap_buffer : arr;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
2020-05-24 14:42:20 +00:00
|
|
|
size_t pos = extractPart(pass, reader[i]);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Place the element on the next free position.
|
2019-04-25 01:16:26 +00:00
|
|
|
auto & dest = writer[++histograms[pass * HISTOGRAM_SIZE + pos]];
|
2017-04-01 07:20:54 +00:00
|
|
|
dest = reader[i];
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// On the last pass, we do the reverse transformation.
|
2019-04-25 01:16:26 +00:00
|
|
|
if (!Traits::Transform::transform_is_simple && pass == NUM_PASSES - 1)
|
2017-04-01 07:20:54 +00:00
|
|
|
Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i]))));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-23 15:29:43 +00:00
|
|
|
if (DIRECT_WRITE_TO_DESTINATION)
|
2020-05-17 14:22:17 +00:00
|
|
|
{
|
2020-05-23 15:22:04 +00:00
|
|
|
constexpr size_t pass = NUM_PASSES - 1;
|
2020-05-23 14:28:05 +00:00
|
|
|
Result * writer = destination;
|
2020-05-17 14:22:17 +00:00
|
|
|
Element * reader = pass % 2 ? swap_buffer : arr;
|
|
|
|
|
2020-05-21 11:35:54 +00:00
|
|
|
if (reverse)
|
2020-05-17 14:22:17 +00:00
|
|
|
{
|
2020-05-21 11:35:54 +00:00
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
2020-05-24 14:42:20 +00:00
|
|
|
size_t pos = extractPart(pass, reader[i]);
|
2020-05-23 14:28:05 +00:00
|
|
|
writer[size - 1 - (++histograms[pass * HISTOGRAM_SIZE + pos])] = Traits::extractResult(reader[i]);
|
2020-05-21 11:35:54 +00:00
|
|
|
}
|
2020-05-21 11:39:40 +00:00
|
|
|
}
|
|
|
|
else
|
2020-05-21 11:35:54 +00:00
|
|
|
{
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
2020-05-24 14:42:20 +00:00
|
|
|
size_t pos = extractPart(pass, reader[i]);
|
2020-05-23 14:28:05 +00:00
|
|
|
writer[++histograms[pass * HISTOGRAM_SIZE + pos]] = Traits::extractResult(reader[i]);
|
2020-05-21 11:35:54 +00:00
|
|
|
}
|
2020-05-17 14:22:17 +00:00
|
|
|
}
|
2020-05-23 14:28:05 +00:00
|
|
|
}
|
2020-05-23 15:22:04 +00:00
|
|
|
else
|
2020-05-17 14:22:17 +00:00
|
|
|
{
|
|
|
|
/// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
|
2020-05-23 15:22:04 +00:00
|
|
|
if (NUM_PASSES % 2)
|
|
|
|
memcpy(arr, swap_buffer, size * sizeof(Element));
|
|
|
|
|
2020-05-24 14:42:20 +00:00
|
|
|
/// TODO This is suboptimal, we can embed it to the last pass.
|
2020-05-23 15:22:04 +00:00
|
|
|
if (reverse)
|
|
|
|
std::reverse(arr, arr + size);
|
2020-05-17 14:22:17 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
allocator.deallocate(swap_buffer, size * sizeof(Element));
|
|
|
|
}
|
2019-04-26 22:07:32 +00:00
|
|
|
|
2020-05-24 14:42:20 +00:00
|
|
|
|
|
|
|
/* Main MSD radix sort subroutine.
|
|
|
|
* Puts elements to buckets based on PASS-th digit, then recursively calls insertion sort or itself on the buckets.
|
|
|
|
*
|
|
|
|
* TODO: Provide support for 'reverse' and 'DIRECT_WRITE_TO_DESTINATION'.
|
|
|
|
*
|
|
|
|
* Invariant: higher significant parts of the elements than PASS are constant within arr or is is the first PASS.
|
|
|
|
* PASS is counted from least significant (0), so the first pass is NUM_PASSES - 1.
|
|
|
|
*/
|
|
|
|
template <size_t PASS>
|
|
|
|
static inline void radixSortMSDInternal(Element * arr, size_t size, size_t limit)
|
|
|
|
{
|
2020-05-24 18:05:06 +00:00
|
|
|
// std::cerr << PASS << ", " << size << ", " << limit << "\n";
|
2020-05-24 14:48:53 +00:00
|
|
|
|
2020-05-24 14:42:20 +00:00
|
|
|
/// The beginning of every i-1-th bucket. 0th element will be equal to 1st.
|
|
|
|
/// Last element will point to array end.
|
2020-06-08 22:45:38 +00:00
|
|
|
std::unique_ptr<Element *[]> prev_buckets{new Element*[HISTOGRAM_SIZE + 1]};
|
2020-05-24 14:42:20 +00:00
|
|
|
/// The beginning of every i-th bucket (the same array shifted by one).
|
|
|
|
Element ** buckets = &prev_buckets[1];
|
|
|
|
|
|
|
|
prev_buckets[0] = arr;
|
|
|
|
prev_buckets[1] = arr;
|
|
|
|
|
|
|
|
/// The end of the range of buckets that we need with limit.
|
|
|
|
Element * finish = arr + size;
|
|
|
|
|
|
|
|
/// Count histogram of current element parts.
|
|
|
|
|
2020-05-24 15:51:23 +00:00
|
|
|
/// We use loop unrolling to minimize data dependencies and increase instruction level parallelism.
|
|
|
|
/// Unroll 8 times looks better on experiments;
|
|
|
|
/// also it corresponds with the results from https://github.com/powturbo/TurboHist
|
|
|
|
|
|
|
|
static constexpr size_t UNROLL_COUNT = 8;
|
2020-06-08 22:45:38 +00:00
|
|
|
std::unique_ptr<CountType[]> count{new CountType[HISTOGRAM_SIZE * UNROLL_COUNT]{}};
|
2020-05-24 15:48:11 +00:00
|
|
|
size_t unrolled_size = size / UNROLL_COUNT * UNROLL_COUNT;
|
|
|
|
|
|
|
|
for (Element * elem = arr; elem < arr + unrolled_size; elem += UNROLL_COUNT)
|
|
|
|
for (size_t i = 0; i < UNROLL_COUNT; ++i)
|
|
|
|
++count[i * HISTOGRAM_SIZE + extractPart(PASS, elem[i])];
|
2020-05-24 14:42:20 +00:00
|
|
|
|
2020-05-24 15:48:11 +00:00
|
|
|
for (Element * elem = arr + unrolled_size; elem < arr + size; ++elem)
|
2020-05-24 14:42:20 +00:00
|
|
|
++count[extractPart(PASS, *elem)];
|
2020-05-24 15:48:11 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < HISTOGRAM_SIZE; ++i)
|
|
|
|
for (size_t j = 1; j < UNROLL_COUNT; ++j)
|
|
|
|
count[i] += count[j * HISTOGRAM_SIZE + i];
|
2020-05-24 14:42:20 +00:00
|
|
|
|
|
|
|
/// Fill pointers to buckets according to the histogram.
|
|
|
|
|
|
|
|
/// How many buckets we will recurse into.
|
|
|
|
ssize_t buckets_for_recursion = HISTOGRAM_SIZE;
|
2020-05-24 15:26:07 +00:00
|
|
|
bool finish_early = false;
|
2020-05-24 14:42:20 +00:00
|
|
|
|
|
|
|
for (size_t i = 1; i < HISTOGRAM_SIZE; ++i)
|
|
|
|
{
|
|
|
|
/// Positions are just a cumulative sum of counts.
|
|
|
|
buckets[i] = buckets[i - 1] + count[i - 1];
|
|
|
|
|
|
|
|
/// If this bucket starts after limit, we don't need it.
|
2020-05-24 15:26:07 +00:00
|
|
|
if (!finish_early && buckets[i] >= arr + limit)
|
2020-05-24 14:42:20 +00:00
|
|
|
{
|
|
|
|
buckets_for_recursion = i;
|
|
|
|
finish = buckets[i];
|
2020-05-24 15:26:07 +00:00
|
|
|
finish_early = true;
|
2020-05-24 14:42:20 +00:00
|
|
|
/// We cannot break here, because we need correct pointers to all buckets, see the next loop.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* At this point, we have the following variables:
|
|
|
|
* count[i] is the size of i-th bucket
|
|
|
|
* buckets[i] is a pointer to the beginning of i-th bucket, buckets[-1] == buckets[0]
|
|
|
|
* buckets_for_recursion is the number of buckets that should be sorted, the last of them only partially
|
|
|
|
* finish is a pointer to the end of the first buckets_for_recursion buckets
|
|
|
|
*/
|
|
|
|
|
|
|
|
/// Scatter array elements to buckets until the first buckets_for_recursion buckets are full
|
|
|
|
/// After the above loop, buckets are shifted towards the end and now pointing to the beginning of i+1th bucket.
|
|
|
|
|
|
|
|
for (ssize_t i = 0; /* guarded by 'finish' */; ++i)
|
|
|
|
{
|
|
|
|
assert(i < buckets_for_recursion);
|
|
|
|
|
|
|
|
/// We look at i-1th index, because bucket pointers are shifted right on every loop iteration,
|
|
|
|
/// and all buckets before i was completely shifted to the beginning of the next bucket.
|
|
|
|
/// So, the beginning of i-th bucket is at buckets[i - 1].
|
|
|
|
|
|
|
|
Element * bucket_end = buckets[i - 1] + count[i];
|
|
|
|
|
|
|
|
/// Fill this bucket.
|
|
|
|
while (buckets[i] != bucket_end)
|
|
|
|
{
|
|
|
|
Element swapper = *buckets[i];
|
|
|
|
KeyBits tag = extractPart(PASS, swapper);
|
|
|
|
|
|
|
|
if (tag != KeyBits(i))
|
|
|
|
{
|
|
|
|
/// Invariant: tag > i, because the elements with less tags are already at the right places.
|
|
|
|
assert(tag > KeyBits(i));
|
|
|
|
|
|
|
|
/// While the tag (digit) of the element is not that we need,
|
|
|
|
/// swap the element with the next element in the bucket for that tag.
|
|
|
|
|
|
|
|
/// Interesting observation:
|
|
|
|
/// - we will definitely find the needed element,
|
|
|
|
/// because the tag's bucket will contain at least one "wrong" element,
|
|
|
|
/// because the "right" element is appeared in our bucket.
|
|
|
|
|
|
|
|
/// After this loop we shift buckets[i] and buckets[tag] pointers to the right for all found tags.
|
|
|
|
/// And all positions that were traversed are filled with the proper values.
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
std::swap(swapper, *buckets[tag]);
|
|
|
|
++buckets[tag];
|
|
|
|
tag = extractPart(PASS, swapper);
|
|
|
|
} while (tag != KeyBits(i));
|
|
|
|
*buckets[i] = swapper;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Now we have the right element at this place.
|
|
|
|
++buckets[i];
|
|
|
|
}
|
2020-05-24 15:26:07 +00:00
|
|
|
|
|
|
|
if (bucket_end == finish)
|
|
|
|
break;
|
2020-05-24 14:42:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Recursion for the relevant buckets.
|
|
|
|
|
|
|
|
if constexpr (PASS > 0)
|
|
|
|
{
|
|
|
|
/// Recursively sort buckets, except the last one
|
|
|
|
for (ssize_t i = 0; i < buckets_for_recursion - 1; ++i)
|
|
|
|
{
|
|
|
|
Element * start = buckets[i - 1];
|
|
|
|
ssize_t subsize = count[i];
|
|
|
|
|
|
|
|
radixSortMSDInternalHelper<PASS - 1>(start, subsize, subsize);
|
|
|
|
}
|
|
|
|
|
2020-05-24 15:26:07 +00:00
|
|
|
/// Sort the last necessary bucket with limit
|
2020-05-24 14:42:20 +00:00
|
|
|
{
|
|
|
|
ssize_t i = buckets_for_recursion - 1;
|
|
|
|
|
|
|
|
Element * start = buckets[i - 1];
|
|
|
|
ssize_t subsize = count[i];
|
2020-05-24 14:48:53 +00:00
|
|
|
ssize_t sublimit = limit - (start - arr);
|
2020-05-24 14:42:20 +00:00
|
|
|
|
|
|
|
radixSortMSDInternalHelper<PASS - 1>(start, subsize, sublimit);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// A helper to choose sorting algorithm based on array length
|
|
|
|
template <size_t PASS>
|
|
|
|
static inline void radixSortMSDInternalHelper(Element * arr, size_t size, size_t limit)
|
|
|
|
{
|
|
|
|
if (size <= INSERTION_SORT_THRESHOLD)
|
|
|
|
insertionSortInternal(arr, size);
|
|
|
|
else
|
|
|
|
radixSortMSDInternal<PASS>(arr, size, limit);
|
|
|
|
}
|
|
|
|
|
2020-05-23 15:29:43 +00:00
|
|
|
public:
|
|
|
|
/** Least significant digit radix sort (stable).
|
|
|
|
* This function will sort inplace (modify 'arr')
|
|
|
|
*/
|
|
|
|
static void executeLSD(Element * arr, size_t size)
|
|
|
|
{
|
|
|
|
radixSortLSDInternal<false>(arr, size, false, nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** This function will start to sort inplace (modify 'arr')
|
|
|
|
* but on the last step it will write result directly to the destination
|
|
|
|
* instead of finishing sorting 'arr'.
|
|
|
|
* In this case it will fill only Result parts of the Element into destination.
|
|
|
|
* It is handy to avoid unnecessary data movements.
|
|
|
|
*/
|
|
|
|
static void executeLSD(Element * arr, size_t size, bool reverse, Result * destination)
|
|
|
|
{
|
|
|
|
radixSortLSDInternal<true>(arr, size, reverse, destination);
|
|
|
|
}
|
|
|
|
|
2019-04-26 22:07:32 +00:00
|
|
|
/* Most significant digit radix sort
|
2020-05-24 15:26:07 +00:00
|
|
|
* Is not stable, but allows partial sorting.
|
2020-05-24 18:05:06 +00:00
|
|
|
* And it's more cache-friendly and usually faster than LSD variant.
|
2019-04-27 17:52:43 +00:00
|
|
|
*
|
2020-05-24 18:05:06 +00:00
|
|
|
* NOTE: It's beneficial over std::partial_sort only if limit is above ~2% of size for 8 bit radix.
|
|
|
|
* NOTE: When lowering down limit to 1%, the radix of 4..6 or 10..12 bit started to become beneficial.
|
|
|
|
* For less than 1% limit, it's not recommended to use.
|
|
|
|
* NOTE: For huge arrays without limit, the radix 11 suddenly becomes better... but not for smaller arrays.
|
|
|
|
* Maybe it because histogram will fit in half of L1d cache (2048 * 4 = 16384).
|
2020-05-24 15:35:48 +00:00
|
|
|
*
|
2019-04-27 17:52:43 +00:00
|
|
|
* Based on https://github.com/voutcn/kxsort, license:
|
|
|
|
* The MIT License
|
|
|
|
* Copyright (c) 2016 Dinghua Li <voutcn@gmail.com>
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining
|
|
|
|
* a copy of this software and associated documentation files (the
|
|
|
|
* "Software"), to deal in the Software without restriction, including
|
|
|
|
* without limitation the rights to use, copy, modify, merge, publish,
|
|
|
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
|
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
|
|
* the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be
|
|
|
|
* included in all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
|
|
* SOFTWARE.
|
2019-04-26 22:07:32 +00:00
|
|
|
*/
|
2019-05-02 22:51:39 +00:00
|
|
|
static void executeMSD(Element * arr, size_t size, size_t limit)
|
2019-04-26 22:07:32 +00:00
|
|
|
{
|
|
|
|
limit = std::min(limit, size);
|
2019-04-27 17:52:43 +00:00
|
|
|
radixSortMSDInternalHelper<NUM_PASSES - 1>(arr, size, limit);
|
2019-04-26 22:07:32 +00:00
|
|
|
}
|
2015-11-21 12:47:41 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2019-04-25 01:16:26 +00:00
|
|
|
/// Helper functions for numeric types.
|
|
|
|
/// Use RadixSort with custom traits for complex types instead.
|
2019-04-02 20:15:02 +00:00
|
|
|
|
|
|
|
template <typename T>
|
2020-05-24 14:42:20 +00:00
|
|
|
void radixSortLSD(T * arr, size_t size)
|
2019-04-26 22:07:32 +00:00
|
|
|
{
|
2019-04-27 17:52:43 +00:00
|
|
|
RadixSort<RadixSortNumTraits<T>>::executeLSD(arr, size);
|
2019-04-26 22:07:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename T>
|
2020-05-24 14:42:20 +00:00
|
|
|
void radixSortMSD(T * arr, size_t size, size_t limit)
|
2019-04-02 20:15:02 +00:00
|
|
|
{
|
2019-04-27 17:52:43 +00:00
|
|
|
RadixSort<RadixSortNumTraits<T>>::executeMSD(arr, size, limit);
|
2019-04-02 20:15:02 +00:00
|
|
|
}
|