Fix loop unrolling

This commit is contained in:
Alexey Milovidov 2020-05-23 18:22:04 +03:00
parent 1dc284d9f1
commit 82f03fa8ce
3 changed files with 17 additions and 17 deletions

View File

@ -109,6 +109,7 @@ class QuantileTDigest
using KeyBits = UInt32;
static constexpr size_t PART_SIZE_BITS = 8;
static constexpr bool DIRECT_WRITE_TO_DESTINATION = false;
using Transform = RadixSortFloatTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;

View File

@ -119,6 +119,8 @@ namespace
{
using Element = ValueWithIndex<T>;
using Result = size_t;
static constexpr bool DIRECT_WRITE_TO_DESTINATION = true;
static T & extractKey(Element & elem) { return elem.value; }
static size_t extractResult(Element & elem) { return elem.index; }
};

View File

@ -87,6 +87,7 @@ struct RadixSortFloatTraits
using KeyBits = std::conditional_t<sizeof(Key) == 8, uint64_t, uint32_t>;
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
static constexpr bool DIRECT_WRITE_TO_DESTINATION = false;
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
using Transform = RadixSortFloatTransform<KeyBits>;
@ -131,6 +132,7 @@ struct RadixSortUIntTraits
using KeyBits = Key;
static constexpr size_t PART_SIZE_BITS = 8;
static constexpr bool DIRECT_WRITE_TO_DESTINATION = false;
using Transform = RadixSortIdentityTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
@ -165,6 +167,7 @@ struct RadixSortIntTraits
using KeyBits = std::make_unsigned_t<Key>;
static constexpr size_t PART_SIZE_BITS = 8;
static constexpr bool DIRECT_WRITE_TO_DESTINATION = false;
using Transform = RadixSortSignedTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
@ -328,7 +331,7 @@ public:
* In this case it will fill only Result parts of the Element into destination.
* It is handy to avoid unnecessary data movements.
*/
static void executeLSD(Element * arr, size_t size, bool reverse = false, Result * destination = nullptr)
static NO_INLINE void executeLSD(Element * arr, size_t size, bool reverse = false, Result * destination = nullptr)
{
/// If the array is smaller than 256, then it is better to use another algorithm.
@ -368,10 +371,8 @@ public:
}
}
bool direct_copy_to_destination = (destination);
/// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces.
for (size_t pass = 0; pass < NUM_PASSES - direct_copy_to_destination; ++pass)
for (size_t pass = 0; pass < NUM_PASSES - Traits::DIRECT_WRITE_TO_DESTINATION; ++pass)
{
Element * writer = pass % 2 ? arr : swap_buffer;
Element * reader = pass % 2 ? swap_buffer : arr;
@ -390,9 +391,9 @@ public:
}
}
if (direct_copy_to_destination)
if (Traits::DIRECT_WRITE_TO_DESTINATION)
{
size_t pass = NUM_PASSES - 1;
constexpr size_t pass = NUM_PASSES - 1;
Result * writer = destination;
Element * reader = pass % 2 ? swap_buffer : arr;
@ -401,8 +402,6 @@ public:
for (size_t i = 0; i < size; ++i)
{
size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));
/// Place the element on the next free position.
writer[size - 1 - (++histograms[pass * HISTOGRAM_SIZE + pos])] = Traits::extractResult(reader[i]);
}
}
@ -411,21 +410,19 @@ public:
for (size_t i = 0; i < size; ++i)
{
size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));
/// Place the element on the next free position.
writer[++histograms[pass * HISTOGRAM_SIZE + pos]] = Traits::extractResult(reader[i]);
}
}
}
else if (NUM_PASSES % 2)
else
{
/// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
/// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array.
memcpy(arr, swap_buffer, size * sizeof(Element));
}
else if (reverse)
{
std::reverse(arr, arr + size);
if (NUM_PASSES % 2)
memcpy(arr, swap_buffer, size * sizeof(Element));
/// This is suboptimal, we can embed it to the last pass.
if (reverse)
std::reverse(arr, arr + size);
}
allocator.deallocate(swap_buffer, size * sizeof(Element));