2014-12-27 06:04:55 +00:00
|
|
|
#include <iostream>
|
|
|
|
#include <iomanip>
|
|
|
|
#include <mutex>
|
|
|
|
#include <atomic>
|
|
|
|
|
|
|
|
//#define DBMS_HASH_MAP_DEBUG_RESIZES
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Interpreters/AggregationCommon.h>
|
2014-12-27 06:04:55 +00:00
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/HashTable/HashMap.h>
|
|
|
|
#include <Common/HashTable/TwoLevelHashMap.h>
|
|
|
|
//#include <Common/HashTable/HashTableWithSmallLocks.h>
|
|
|
|
//#include <Common/HashTable/HashTableMerge.h>
|
2014-12-27 06:04:55 +00:00
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2018-12-28 18:15:26 +00:00
|
|
|
#include <Compression/CompressedReadBuffer.h>
|
2014-12-27 06:04:55 +00:00
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/Stopwatch.h>
|
2019-01-11 19:12:36 +00:00
|
|
|
#include <Common/ThreadPool.h>
|
2014-12-27 06:04:55 +00:00
|
|
|
|
|
|
|
|
2016-05-28 10:35:44 +00:00
|
|
|
using Key = UInt64;
|
|
|
|
using Value = UInt64;
|
|
|
|
using Source = std::vector<Key>;
|
2014-12-27 06:04:55 +00:00
|
|
|
|
|
|
|
|
|
|
|
template <typename Map>
|
|
|
|
struct AggregateIndependent
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
template <typename Creator, typename Updater>
|
|
|
|
static void NO_INLINE execute(const Source & data, size_t num_threads, std::vector<std::unique_ptr<Map>> & results,
|
|
|
|
Creator && creator, Updater && updater,
|
|
|
|
ThreadPool & pool)
|
|
|
|
{
|
|
|
|
results.reserve(num_threads);
|
|
|
|
for (size_t i = 0; i < num_threads; ++i)
|
2019-07-07 23:25:15 +00:00
|
|
|
results.emplace_back(std::make_unique<Map>());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < num_threads; ++i)
|
|
|
|
{
|
|
|
|
auto begin = data.begin() + (data.size() * i) / num_threads;
|
|
|
|
auto end = data.begin() + (data.size() * (i + 1)) / num_threads;
|
|
|
|
auto & map = *results[i];
|
|
|
|
|
2019-10-17 14:41:27 +00:00
|
|
|
pool.scheduleOrThrowOnError([&, begin, end]()
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
for (auto it = begin; it != end; ++it)
|
|
|
|
{
|
2019-08-20 09:58:44 +00:00
|
|
|
typename Map::LookupResult place;
|
2017-04-01 07:20:54 +00:00
|
|
|
bool inserted;
|
|
|
|
map.emplace(*it, place, inserted);
|
|
|
|
|
|
|
|
if (inserted)
|
2019-10-29 15:16:51 +00:00
|
|
|
creator(place->getMapped());
|
2017-04-01 07:20:54 +00:00
|
|
|
else
|
2019-10-29 15:16:51 +00:00
|
|
|
updater(place->getMapped());
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
pool.wait();
|
|
|
|
}
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
2021-04-18 09:17:02 +00:00
|
|
|
#if !defined(__clang__)
|
2014-12-27 06:04:55 +00:00
|
|
|
#pragma GCC diagnostic push
|
|
|
|
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
2015-01-22 01:13:13 +00:00
|
|
|
#endif
|
2014-12-27 06:04:55 +00:00
|
|
|
|
|
|
|
template <typename Map>
|
|
|
|
struct AggregateIndependentWithSequentialKeysOptimization
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
template <typename Creator, typename Updater>
|
|
|
|
static void NO_INLINE execute(const Source & data, size_t num_threads, std::vector<std::unique_ptr<Map>> & results,
|
|
|
|
Creator && creator, Updater && updater,
|
|
|
|
ThreadPool & pool)
|
|
|
|
{
|
|
|
|
results.reserve(num_threads);
|
|
|
|
for (size_t i = 0; i < num_threads; ++i)
|
2019-07-07 23:25:15 +00:00
|
|
|
results.emplace_back(std::make_unique<Map>());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < num_threads; ++i)
|
|
|
|
{
|
|
|
|
auto begin = data.begin() + (data.size() * i) / num_threads;
|
|
|
|
auto end = data.begin() + (data.size() * (i + 1)) / num_threads;
|
|
|
|
auto & map = *results[i];
|
|
|
|
|
2019-10-17 14:41:27 +00:00
|
|
|
pool.scheduleOrThrowOnError([&, begin, end]()
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-08-20 09:58:44 +00:00
|
|
|
typename Map::LookupResult place = nullptr;
|
2017-04-01 07:20:54 +00:00
|
|
|
Key prev_key {};
|
|
|
|
for (auto it = begin; it != end; ++it)
|
|
|
|
{
|
|
|
|
if (it != begin && *it == prev_key)
|
|
|
|
{
|
2020-03-19 23:48:53 +00:00
|
|
|
assert(place != nullptr);
|
2019-10-29 15:16:51 +00:00
|
|
|
updater(place->getMapped());
|
2017-04-01 07:20:54 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
prev_key = *it;
|
|
|
|
|
|
|
|
bool inserted;
|
|
|
|
map.emplace(*it, place, inserted);
|
2020-03-19 23:48:53 +00:00
|
|
|
assert(place != nullptr);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (inserted)
|
2019-10-29 15:16:51 +00:00
|
|
|
creator(place->getMapped());
|
2017-04-01 07:20:54 +00:00
|
|
|
else
|
2019-10-29 15:16:51 +00:00
|
|
|
updater(place->getMapped());
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
pool.wait();
|
|
|
|
}
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
2021-04-18 09:17:02 +00:00
|
|
|
#if !defined(__clang__)
|
2014-12-27 06:04:55 +00:00
|
|
|
#pragma GCC diagnostic pop
|
2015-01-22 01:13:13 +00:00
|
|
|
#endif
|
2014-12-27 06:04:55 +00:00
|
|
|
|
|
|
|
|
|
|
|
template <typename Map>
|
|
|
|
struct MergeSequential
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
template <typename Merger>
|
|
|
|
static void NO_INLINE execute(Map ** source_maps, size_t num_maps, Map *& result_map,
|
|
|
|
Merger && merger,
|
2017-12-01 18:36:55 +00:00
|
|
|
ThreadPool &)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
for (size_t i = 1; i < num_maps; ++i)
|
|
|
|
{
|
|
|
|
auto begin = source_maps[i]->begin();
|
|
|
|
auto end = source_maps[i]->end();
|
|
|
|
for (auto it = begin; it != end; ++it)
|
2019-10-29 15:16:51 +00:00
|
|
|
merger((*source_maps[0])[it->getKey()], it->getMapped());
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
result_map = source_maps[0];
|
|
|
|
}
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
template <typename Map>
|
2017-03-25 20:12:56 +00:00
|
|
|
struct MergeSequentialTransposed /// In practice not better than usual.
|
2014-12-27 06:04:55 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
template <typename Merger>
|
|
|
|
static void NO_INLINE execute(Map ** source_maps, size_t num_maps, Map *& result_map,
|
|
|
|
Merger && merger,
|
2017-12-01 18:36:55 +00:00
|
|
|
ThreadPool &)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
std::vector<typename Map::iterator> iterators(num_maps);
|
|
|
|
for (size_t i = 1; i < num_maps; ++i)
|
|
|
|
iterators[i] = source_maps[i]->begin();
|
|
|
|
|
|
|
|
result_map = source_maps[0];
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
bool finish = true;
|
|
|
|
for (size_t i = 1; i < num_maps; ++i)
|
|
|
|
{
|
|
|
|
if (iterators[i] == source_maps[i]->end())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
finish = false;
|
2019-10-29 15:16:51 +00:00
|
|
|
merger((*result_map)[iterators[i]->getKey()], iterators[i]->getMapped());
|
2017-04-01 07:20:54 +00:00
|
|
|
++iterators[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (finish)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
template <typename Map, typename ImplMerge>
|
|
|
|
struct MergeParallelForTwoLevelTable
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
template <typename Merger>
|
|
|
|
static void NO_INLINE execute(Map ** source_maps, size_t num_maps, Map *& result_map,
|
|
|
|
Merger && merger,
|
|
|
|
ThreadPool & pool)
|
|
|
|
{
|
|
|
|
for (size_t bucket = 0; bucket < Map::NUM_BUCKETS; ++bucket)
|
2019-10-17 14:41:27 +00:00
|
|
|
pool.scheduleOrThrowOnError([&, bucket, num_maps]
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
std::vector<typename Map::Impl *> section(num_maps);
|
|
|
|
for (size_t i = 0; i < num_maps; ++i)
|
|
|
|
section[i] = &source_maps[i]->impls[bucket];
|
|
|
|
|
2019-01-04 13:54:39 +00:00
|
|
|
typename Map::Impl * res;
|
|
|
|
ImplMerge::execute(section.data(), num_maps, res, merger, pool);
|
2017-04-01 07:20:54 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
pool.wait();
|
|
|
|
result_map = source_maps[0];
|
|
|
|
}
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Map, typename Aggregate, typename Merge>
|
|
|
|
struct Work
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
template <typename Creator, typename Updater, typename Merger>
|
|
|
|
static void NO_INLINE execute(const Source & data, size_t num_threads,
|
|
|
|
Creator && creator, Updater && updater, Merger && merger,
|
|
|
|
ThreadPool & pool)
|
|
|
|
{
|
|
|
|
std::vector<std::unique_ptr<Map>> intermediate_results;
|
|
|
|
|
|
|
|
Stopwatch watch;
|
|
|
|
|
|
|
|
Aggregate::execute(data, num_threads, intermediate_results, std::forward<Creator>(creator), std::forward<Updater>(updater), pool);
|
|
|
|
size_t num_maps = intermediate_results.size();
|
|
|
|
|
|
|
|
watch.stop();
|
|
|
|
double time_aggregated = watch.elapsedSeconds();
|
|
|
|
std::cerr
|
|
|
|
<< "Aggregated in " << time_aggregated
|
|
|
|
<< " (" << data.size() / time_aggregated << " elem/sec.)"
|
|
|
|
<< std::endl;
|
|
|
|
|
|
|
|
size_t size_before_merge = 0;
|
|
|
|
std::cerr << "Sizes: ";
|
|
|
|
for (size_t i = 0; i < num_threads; ++i)
|
|
|
|
{
|
|
|
|
std::cerr << (i == 0 ? "" : ", ") << intermediate_results[i]->size();
|
|
|
|
size_before_merge += intermediate_results[i]->size();
|
|
|
|
}
|
|
|
|
std::cerr << std::endl;
|
|
|
|
|
|
|
|
watch.restart();
|
|
|
|
|
|
|
|
std::vector<Map*> intermediate_results_ptrs(num_maps);
|
|
|
|
for (size_t i = 0; i < num_maps; ++i)
|
|
|
|
intermediate_results_ptrs[i] = intermediate_results[i].get();
|
|
|
|
|
|
|
|
Map * result_map;
|
|
|
|
Merge::execute(intermediate_results_ptrs.data(), num_maps, result_map, std::forward<Merger>(merger), pool);
|
|
|
|
|
|
|
|
watch.stop();
|
|
|
|
double time_merged = watch.elapsedSeconds();
|
|
|
|
std::cerr
|
|
|
|
<< "Merged in " << time_merged
|
|
|
|
<< " (" << size_before_merge / time_merged << " elem/sec.)"
|
|
|
|
<< std::endl;
|
|
|
|
|
|
|
|
double time_total = time_aggregated + time_merged;
|
|
|
|
std::cerr
|
|
|
|
<< "Total in " << time_total
|
|
|
|
<< " (" << data.size() / time_total << " elem/sec.)"
|
|
|
|
<< std::endl;
|
|
|
|
std::cerr << "Size: " << result_map->size() << std::endl << std::endl;
|
|
|
|
}
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2016-05-28 10:35:44 +00:00
|
|
|
using Map = HashMap<Key, Value, HashCRC32<Key>>;
|
|
|
|
using MapTwoLevel = TwoLevelHashMap<Key, Value, HashCRC32<Key>>;
|
|
|
|
using Mutex = std::mutex;
|
2014-12-27 06:04:55 +00:00
|
|
|
|
|
|
|
|
|
|
|
struct Creator
|
|
|
|
{
|
2017-12-01 18:36:55 +00:00
|
|
|
void operator()(Value &) const {}
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
2021-04-18 09:17:02 +00:00
|
|
|
#if !defined(__clang__)
|
2014-12-27 06:04:55 +00:00
|
|
|
#pragma GCC diagnostic push
|
|
|
|
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
2015-01-22 01:13:13 +00:00
|
|
|
#endif
|
2014-12-27 06:04:55 +00:00
|
|
|
|
|
|
|
struct Updater
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
void operator()(Value & x) const { ++x; }
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
2021-04-18 09:17:02 +00:00
|
|
|
#if !defined(__clang__)
|
2014-12-27 06:04:55 +00:00
|
|
|
#pragma GCC diagnostic pop
|
2015-01-22 01:13:13 +00:00
|
|
|
#endif
|
2014-12-27 06:04:55 +00:00
|
|
|
|
|
|
|
struct Merger
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
void operator()(Value & dst, const Value & src) const { dst += src; }
|
2014-12-27 06:04:55 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
int main(int argc, char ** argv)
|
|
|
|
{
|
2020-03-18 03:27:32 +00:00
|
|
|
size_t n = std::stol(argv[1]);
|
|
|
|
size_t num_threads = std::stol(argv[2]);
|
|
|
|
size_t method = argc <= 3 ? 0 : std::stol(argv[3]);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
std::cerr << std::fixed << std::setprecision(2);
|
|
|
|
|
|
|
|
ThreadPool pool(num_threads);
|
|
|
|
|
|
|
|
Source data(n);
|
|
|
|
|
|
|
|
{
|
|
|
|
Stopwatch watch;
|
|
|
|
DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
|
|
|
|
DB::CompressedReadBuffer in2(in1);
|
|
|
|
|
2018-09-02 03:00:04 +00:00
|
|
|
in2.readStrict(reinterpret_cast<char*>(data.data()), sizeof(data[0]) * n);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
watch.stop();
|
|
|
|
std::cerr << std::fixed << std::setprecision(2)
|
|
|
|
<< "Vector. Size: " << n
|
|
|
|
<< ", elapsed: " << watch.elapsedSeconds()
|
|
|
|
<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
|
|
|
|
<< std::endl << std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
Creator creator;
|
|
|
|
Updater updater;
|
|
|
|
Merger merger;
|
|
|
|
|
|
|
|
if (!method || method == 1)
|
|
|
|
Work<
|
|
|
|
Map,
|
|
|
|
AggregateIndependent<Map>,
|
|
|
|
MergeSequential<Map>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 2)
|
|
|
|
Work<
|
|
|
|
Map,
|
|
|
|
AggregateIndependentWithSequentialKeysOptimization<Map>,
|
|
|
|
MergeSequential<Map>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 3)
|
|
|
|
Work<
|
|
|
|
Map,
|
|
|
|
AggregateIndependent<Map>,
|
|
|
|
MergeSequentialTransposed<Map>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 4)
|
|
|
|
Work<
|
|
|
|
Map,
|
|
|
|
AggregateIndependentWithSequentialKeysOptimization<Map>,
|
|
|
|
MergeSequentialTransposed<Map>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 5)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependent<MapTwoLevel>,
|
|
|
|
MergeSequential<MapTwoLevel>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 6)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependentWithSequentialKeysOptimization<MapTwoLevel>,
|
|
|
|
MergeSequential<MapTwoLevel>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 7)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependent<MapTwoLevel>,
|
|
|
|
MergeSequentialTransposed<MapTwoLevel>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 8)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependentWithSequentialKeysOptimization<MapTwoLevel>,
|
|
|
|
MergeSequentialTransposed<MapTwoLevel>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 9)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependent<MapTwoLevel>,
|
|
|
|
MergeParallelForTwoLevelTable<MapTwoLevel, MergeSequential<MapTwoLevel::Impl>>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 10)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependentWithSequentialKeysOptimization<MapTwoLevel>,
|
|
|
|
MergeParallelForTwoLevelTable<MapTwoLevel, MergeSequential<MapTwoLevel::Impl>>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 13)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependent<MapTwoLevel>,
|
|
|
|
MergeParallelForTwoLevelTable<MapTwoLevel, MergeSequentialTransposed<MapTwoLevel::Impl>>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
if (!method || method == 14)
|
|
|
|
Work<
|
|
|
|
MapTwoLevel,
|
|
|
|
AggregateIndependentWithSequentialKeysOptimization<MapTwoLevel>,
|
|
|
|
MergeParallelForTwoLevelTable<MapTwoLevel, MergeSequentialTransposed<MapTwoLevel::Impl>>
|
|
|
|
>::execute(data, num_threads, creator, updater, merger, pool);
|
|
|
|
|
|
|
|
return 0;
|
2014-12-27 06:04:55 +00:00
|
|
|
}
|