ClickHouse/src/Functions/generateSnowflakeID.cpp
2024-05-28 09:22:52 +00:00

221 lines
9.2 KiB
C++

#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsRandom.h>
#include <Functions/FunctionHelpers.h>
#include <Core/ServerUUID.h>
#include <Common/Logger.h>
#include <Common/logger_useful.h>
#include "base/types.h"
namespace DB
{
namespace
{
/* Snowflake ID
https://en.wikipedia.org/wiki/Snowflake_ID
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
|0| timestamp |
├─┼ ┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
| | machine_id | machine_seq_num |
└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
- The first 41 (+ 1 top zero bit) bits is the timestamp (millisecond since Unix epoch 1 Jan 1970)
- The middle 10 bits are the machine ID
- The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by different processes
*/
/// bit counts
constexpr auto timestamp_bits_count = 41;
constexpr auto machine_id_bits_count = 10;
constexpr auto machine_seq_num_bits_count = 12;
/// bits masks for Snowflake ID components
constexpr uint64_t machine_id_mask = ((1ull << machine_id_bits_count) - 1) << machine_seq_num_bits_count;
constexpr uint64_t machine_seq_num_mask = (1ull << machine_seq_num_bits_count) - 1;
/// max values
constexpr uint64_t max_machine_seq_num = machine_seq_num_mask;
uint64_t getTimestamp()
{
auto now = std::chrono::system_clock::now();
auto ticks_since_epoch = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
return static_cast<uint64_t>(ticks_since_epoch) & ((1ull << timestamp_bits_count) - 1);
}
uint64_t getMachineIdImpl()
{
UUID server_uuid = ServerUUID::get();
/// hash into 64 bits
uint64_t hi = UUIDHelpers::getHighBytes(server_uuid);
uint64_t lo = UUIDHelpers::getLowBytes(server_uuid);
/// return only 10 bits
return (((hi * 11) ^ (lo * 17)) & machine_id_mask) >> machine_seq_num_bits_count;
}
uint64_t getMachineId()
{
static uint64_t machine_id = getMachineIdImpl();
return machine_id;
}
struct SnowflakeId
{
uint64_t timestamp;
uint64_t machine_id;
uint64_t machine_seq_num;
};
SnowflakeId toSnowflakeId(uint64_t snowflake)
{
return {.timestamp = (snowflake >> (machine_id_bits_count + machine_seq_num_bits_count)),
.machine_id = ((snowflake & machine_id_mask) >> machine_seq_num_bits_count),
.machine_seq_num = (snowflake & machine_seq_num_mask)};
}
uint64_t fromSnowflakeId(SnowflakeId components)
{
return (components.timestamp << (machine_id_bits_count + machine_seq_num_bits_count) |
components.machine_id << (machine_seq_num_bits_count) |
components.machine_seq_num);
}
struct SnowflakeIdRange
{
SnowflakeId begin; /// inclusive
SnowflakeId end; /// exclusive
};
/// To get the range of `input_rows_count` Snowflake IDs from `max(available, now)`:
/// 1. calculate Snowflake ID by current timestamp (`now`)
/// 2. `begin = max(available, now)`
/// 3. Calculate `end = begin + input_rows_count` handling `machine_seq_num` overflow
SnowflakeIdRange getRangeOfAvailableIds(const SnowflakeId & available, size_t input_rows_count)
{
/// 1. `now`
SnowflakeId begin = {.timestamp = getTimestamp(), .machine_id = getMachineId(), .machine_seq_num = 0};
/// 2. `begin`
if (begin.timestamp <= available.timestamp)
{
begin.timestamp = available.timestamp;
begin.machine_seq_num = available.machine_seq_num;
}
/// 3. `end = begin + input_rows_count`
SnowflakeId end;
const uint64_t seq_nums_in_current_timestamp_left = (max_machine_seq_num - begin.machine_seq_num + 1);
if (input_rows_count >= seq_nums_in_current_timestamp_left)
/// if sequence numbers in current timestamp is not enough for rows --> depending on how many elements input_rows_count overflows, forward timestamp by at least 1 tick
end.timestamp = begin.timestamp + 1 + (input_rows_count - seq_nums_in_current_timestamp_left) / (max_machine_seq_num + 1);
else
end.timestamp = begin.timestamp;
end.machine_id = begin.machine_id;
end.machine_seq_num = (begin.machine_seq_num + input_rows_count) & machine_seq_num_mask;
return {begin, end};
}
struct Data
{
/// Guarantee counter monotonicity within one timestamp across all threads generating Snowflake IDs simultaneously.
static inline std::atomic<uint64_t> lowest_available_snowflake_id = 0;
SnowflakeId reserveRange(size_t input_rows_count)
{
uint64_t available_snowflake_id = lowest_available_snowflake_id.load();
SnowflakeIdRange range;
do
{
range = getRangeOfAvailableIds(toSnowflakeId(available_snowflake_id), input_rows_count);
}
while (!lowest_available_snowflake_id.compare_exchange_weak(available_snowflake_id, fromSnowflakeId(range.end)));
/// CAS failed --> another thread updated `lowest_available_snowflake_id` and we re-try
/// else --> our thread reserved ID range [begin, end) and return the beginning of the range
return range.begin;
}
};
}
class FunctionGenerateSnowflakeID : public IFunction
{
public:
static constexpr auto name = "generateSnowflakeID";
static FunctionPtr create(ContextPtr /*context*/) { return std::make_shared<FunctionGenerateSnowflakeID>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 0; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
bool useDefaultImplementationForNulls() const override { return false; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
bool isVariadic() const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
FunctionArgumentDescriptors mandatory_args;
FunctionArgumentDescriptors optional_args{
{"expr", nullptr, nullptr, "Arbitrary expression"}
};
validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args);
return std::make_shared<DataTypeUInt64>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & /*arguments*/, const DataTypePtr &, size_t input_rows_count) const override
{
auto col_res = ColumnVector<UInt64>::create();
typename ColumnVector<UInt64>::Container & vec_to = col_res->getData();
if (input_rows_count != 0)
{
vec_to.resize(input_rows_count);
Data data;
SnowflakeId snowflake_id = data.reserveRange(input_rows_count); /// returns begin of available snowflake ids range
for (UInt64 & to_row : vec_to)
{
to_row = fromSnowflakeId(snowflake_id);
if (snowflake_id.machine_seq_num == max_machine_seq_num)
{
/// handle overflow
snowflake_id.machine_seq_num = 0;
++snowflake_id.timestamp;
}
else
{
++snowflake_id.machine_seq_num;
}
}
}
return col_res;
}
};
REGISTER_FUNCTION(GenerateSnowflakeID)
{
FunctionDocumentation::Description description = R"(Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. Function generateSnowflakeID guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.)";
FunctionDocumentation::Syntax syntax = "generateSnowflakeID([expression])";
FunctionDocumentation::Arguments arguments = {{"expression", "The expression is used to bypass common subexpression elimination if the function is called multiple times in a query but otherwise ignored. Optional."}};
FunctionDocumentation::ReturnedValue returned_value = "A value of type UInt64";
FunctionDocumentation::Examples examples = {{"single", "SELECT generateSnowflakeID()", "7201148511606784000"}, {"multiple", "SELECT generateSnowflakeID(1), generateSnowflakeID(2)", ""}};
FunctionDocumentation::Categories categories = {"Snowflake ID"};
factory.registerFunction<FunctionGenerateSnowflakeID>({description, syntax, arguments, returned_value, examples, categories});
}
}