ClickHouse/src/Functions/hilbertEncode.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

285 lines
9.9 KiB
C++
Raw Normal View History

2024-05-28 17:59:06 +00:00
#include <Common/BitHelpers.h>
#include <Functions/FunctionSpaceFillingCurve.h>
#include <Functions/PerformanceAdaptors.h>
#include <limits>
#include <optional>
2024-02-19 20:21:52 +00:00
#include <Functions/FunctionFactory.h>
2024-02-22 22:06:52 +00:00
namespace DB
{
2024-05-28 17:59:06 +00:00
/// Error codes used by this file. They are only declared here (`extern`); no definition is visible in this file.
namespace ErrorCodes
{
/// Thrown by hilbertEncode when more than 2 dimensions are requested.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
/// Thrown by hilbertEncode when an entry of the range mask is greater than 32.
extern const int ARGUMENT_OUT_OF_BOUND;
}
/// Lookup tables consumed by FunctionHilbertEncode2DWIthLookupTableImpl.
///
/// A table for a given bit_step is indexed with `state | (x_bits << bit_step) | y_bits`:
/// the low bit_step bits are a chunk of y, the next bit_step bits are a chunk of x,
/// and the two bits above them are the current state.
/// Every entry has the same shape: the two top bits hold the next state and the
/// low 2 * bit_step bits hold the chunk of the Hilbert code.
namespace HilbertDetails
{
/// Primary template: an empty placeholder table.
/// Only the specializations below (bit_step = 1, 2, 3) carry data.
template <UInt8 bit_step>
class HilbertEncodeLookupTable
{
public:
constexpr static UInt8 LOOKUP_TABLE[0] = {};
};
/// bit_step = 1: LOOKUP_TABLE[SSXY] = SSHH (16 entries).
template <>
class HilbertEncodeLookupTable<1>
{
public:
constexpr static UInt8 LOOKUP_TABLE[16] = {
4, 1, 11, 2,
0, 15, 5, 6,
10, 9, 3, 12,
14, 7, 13, 8
};
};
/// bit_step = 2: LOOKUP_TABLE[SSXXYY] = SSHHHH (64 entries).
template <>
class HilbertEncodeLookupTable<2>
{
public:
constexpr static UInt8 LOOKUP_TABLE[64] = {
0, 51, 20, 5, 17, 18, 39, 6,
46, 45, 24, 9, 15, 60, 43, 10,
16, 1, 62, 31, 35, 2, 61, 44,
4, 55, 8, 59, 21, 22, 25, 26,
42, 41, 38, 37, 11, 56, 7, 52,
28, 13, 50, 19, 47, 14, 49, 32,
58, 27, 12, 63, 57, 40, 29, 30,
54, 23, 34, 33, 53, 36, 3, 48
};
};
/// bit_step = 3: LOOKUP_TABLE[SSXXXYYY] = SSHHHHHH (256 entries).
/// This is the table the hilbertEncode function uses for two dimensions.
template <>
class HilbertEncodeLookupTable<3>
{
public:
constexpr static UInt8 LOOKUP_TABLE[256] = {
64, 1, 206, 79, 16, 211, 84, 21, 131, 2, 205, 140, 81, 82, 151, 22, 4,
199, 8, 203, 158, 157, 88, 25, 69, 70, 73, 74, 31, 220, 155, 26, 186,
185, 182, 181, 32, 227, 100, 37, 59, 248, 55, 244, 97, 98, 167, 38, 124,
61, 242, 115, 174, 173, 104, 41, 191, 62, 241, 176, 47, 236, 171, 42, 0,
195, 68, 5, 250, 123, 60, 255, 65, 66, 135, 6, 249, 184, 125, 126, 142,
141, 72, 9, 246, 119, 178, 177, 15, 204, 139, 10, 245, 180, 51, 240, 80,
17, 222, 95, 96, 33, 238, 111, 147, 18, 221, 156, 163, 34, 237, 172, 20,
215, 24, 219, 36, 231, 40, 235, 85, 86, 89, 90, 101, 102, 105, 106, 170,
169, 166, 165, 154, 153, 150, 149, 43, 232, 39, 228, 27, 216, 23, 212, 108,
45, 226, 99, 92, 29, 210, 83, 175, 46, 225, 160, 159, 30, 209, 144, 48,
243, 116, 53, 202, 75, 12, 207, 113, 114, 183, 54, 201, 136, 77, 78, 190,
189, 120, 57, 198, 71, 130, 129, 63, 252, 187, 58, 197, 132, 3, 192, 234,
107, 44, 239, 112, 49, 254, 127, 233, 168, 109, 110, 179, 50, 253, 188, 230,
103, 162, 161, 52, 247, 56, 251, 229, 164, 35, 224, 117, 118, 121, 122, 218,
91, 28, 223, 138, 137, 134, 133, 217, 152, 93, 94, 11, 200, 7, 196, 214,
87, 146, 145, 76, 13, 194, 67, 213, 148, 19, 208, 143, 14, 193, 128,
};
};
}
/** Table-driven encoder of a 2D point (x, y) into its Hilbert curve code.
  *
  * The coordinates are consumed from the most significant used bits downwards,
  * bit_step bits of each coordinate per iteration. Every iteration performs one
  * lookup in HilbertDetails::HilbertEncodeLookupTable<bit_step>, which yields
  * 2 * bit_step bits of the code and the state for the next iteration.
  *
  * bit_step must not exceed 3 so that a table index and a table entry
  * (2 bits of state + 2 * bit_step bits of payload) fit in UInt8.
  */
template <UInt8 bit_step>
class FunctionHilbertEncode2DWIthLookupTableImpl
{
    static_assert(bit_step <= 3, "bit_step should not be more than 3 to fit in UInt8");
public:
    /// Returns the Hilbert code of the point (x, y).
    /// The code is twice as wide as the coordinates, so each coordinate may use
    /// at most 32 bits; if either of them needs more, 0 is returned.
    static UInt64 encode(UInt64 x, UInt64 y)
    {
        UInt64 hilbert_code = 0;
        const auto leading_zeros_count = getLeadingZeroBits(x | y);
        const auto used_bits = std::numeric_limits<UInt64>::digits - leading_zeros_count;

        /// The code would not fit in UInt64. Without this guard the shift below
        /// reaches 64 or more, which is undefined behaviour, or silently drops
        /// the highest bits of the code. Overflow is clamped to zero instead.
        if (used_bits > MAX_INPUT_BITS)
            return 0;

        auto [current_shift, state] = getInitialShiftAndState(used_bits);
        while (current_shift >= 0)
        {
            const UInt8 x_bits = (x >> current_shift) & STEP_MASK;
            const UInt8 y_bits = (y >> current_shift) & STEP_MASK;
            const auto hilbert_bits = getCodeAndUpdateState(x_bits, y_bits, state);
            hilbert_code |= (hilbert_bits << getHilbertShift(current_shift));
            current_shift -= bit_step;
        }
        return hilbert_code;
    }

private:
    /// Performs one table lookup: returns the code bits for the given chunk of
    /// x and y and replaces `state` with the state for the next chunk.
    ///
    /// for bit_step = 3
    /// LOOKUP_TABLE[SSXXXYYY] = SSHHHHHH
    /// where SS - 2 bits for state, XXX - 3 bits of x, YYY - 3 bits of y
    /// State is rotation of curve on every step, left/up/right/down - therefore 2 bits
    static UInt64 getCodeAndUpdateState(UInt8 x_bits, UInt8 y_bits, UInt8 & state)
    {
        const UInt8 table_index = state | (x_bits << bit_step) | y_bits;
        const auto table_code = HilbertDetails::HilbertEncodeLookupTable<bit_step>::LOOKUP_TABLE[table_index];
        state = table_code & STATE_MASK;
        return table_code & HILBERT_MASK;
    }

    /// Position of a chunk inside the code: the hilbert code is double the size of the input values.
    static constexpr UInt8 getHilbertShift(UInt8 shift)
    {
        return shift << 1;
    }

    /// Returns the shift of the most significant chunk and the state to start with.
    /// The number of iterations is used_bits / bit_step rounded up; the initial
    /// state is chosen by the parity of that number.
    /// For used_bits == 0 the returned shift is negative, so encode() performs no iterations.
    static std::pair<Int8, UInt8> getInitialShiftAndState(UInt8 used_bits)
    {
        UInt8 iterations = used_bits / bit_step;
        Int8 initial_shift = iterations * bit_step;
        if (initial_shift < used_bits)
        {
            /// used_bits is not a multiple of bit_step: one more (partial) chunk on top.
            ++iterations;
        }
        else
        {
            /// The last full chunk starts bit_step bits below the top.
            initial_shift -= bit_step;
        }
        UInt8 state = iterations % 2 == 0 ? LEFT_STATE : DEFAULT_STATE;
        return {initial_shift, state};
    }

    /// Each coordinate may occupy at most half of the bits of the resulting code.
    constexpr static UInt8 MAX_INPUT_BITS = std::numeric_limits<UInt64>::digits / 2;
    /// Selects bit_step low bits of a shifted coordinate.
    constexpr static UInt8 STEP_MASK = (1 << bit_step) - 1;
    /// Width of the code payload inside a table entry (2 * bit_step).
    constexpr static UInt8 HILBERT_SHIFT = getHilbertShift(bit_step);
    /// Selects the code payload of a table entry.
    constexpr static UInt8 HILBERT_MASK = (1 << HILBERT_SHIFT) - 1;
    /// Selects the two state bits of a table entry.
    constexpr static UInt8 STATE_MASK = 0b11 << HILBERT_SHIFT;
    constexpr static UInt8 LEFT_STATE = 0b01 << HILBERT_SHIFT;
    constexpr static UInt8 DEFAULT_STATE = bit_step % 2 == 0 ? LEFT_STATE : 0;
};
/** SQL function hilbertEncode.
  *
  * Accepts either plain unsigned integer arguments, or a tuple (the range mask)
  * followed by the integer arguments. Every mask entry is the number of bits the
  * corresponding argument is shifted left by before encoding.
  * One dimension yields the (shifted) argument itself, two dimensions yield the
  * 2D Hilbert code, more dimensions are rejected.
  */
class FunctionHilbertEncode : public FunctionSpaceFillingCurveEncode
{
public:
    static constexpr auto name = "hilbertEncode";

    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionHilbertEncode>(); }

    String getName() const override { return name; }

    /// Computes the result column of `input_rows_count` UInt64 codes.
    /// Throws ARGUMENT_OUT_OF_BOUND if a mask entry is greater than 32 and
    /// ILLEGAL_TYPE_OF_ARGUMENT if there are more than 2 dimensions.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        /// The mask, if present, is the first argument: a tuple, possibly wrapped into a constant column.
        const auto * const_wrapper = typeid_cast<const ColumnConst *>(arguments[0].column.get());
        const ColumnTuple * mask = const_wrapper
            ? typeid_cast<const ColumnTuple *>(const_wrapper->getDataColumnPtr().get())
            : typeid_cast<const ColumnTuple *>(arguments[0].column.get());

        /// With a mask the dimensions are counted by the tuple and the values start at the second argument.
        const size_t num_dimensions = mask ? mask->tupleSize() : arguments.size();
        const size_t vector_start_index = mask ? 1 : 0;

        if (mask)
        {
            for (size_t dimension = 0; dimension < num_dimensions; ++dimension)
            {
                /// Only the first row of the mask is read.
                if (mask->getColumn(dimension).getUInt(0) > 32)
                    throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
                        "Illegal argument {} of function {}, should be a number in range 0-32",
                        arguments[0].column->getName(), getName());
            }
        }

        /// Work on a copy of the arguments with all constant columns materialized.
        auto full_arguments = arguments;
        for (auto & full_argument : full_arguments)
            full_argument.column = full_argument.column->convertToFullColumnIfConst();

        auto result_column = ColumnUInt64::create();
        ColumnUInt64::Container & codes = result_column->getData();
        codes.resize(input_rows_count);

        /// Applies the per-dimension left shift from the mask, or leaves the value as is without a mask.
        const auto expand = [mask](const UInt64 value, const UInt8 column_id)
        {
            return mask ? value << mask->getColumn(column_id).getUInt(0) : value;
        };

        const ColumnPtr & first_dimension = full_arguments[vector_start_index].column;
        if (num_dimensions == 1)
        {
            /// A single dimension needs no curve: the code is the expanded value.
            for (size_t row = 0; row < input_rows_count; ++row)
                codes[row] = expand(first_dimension->getUInt(row), 0);
            return result_column;
        }

        const ColumnPtr & second_dimension = full_arguments[vector_start_index + 1].column;
        if (num_dimensions != 2)
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal number of UInt arguments of function {}: should be not more than 2 dimensions",
                getName());

        for (size_t row = 0; row < input_rows_count; ++row)
        {
            const UInt64 x = expand(first_dimension->getUInt(row), 0);
            const UInt64 y = expand(second_dimension->getUInt(row), 1);
            codes[row] = FunctionHilbertEncode2DWIthLookupTableImpl<3>::encode(x, y);
        }
        return result_column;
    }
};
/// Registers hilbertEncode in the function factory together with its user-facing documentation.
/// The `[example:<name>]` placeholders in the description refer to the entries of `.examples`.
REGISTER_FUNCTION(HilbertEncode)
{
    factory.registerFunction<FunctionHilbertEncode>(FunctionDocumentation{
        .description=R"(
Calculates code for Hilbert Curve for a list of unsigned integers
The function has two modes of operation:
- Simple
- Expanded
Simple: accepts up to 2 unsigned integers as arguments and produces a UInt64 code.
[example:simple]
Produces: `31`
Expanded: accepts a range mask (tuple) as a first argument and up to 2 unsigned integers as other arguments.
Each number in the mask configures the number of bits by which the corresponding argument will be shifted left, effectively scaling the argument within its range.
[example:range_expanded]
Produces: `4031541586602`
Note: tuple size must be equal to the number of the other arguments
Range expansion can be beneficial when you need a similar distribution for arguments with wildly different ranges (or cardinality)
For example: 'IP Address' (0...FFFFFFFF) and 'Country code' (0...FF)
For a single argument without a tuple, the function returns the argument itself as the Hilbert index, since no dimensional mapping is needed.
[example:identity]
Produces: `1`
If a single argument is provided with a tuple specifying bit shifts, the function shifts the argument left by the specified number of bits.
[example:identity_expanded]
Produces: `512`
The function also accepts columns as arguments:
[example:from_table]
But the range tuple must still be a constant:
[example:from_table_range]
Please note that you can fit only so much bits of information into Hilbert code as UInt64 has.
Two arguments will have a range of maximum 2^32 (64/2) each
All overflow will be clamped to zero
)",
        .examples{
            {"simple", "SELECT hilbertEncode(3, 4)", ""},
            {"range_expanded", "SELECT hilbertEncode((10,6), 1024, 16)", ""},
            {"identity", "SELECT hilbertEncode(1)", ""},
            {"identity_expanded", "SELECT hilbertEncode(tuple(2), 128)", ""},
            {"from_table", "SELECT hilbertEncode(n1, n2) FROM table", ""},
            {"from_table_range", "SELECT hilbertEncode((1,2), n1, n2) FROM table", ""},
        },
        .categories {"Hilbert coding", "Hilbert Curve"}
    });
}
}