ClickHouse/src/Functions/hilbertEncode.cpp

#include "hilbertEncode2DLUT.h"
#include <Common/BitHelpers.h>
#include <Functions/PerformanceAdaptors.h>
#include <limits>
#include <optional>
#include <Functions/FunctionFactory.h>


namespace DB
{

/// Error codes used by this file. They are only declared here (extern); the definitions are not in this file.
namespace ErrorCodes
{
    /// Thrown by FunctionHilbertEncode::executeImpl when the number of dimensions is neither 1 nor 2.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Thrown by FunctionHilbertEncode::executeImpl when an element of the range mask is greater than 32.
    extern const int ARGUMENT_OUT_OF_BOUND;
}


/// Implementation of the SQL function `hilbertEncode`.
///
/// Two call shapes are handled by executeImpl:
///  - plain:    hilbertEncode(a[, b])               -- the arguments are used as is;
///  - expanded: hilbertEncode((s0[, s1]), a[, b])   -- the first argument is a tuple ("mask") of left-shift
///              amounts, one per dimension; every value is shifted left by its amount before encoding.
///
/// With one dimension the (optionally shifted) value is returned unchanged; with two dimensions the values are
/// passed to FunctionHilbertEncode2DWIthLookupTableImpl<3>::encode. Any other dimension count is rejected.
class FunctionHilbertEncode : public FunctionSpaceFillingCurveEncode
{
public:
    static constexpr auto name = "hilbertEncode";

    static FunctionPtr create(ContextPtr)
    {
        return std::make_shared<FunctionHilbertEncode>();
    }

    String getName() const override { return name; }

    /// Computes the Hilbert code for every row.
    ///
    /// @param arguments          Either the coordinate columns, or a tuple mask followed by the coordinate columns.
    /// @param input_rows_count   Number of rows to produce.
    /// @return A ColumnUInt64 with `input_rows_count` values (an empty column when there are no rows).
    /// @throws Exception(ARGUMENT_OUT_OF_BOUND)     if any mask element is greater than 32.
    /// @throws Exception(ILLEGAL_TYPE_OF_ARGUMENT)  if the number of dimensions is neither 1 nor 2.
    ///
    /// NOTE(review): this method indexes `arguments` without checking its size; it presumably relies on the
    /// base class having validated the argument count against the mask size beforehand -- confirm.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        if (input_rows_count == 0)
            return ColumnUInt64::create();

        /// Only one or two dimensions are ever encoded, so at most two shift amounts are needed.
        static constexpr size_t max_encoded_dimensions = 2;

        size_t num_dimensions = arguments.size();
        size_t vector_start_index = 0;

        /// The mask can arrive either wrapped into a ColumnConst or as a plain tuple column.
        const auto * const_col = typeid_cast<const ColumnConst *>(arguments[0].column.get());
        const ColumnTuple * mask = const_col
            ? typeid_cast<const ColumnTuple *>(const_col->getDataColumnPtr().get())
            : typeid_cast<const ColumnTuple *>(arguments[0].column.get());

        /// Per-dimension left shifts. They stay zero (identity) when no mask is given.
        /// The mask is read at row 0 only, so the shifts are loop-invariant: fetch them once here
        /// instead of performing a virtual getUInt() call for every value of every row.
        UInt64 shifts[max_encoded_dimensions] = {0, 0};
        if (mask)
        {
            num_dimensions = mask->tupleSize();
            vector_start_index = 1;
            for (size_t i = 0; i < num_dimensions; ++i)
            {
                const UInt64 ratio = mask->getColumn(i).getUInt(0);
                if (ratio > 32)
                    throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
                                    "Illegal argument {} of function {}, should be a number in range 0-32",
                                    arguments[0].column->getName(), getName());

                /// Every element is validated, but only the ones that can be used are remembered.
                if (i < max_encoded_dimensions)
                    shifts[i] = ratio;
            }
        }

        auto col_res = ColumnUInt64::create();
        ColumnUInt64::Container & vec_res = col_res->getData();
        vec_res.resize(input_rows_count);

        const ColumnPtr & col0 = arguments[0 + vector_start_index].column;
        if (num_dimensions == 1)
        {
            /// A single dimension needs no curve mapping: the (shifted) value is the code itself.
            for (size_t i = 0; i < input_rows_count; ++i)
                vec_res[i] = col0->getUInt(i) << shifts[0];
            return col_res;
        }

        if (num_dimensions == 2)
        {
            /// The second coordinate column is looked up only on the path that actually uses it.
            const ColumnPtr & col1 = arguments[1 + vector_start_index].column;
            for (size_t i = 0; i < input_rows_count; ++i)
            {
                vec_res[i] = FunctionHilbertEncode2DWIthLookupTableImpl<3>::encode(
                    col0->getUInt(i) << shifts[0],
                    col1->getUInt(i) << shifts[1]);
            }
            return col_res;
        }

        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                        "Illegal number of UInt arguments of function {}: should be not more than 2 dimensions",
                        getName());
    }
};


/// Registers FunctionHilbertEncode in the function factory together with its user-facing documentation
/// (description text, example queries and categories).
REGISTER_FUNCTION(HilbertEncode)
{
    factory.registerFunction<FunctionHilbertEncode>(FunctionDocumentation{
        /// The `[example:<name>]` markers inside the description use the same names as the entries of
        /// `.examples` below; presumably they are replaced by the example queries when the documentation
        /// is rendered (the rendering code is not part of this file).
        .description=R"(
Calculates code for Hilbert Curve for a list of unsigned integers.

The function has two modes of operation:
- Simple
- Expanded

Simple: accepts up to 2 unsigned integers as arguments and produces a UInt64 code.
[example:simple]
Produces: `31`

Expanded: accepts a range mask (tuple) as a first argument and up to 2 unsigned integers as other arguments.
Each number in the mask configures the number of bits by which the corresponding argument will be shifted left, effectively scaling the argument within its range.
[example:range_expanded]
Produces: `4031541586602`
Note: tuple size must be equal to the number of the other arguments

Range expansion can be beneficial when you need a similar distribution for arguments with wildly different ranges (or cardinality)
For example: 'IP Address' (0...FFFFFFFF) and 'Country code' (0...FF)

For a single argument without a tuple, the function returns the argument itself as the Hilbert index, since no dimensional mapping is needed.
[example:identity]
Produces: `1`

If a single argument is provided with a tuple specifying bit shifts, the function shifts the argument left by the specified number of bits.
[example:identity_expanded]
Produces: `512`

The function also accepts columns as arguments:
[example:from_table]

But the range tuple must still be a constant:
[example:from_table_range]

Please note that you can fit only so much bits of information into Hilbert code as UInt64 has.
Two arguments will have a range of maximum 2^32 (64/2) each
All overflow will be clamped to zero
)",
        /// Each entry is {name, query, third field}. The third field is left empty for every example here;
        /// the expected outputs are stated in the description text ("Produces: ...") instead.
        .examples{
            {"simple", "SELECT hilbertEncode(3, 4)", ""},
            {"range_expanded", "SELECT hilbertEncode((10,6), 1024, 16)", ""},
            {"identity", "SELECT hilbertEncode(1)", ""},
            {"identity_expanded", "SELECT hilbertEncode(tuple(2), 128)", ""},
            {"from_table", "SELECT hilbertEncode(n1, n2) FROM table", ""},
            {"from_table_range", "SELECT hilbertEncode((1,2), n1, n2) FROM table", ""},
        },
        .categories {"Hilbert coding", "Hilbert Curve"}
    });
}

}
fix gtest by adding header with Impl 2024-05-29 09:58:39 +00:00			`#include "hilbertEncode2DLUT.h"`
review 2024-05-28 17:59:06 +00:00			`#include <Common/BitHelpers.h>`
			`#include <Functions/PerformanceAdaptors.h>`
			`#include <limits>`
			`#include <optional>`
hilbert encode function added 2024-02-19 20:21:52 +00:00			`#include <Functions/FunctionFactory.h>`
initial file of hilbertEncode + separate common functions code 2024-02-18 23:07:39 +00:00

style check 2024-02-22 22:06:52 +00:00			`namespace DB`
			`{`
initial file of hilbertEncode + separate common functions code 2024-02-18 23:07:39 +00:00
review 2024-05-28 17:59:06 +00:00			`namespace ErrorCodes`
			`{`
			`extern const int ILLEGAL_TYPE_OF_ARGUMENT;`
			`extern const int ARGUMENT_OUT_OF_BOUND;`
			`}`


			`class FunctionHilbertEncode : public FunctionSpaceFillingCurveEncode`
			`{`
			`public:`
			`static constexpr auto name = "hilbertEncode";`
			`static FunctionPtr create(ContextPtr)`
			`{`
			`return std::make_shared<FunctionHilbertEncode>();`
			`}`

			`String getName() const override { return name; }`

			`ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override`
			`{`
hilbert 2024-05-29 17:36:40 +00:00			`if (input_rows_count == 0)`
			`return ColumnUInt64::create();`
style 2024-05-29 17:42:59 +00:00
review 2024-05-28 17:59:06 +00:00			`size_t num_dimensions = arguments.size();`
			`size_t vector_start_index = 0;`
			`const auto * const_col = typeid_cast<const ColumnConst *>(arguments[0].column.get());`
			`const ColumnTuple * mask;`
			`if (const_col)`
			`mask = typeid_cast<const ColumnTuple *>(const_col->getDataColumnPtr().get());`
			`else`
			`mask = typeid_cast<const ColumnTuple *>(arguments[0].column.get());`
			`if (mask)`
			`{`
			`num_dimensions = mask->tupleSize();`
			`vector_start_index = 1;`
			`for (size_t i = 0; i < num_dimensions; i++)`
			`{`
			`auto ratio = mask->getColumn(i).getUInt(0);`
			`if (ratio > 32)`
			`throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,`
			`"Illegal argument {} of function {}, should be a number in range 0-32",`
			`arguments[0].column->getName(), getName());`
			`}`
			`}`

			`auto col_res = ColumnUInt64::create();`
			`ColumnUInt64::Container & vec_res = col_res->getData();`
			`vec_res.resize(input_rows_count);`

review fixes 2024-06-01 09:49:26 +00:00			`const auto expand = [mask](const UInt64 value, const UInt8 column_num)`
review 2024-05-28 17:59:06 +00:00			`{`
			`if (mask)`
review fixes 2024-06-01 09:49:26 +00:00			`return value << mask->getColumn(column_num).getUInt(0);`
review 2024-05-28 17:59:06 +00:00			`return value;`
			`};`

review fixes 2024-06-01 09:49:26 +00:00			`const ColumnPtr & col0 = arguments[0 + vector_start_index].column;`
review 2024-05-28 17:59:06 +00:00			`if (num_dimensions == 1)`
			`{`
			`for (size_t i = 0; i < input_rows_count; ++i)`
			`{`
			`vec_res[i] = expand(col0->getUInt(i), 0);`
			`}`
			`return col_res;`
			`}`

review fixes 2024-06-01 09:49:26 +00:00			`const ColumnPtr & col1 = arguments[1 + vector_start_index].column;`
review 2024-05-28 17:59:06 +00:00			`if (num_dimensions == 2)`
			`{`
			`for (size_t i = 0; i < input_rows_count; ++i)`
			`{`
			`vec_res[i] = FunctionHilbertEncode2DWIthLookupTableImpl<3>::encode(`
			`expand(col0->getUInt(i), 0),`
			`expand(col1->getUInt(i), 1));`
			`}`
			`return col_res;`
			`}`

			`throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,`
			`"Illegal number of UInt arguments of function {}: should be not more than 2 dimensions",`
			`getName());`
			`}`
			`};`


initial file of hilbertEncode + separate common functions code 2024-02-18 23:07:39 +00:00			`REGISTER_FUNCTION(HilbertEncode)`
			`{`
			`factory.registerFunction<FunctionHilbertEncode>(FunctionDocumentation{`
add hilbert decode 2024-02-23 21:18:05 +00:00			`.description=R"(`
docs added 2024-05-28 18:25:37 +00:00			`Calculates code for Hilbert Curve for a list of unsigned integers.`
initial file of hilbertEncode + separate common functions code 2024-02-18 23:07:39 +00:00
refactoring + ut + description + ratio 2024-02-22 16:08:13 +00:00			`The function has two modes of operation:`
			`- Simple`
			`- Expanded`

			`Simple: accepts up to 2 unsigned integers as arguments and produces a UInt64 code.`
			`[example:simple]`
review 2024-05-28 17:59:06 +00:00			Produces: `31`
refactoring + ut + description + ratio 2024-02-22 16:08:13 +00:00
			`Expanded: accepts a range mask (tuple) as a first argument and up to 2 unsigned integers as other arguments.`
review 2024-05-28 17:59:06 +00:00			`Each number in the mask configures the number of bits by which the corresponding argument will be shifted left, effectively scaling the argument within its range.`
refactoring + ut + description + ratio 2024-02-22 16:08:13 +00:00			`[example:range_expanded]`
review 2024-05-28 17:59:06 +00:00			Produces: `4031541586602`
refactoring + ut + description + ratio 2024-02-22 16:08:13 +00:00			`Note: tuple size must be equal to the number of the other arguments`

			`Range expansion can be beneficial when you need a similar distribution for arguments with wildly different ranges (or cardinality)`
			`For example: 'IP Address' (0...FFFFFFFF) and 'Country code' (0...FF)`

review 2024-05-28 17:59:06 +00:00			`For a single argument without a tuple, the function returns the argument itself as the Hilbert index, since no dimensional mapping is needed.`
refactoring + ut + description + ratio 2024-02-22 16:08:13 +00:00			`[example:identity]`
			Produces: `1`

review 2024-05-28 17:59:06 +00:00			`If a single argument is provided with a tuple specifying bit shifts, the function shifts the argument left by the specified number of bits.`
refactoring + ut + description + ratio 2024-02-22 16:08:13 +00:00			`[example:identity_expanded]`
			Produces: `512`

			`The function also accepts columns as arguments:`
			`[example:from_table]`

			`But the range tuple must still be a constant:`
			`[example:from_table_range]`

review 2024-05-28 17:59:06 +00:00			`Please note that you can fit only so much bits of information into Hilbert code as UInt64 has.`
refactoring + ut + description + ratio 2024-02-22 16:08:13 +00:00			`Two arguments will have a range of maximum 2^32 (64/2) each`
			`All overflow will be clamped to zero`
initial file of hilbertEncode + separate common functions code 2024-02-18 23:07:39 +00:00			`)",`
add hilbert decode 2024-02-23 21:18:05 +00:00			`.examples{`
review 2024-05-28 17:59:06 +00:00			`{"simple", "SELECT hilbertEncode(3, 4)", ""},`
			`{"range_expanded", "SELECT hilbertEncode((10,6), 1024, 16)", ""},`
add hilbert decode 2024-02-23 21:18:05 +00:00			`{"identity", "SELECT hilbertEncode(1)", ""},`
			`{"identity_expanded", "SELECT hilbertEncode(tuple(2), 128)", ""},`
			`{"from_table", "SELECT hilbertEncode(n1, n2) FROM table", ""},`
			`{"from_table_range", "SELECT hilbertEncode((1,2), n1, n2) FROM table", ""},`
			`},`
			`.categories {"Hilbert coding", "Hilbert Curve"}`
initial file of hilbertEncode + separate common functions code 2024-02-18 23:07:39 +00:00			`});`
			`}`

			`}`