ClickHouse/src/Functions/FunctionBase58Conversion.h

#pragma once
#include "config_functions.h"

#if USE_BASEX
#    include <Columns/ColumnConst.h>
#    include <Common/MemorySanitizer.h>
#    include <Columns/ColumnString.h>
#    include <DataTypes/DataTypeString.h>
#    include <Functions/FunctionFactory.h>
#    include <Functions/FunctionHelpers.h>
#    include <IO/WriteHelpers.h>
#    include <base_x.hh>


namespace DB
{

/// Error codes used by the Base58 functions in this header.
/// They are only declared here (extern); the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int BAD_ARGUMENTS;
}

/// Row-wise Base58 encoding of a String column (backs the `base58Encode` function).
struct Base58Encode
{
    static constexpr auto name = "base58Encode";

    /// Encodes every row of `input` and writes the results into `dst_column`.
    ///
    /// `input`            - source strings; each row is followed by one terminating zero byte.
    /// `dst_column`       - destination; its chars and offsets are resized and filled from the beginning.
    /// `alphabet`         - "bitcoin", "flickr" or "ripple"; any other value (the caller passes "gmp")
    ///                      selects Base58::base58().
    /// `input_rows_count` - number of rows to process.
    ///
    /// Throws Exception(BAD_ARGUMENTS) when the encoder reports std::invalid_argument or std::domain_error.
    static void process(const ColumnString & input, ColumnString::MutablePtr & dst_column, const std::string & alphabet, size_t input_rows_count)
    {
        auto & dst_data = dst_column->getChars();
        auto & dst_offsets = dst_column->getOffsets();

        /// Wikipedia states Base58 has efficiency of 73%, and we take 1.5 scale to avoid reallocation in most cases
        size_t current_allocated_size = ceil(1.5 * input.getChars().size());

        dst_data.resize(current_allocated_size);
        dst_offsets.resize(input_rows_count);

        const ColumnString::Offsets & src_offsets = input.getOffsets();
        const auto * source = input.getChars().raw_data();

        size_t src_offset_prev = 0;
        /// Number of bytes already written to dst_data (terminating zeros included).
        /// An offset is tracked instead of a raw pointer, so nothing has to be re-based after a resize.
        size_t written = 0;

        const auto & encoder = (alphabet == "bitcoin") ? Base58::bitcoin() :
                              ((alphabet == "flickr") ? Base58::flickr() :
                              ((alphabet == "ripple") ? Base58::ripple() :
                                                        Base58::base58())); /// GMP

        /// The size of an encoded row is not known beforehand (it's not byte-to-byte encoding),
        /// so every row goes through this reusable buffer and is then copied to the column.
        std::string encoded;
        for (size_t row = 0; row < input_rows_count; ++row)
        {
            /// Row length without its terminating zero byte.
            const size_t srclen = src_offsets[row] - src_offset_prev - 1;

            encoded.clear();
            /// Empty rows are not handed to the encoder; they simply produce an empty result.
            if (srclen)
            {
                try
                {
                    encoder.encode(encoded, source, srclen);
                }
                catch (const std::invalid_argument & e)
                {
                    throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);
                }
                catch (const std::domain_error & e)
                {
                    throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);
                }
            }

            /// Result bytes plus the terminating zero that is copied along with them.
            const size_t outlen = encoded.size() + 1;
            const size_t required_size = written + outlen;

            if (required_size > current_allocated_size)
            {
                /// Grow exponentially, but never to less than what this row needs:
                /// a single doubling is not guaranteed to be enough for an arbitrary result size.
                size_t grown_size = current_allocated_size + current_allocated_size;
                if (grown_size < required_size)
                    grown_size = required_size;

                current_allocated_size = grown_size;
                dst_data.resize(current_allocated_size);
            }

            /// c_str() is zero-terminated, so the terminator is copied together with the payload.
            std::memcpy(dst_data.data() + written, encoded.c_str(), outlen);

            source += srclen + 1;
            written += outlen;

            dst_offsets[row] = written;
            src_offset_prev = src_offsets[row];
        }

        /// Drop the unused tail of the over-allocated buffer.
        dst_data.resize(written);
    }
};

/// Row-wise Base58 decoding of a String column (backs the `base58Decode` function).
struct Base58Decode
{
    static constexpr auto name = "base58Decode";

    /// Decodes every row of `input` and writes the results into `dst_column`.
    ///
    /// `input`            - source strings; each row is followed by one terminating zero byte.
    /// `dst_column`       - destination; its chars and offsets are resized and filled from the beginning.
    /// `alphabet`         - "bitcoin", "flickr" or "ripple"; any other value (the caller passes "gmp")
    ///                      selects Base58::base58().
    /// `input_rows_count` - number of rows to process.
    ///
    /// Throws Exception(BAD_ARGUMENTS) when the decoder reports std::invalid_argument or std::domain_error.
    static void process(const ColumnString & input, ColumnString::MutablePtr & dst_column, const std::string & alphabet, size_t input_rows_count)
    {
        auto & dst_data = dst_column->getChars();
        auto & dst_offsets = dst_column->getOffsets();

        /// We allocate probably even more than needed to avoid many resizes
        size_t current_allocated_size = input.getChars().size();

        dst_data.resize(current_allocated_size);
        dst_offsets.resize(input_rows_count);

        const ColumnString::Offsets & src_offsets = input.getOffsets();
        const auto * source = input.getChars().raw_data();

        size_t src_offset_prev = 0;
        /// Number of bytes already written to dst_data (terminating zeros included).
        /// An offset is tracked instead of a raw pointer, so nothing has to be re-based after a resize.
        size_t written = 0;

        const auto & decoder = (alphabet == "bitcoin") ? Base58::bitcoin() :
                              ((alphabet == "flickr") ? Base58::flickr() :
                              ((alphabet == "ripple") ? Base58::ripple() :
                                                        Base58::base58())); /// GMP

        /// The size of a decoded row is not known beforehand (it's not byte-to-byte encoding),
        /// so every row goes through this reusable buffer and is then copied to the column.
        std::string decoded;
        for (size_t row = 0; row < input_rows_count; ++row)
        {
            /// Row length without its terminating zero byte.
            const size_t srclen = src_offsets[row] - src_offset_prev - 1;

            decoded.clear();
            /// Empty rows are not handed to the decoder; they simply produce an empty result.
            if (srclen)
            {
                try
                {
                    decoder.decode(decoded, source, srclen);
                }
                catch (const std::invalid_argument & e)
                {
                    throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);
                }
                catch (const std::domain_error & e)
                {
                    throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);
                }
            }

            /// Result bytes plus the terminating zero that is copied along with them.
            const size_t outlen = decoded.size() + 1;
            const size_t required_size = written + outlen;

            if (required_size > current_allocated_size)
            {
                /// Grow exponentially, but never to less than what this row needs:
                /// a single doubling is not guaranteed to be enough for an arbitrary result size.
                size_t grown_size = current_allocated_size + current_allocated_size;
                if (grown_size < required_size)
                    grown_size = required_size;

                current_allocated_size = grown_size;
                dst_data.resize(current_allocated_size);
            }

            /// c_str() is zero-terminated, so the terminator is copied together with the payload.
            std::memcpy(dst_data.data() + written, decoded.c_str(), outlen);

            source += srclen + 1;
            written += outlen;

            dst_offsets[row] = written;
            src_offset_prev = src_offsets[row];
        }

        /// Drop the unused tail of the over-allocated buffer.
        dst_data.resize(written);
    }
};

/// SQL function wrapper around a Base58 operation.
/// `Func` provides the function name (`Func::name`) and the column routine (`Func::process`).
/// Accepted call forms: f(String) and f(String, constant String alphabet).
template <typename Func>
class FunctionBase58Conversion : public IFunction
{
public:
    static constexpr auto name = Func::name;

    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBase58Conversion>(); }

    String getName() const override { return Func::name; }

    /// Declared variadic; the real argument count (1 or 2) is checked in getReturnTypeImpl.
    bool isVariadic() const override { return true; }
    size_t getNumberOfArguments() const override { return 0; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    bool useDefaultImplementationForConstants() const override { return true; }

    /// The optional alphabet (argument index 1) is reported as always constant.
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }

    /// Validates the number and types of the arguments; the result type is always String.
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        const size_t arguments_count = arguments.size();

        if (arguments_count < 1 || arguments_count > 2)
        {
            throw Exception(
                "Wrong number of arguments for function " + getName() + ":  1 or 2 expected.",
                ErrorCodes::BAD_ARGUMENTS);
        }

        const auto & data_type = arguments[0].type;
        if (!isString(data_type))
        {
            throw Exception(
                "Illegal type " + data_type->getName() + " of 1st argument of function " + getName() + ". Must be String.",
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
        }

        if (arguments_count == 2)
        {
            const auto & alphabet_type = arguments[1].type;
            if (!isString(alphabet_type))
            {
                throw Exception(
                    "Illegal type " + alphabet_type->getName() + " of 2nd argument of function " + getName() + ". Must be String.",
                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
            }
        }

        return std::make_shared<DataTypeString>();
    }

    /// Resolves the alphabet (default "bitcoin") and delegates the row processing to Func::process.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        const auto * input = checkAndGetColumn<ColumnString>(arguments[0].column.get());
        if (input == nullptr)
        {
            throw Exception(
                "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName() + ", must be String",
                ErrorCodes::ILLEGAL_COLUMN);
        }

        std::string alphabet = "bitcoin";

        if (arguments.size() == 2)
        {
            const auto * alphabet_column = checkAndGetColumn<ColumnConst>(arguments[1].column.get());
            if (alphabet_column == nullptr)
                throw Exception("Second argument for function " + getName() + " must be constant String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

            alphabet = alphabet_column->getValue<DB::String>();

            /// Only these four alphabet names are accepted.
            const bool is_known_alphabet
                = alphabet == "bitcoin" || alphabet == "ripple" || alphabet == "flickr" || alphabet == "gmp";
            if (!is_known_alphabet)
                throw Exception("Second argument for function " + getName() + " must be 'bitcoin', 'ripple', 'gmp' or 'flickr'", ErrorCodes::ILLEGAL_COLUMN);
        }

        auto dst_column = ColumnString::create();
        Func::process(*input, dst_column, alphabet, input_rows_count);
        return dst_column;
    }
};
}

#endif
initial setup 2022-06-15 06:49:55 +00:00			`#pragma once`
			`#include "config_functions.h"`

rename variable + fix handling of ENABLE_LIBRARIES 2022-06-20 18:53:47 +00:00			`#if USE_BASEX`
initial setup 2022-06-15 06:49:55 +00:00			`# include <Columns/ColumnConst.h>`
			`# include <Common/MemorySanitizer.h>`
			`# include <Columns/ColumnString.h>`
			`# include <DataTypes/DataTypeString.h>`
			`# include <Functions/FunctionFactory.h>`
			`# include <Functions/FunctionHelpers.h>`
			`# include <IO/WriteHelpers.h>`
			`# include <base_x.hh>`


			`namespace DB`
			`{`

			`namespace ErrorCodes`
			`{`
			`extern const int ILLEGAL_COLUMN;`
			`extern const int ILLEGAL_TYPE_OF_ARGUMENT;`
			`extern const int BAD_ARGUMENTS;`
			`}`

			`struct Base58Encode`
			`{`
			`static constexpr auto name = "base58Encode";`

small fixes 2022-06-20 14:30:54 +00:00			`static void process(const ColumnString & input, ColumnString::MutablePtr & dst_column, const std::string & alphabet, size_t input_rows_count)`
initial setup 2022-06-15 06:49:55 +00:00			`{`
wip upload 2022-06-16 10:11:41 +00:00			`auto & dst_data = dst_column->getChars();`
			`auto & dst_offsets = dst_column->getOffsets();`

small fixes 2022-06-20 14:30:54 +00:00			`/// Wikipedia states Base58 has efficiency of 73%, and we take 1.5 scale to avoid reallocation in most cases`
			`size_t current_allocated_size = ceil(1.5 * input.getChars().size());`
wip upload 2022-06-16 10:11:41 +00:00
			`dst_data.resize(current_allocated_size);`
			`dst_offsets.resize(input_rows_count);`

Added tests + minor updates 2022-06-19 18:10:28 +00:00			`const ColumnString::Offsets & src_offsets = input.getOffsets();`
wip upload 2022-06-16 10:11:41 +00:00
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`const auto * source = input.getChars().raw_data();`
wip upload 2022-06-16 10:11:41 +00:00			`auto * dst = dst_data.data();`
			`auto * dst_pos = dst;`

			`size_t src_offset_prev = 0;`
			`size_t processed_size = 0;`

			`const auto& encoder = (alphabet == "bitcoin") ? Base58::bitcoin() :`
			`((alphabet == "flickr") ? Base58::flickr() :`
remove wrong code 2022-06-16 20:52:45 +00:00			`((alphabet == "ripple") ? Base58::ripple() :`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`Base58::base58())); //GMP`
initial setup 2022-06-15 06:49:55 +00:00
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`std::string encoded;`
wip upload 2022-06-16 10:11:41 +00:00			`for (size_t row = 0; row < input_rows_count; ++row)`
			`{`
			`size_t srclen = src_offsets[row] - src_offset_prev - 1;`
remove wrong code 2022-06-16 20:52:45 +00:00			`/// Why we didn't use char* here?`
wip upload 2022-06-16 10:11:41 +00:00			`/// We don't know the size of the result string beforehand (it's not byte-to-byte encoding),`
			`/// so we may need to do many resizes (the worst case -- we'll do it for each row)`
remove wrong code 2022-06-16 20:52:45 +00:00			`/// This way we do exponential resizes and one final resize after whole operation is complete`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`encoded.clear();`
fix zero byte 2022-06-17 18:48:14 +00:00			`if (srclen)`
updated exception handling 2022-06-23 19:36:57 +00:00			`try`
			`{`
			`encoder.encode(encoded, source, srclen);`
			`}`
			`catch (const std::invalid_argument& e)`
			`{`
			`throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);`
			`}`
			`catch (const std::domain_error& e)`
			`{`
			`throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);`
			`}`
wip upload 2022-06-16 10:11:41 +00:00			`size_t outlen = encoded.size();`

			`if (processed_size + outlen >= current_allocated_size)`
			`{`
			`current_allocated_size += current_allocated_size;`
			`dst_data.resize(current_allocated_size);`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`auto processed_offset = dst_pos - dst;`
			`dst = dst_data.data();`
			`dst_pos = dst;`
			`dst_pos += processed_offset;`
wip upload 2022-06-16 10:11:41 +00:00			`}`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`std::memcpy(dst_pos, encoded.c_str(), ++outlen);`
wip upload 2022-06-16 10:11:41 +00:00
			`source += srclen + 1;`
fix zero byte 2022-06-17 18:48:14 +00:00			`dst_pos += outlen;`
wip upload 2022-06-16 10:11:41 +00:00
			`dst_offsets[row] = dst_pos - dst;`
			`src_offset_prev = src_offsets[row];`
			`processed_size += outlen;`
			`}`

			`dst_data.resize(dst_pos - dst);`
initial setup 2022-06-15 06:49:55 +00:00			`}`
			`};`

			`struct Base58Decode`
			`{`
			`static constexpr auto name = "base58Decode";`

small fixes 2022-06-20 14:30:54 +00:00			`static void process(const ColumnString & input, ColumnString::MutablePtr & dst_column, const std::string & alphabet, size_t input_rows_count)`
initial setup 2022-06-15 06:49:55 +00:00			`{`
remove wrong code 2022-06-16 20:52:45 +00:00			`auto & dst_data = dst_column->getChars();`
			`auto & dst_offsets = dst_column->getOffsets();`
initial setup 2022-06-15 06:49:55 +00:00
small fixes 2022-06-20 14:30:54 +00:00			`/// We allocate probably even more then needed to avoid many resizes`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`size_t current_allocated_size = input.getChars().size();`
initial setup 2022-06-15 06:49:55 +00:00
remove wrong code 2022-06-16 20:52:45 +00:00			`dst_data.resize(current_allocated_size);`
			`dst_offsets.resize(input_rows_count);`

Added tests + minor updates 2022-06-19 18:10:28 +00:00			`const ColumnString::Offsets & src_offsets = input.getOffsets();`
remove wrong code 2022-06-16 20:52:45 +00:00
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`const auto * source = input.getChars().raw_data();`
remove wrong code 2022-06-16 20:52:45 +00:00			`auto * dst = dst_data.data();`
			`auto * dst_pos = dst;`

			`size_t src_offset_prev = 0;`
			`size_t processed_size = 0;`

			`const auto& decoder = (alphabet == "bitcoin") ? Base58::bitcoin() :`
			`((alphabet == "flickr") ? Base58::flickr() :`
			`((alphabet == "ripple") ? Base58::ripple() :`
			`Base58::base58()));`

Added tests + minor updates 2022-06-19 18:10:28 +00:00			`std::string decoded;`
remove wrong code 2022-06-16 20:52:45 +00:00			`for (size_t row = 0; row < input_rows_count; ++row)`
			`{`
			`size_t srclen = src_offsets[row] - src_offset_prev - 1;`
			`/// Why we didn't use char* here?`
			`/// We don't know the size of the result string beforehand (it's not byte-to-byte encoding),`
			`/// so we may need to do many resizes (the worst case -- we'll do it for each row)`
			`/// This way we do exponential resizes and one final resize after whole operation is complete`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`decoded.clear();`
			`if (srclen)`
updated exception handling 2022-06-23 19:36:57 +00:00			`try`
			`{`
			`decoder.decode(decoded, source, srclen);`
			`}`
			`catch (const std::invalid_argument& e)`
			`{`
			`throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);`
			`}`
			`catch (const std::domain_error& e)`
			`{`
			`throw Exception(e.what(), ErrorCodes::BAD_ARGUMENTS);`
			`}`
remove wrong code 2022-06-16 20:52:45 +00:00			`size_t outlen = decoded.size();`

			`if (processed_size + outlen >= current_allocated_size)`
			`{`
			`current_allocated_size += current_allocated_size;`
			`dst_data.resize(current_allocated_size);`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`auto processed_offset = dst_pos - dst;`
			`dst = dst_data.data();`
			`dst_pos = dst;`
			`dst_pos += processed_offset;`
remove wrong code 2022-06-16 20:52:45 +00:00			`}`
Added tests + minor updates 2022-06-19 18:10:28 +00:00			`std::memcpy(dst_pos, decoded.c_str(), ++outlen);`
remove wrong code 2022-06-16 20:52:45 +00:00
			`source += srclen + 1;`
fix zero byte 2022-06-17 18:48:14 +00:00			`dst_pos += outlen;`
remove wrong code 2022-06-16 20:52:45 +00:00
			`dst_offsets[row] = dst_pos - dst;`
			`src_offset_prev = src_offsets[row];`
			`processed_size += outlen;`
			`}`

			`dst_data.resize(dst_pos - dst);`
initial setup 2022-06-15 06:49:55 +00:00			`}`
			`};`

			`template <typename Func>`
			`class FunctionBase58Conversion : public IFunction`
			`{`
			`public:`
			`static constexpr auto name = Func::name;`

			`static FunctionPtr create(ContextPtr)`
			`{`
			`return std::make_shared<FunctionBase58Conversion>();`
			`}`

			`String getName() const override`
			`{`
			`return Func::name;`
			`}`

			`bool isVariadic() const override { return true; }`

			`size_t getNumberOfArguments() const override { return 0; }`

			`bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /arguments/) const override { return true; }`

			`bool useDefaultImplementationForConstants() const override { return true; }`

			`ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }`

			`DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override`
			`{`
remove wrong code 2022-06-16 20:52:45 +00:00			`if (arguments.size() != 1 && arguments.size() != 2)`
initial setup 2022-06-15 06:49:55 +00:00			`throw Exception(`
remove wrong code 2022-06-16 20:52:45 +00:00			`"Wrong number of arguments for function " + getName() + ": 1 or 2 expected.",`
initial setup 2022-06-15 06:49:55 +00:00			`ErrorCodes::BAD_ARGUMENTS);`

			`if (!isString(arguments[0].type))`
			`throw Exception(`
			`"Illegal type " + arguments[0].type->getName() + " of 1st argument of function " + getName() + ". Must be String.",`
			`ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);`

fix zero byte 2022-06-17 18:48:14 +00:00			`if (arguments.size() == 2 && !isString(arguments[1].type))`
initial setup 2022-06-15 06:49:55 +00:00			`throw Exception(`
			`"Illegal type " + arguments[1].type->getName() + " of 2nd argument of function " + getName() + ". Must be String.",`
			`ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);`

			`return std::make_shared<DataTypeString>();`
			`}`

			`ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override`
			`{`
			`const ColumnPtr column_string = arguments[0].column;`
			`const ColumnString * input = checkAndGetColumn<ColumnString>(column_string.get());`
			`if (!input)`
			`throw Exception(`
small fixes 2022-06-20 14:30:54 +00:00			`"Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName() + ", must be String",`
initial setup 2022-06-15 06:49:55 +00:00			`ErrorCodes::ILLEGAL_COLUMN);`

			`std::string alphabet = "bitcoin";`

			`if (arguments.size() == 2)`
			`{`
			`const auto * alphabet_column = checkAndGetColumn<ColumnConst>(arguments[1].column.get());`

			`if (!alphabet_column)`
			`throw Exception("Second argument for function " + getName() + " must be constant String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);`

small fixes 2022-06-20 14:30:54 +00:00			`alphabet = alphabet_column->getValue<DB::String>();`
			`if (alphabet != "bitcoin" && alphabet != "ripple" && alphabet != "flickr" && alphabet != "gmp")`
wip upload 2022-06-16 10:11:41 +00:00			`throw Exception("Second argument for function " + getName() + " must be 'bitcoin', 'ripple', 'gmp' or 'flickr'", ErrorCodes::ILLEGAL_COLUMN);`
initial setup 2022-06-15 06:49:55 +00:00
			`}`

			`auto dst_column = ColumnString::create();`

Added tests + minor updates 2022-06-19 18:10:28 +00:00			`Func::process(*input, dst_column, alphabet, input_rows_count);`
initial setup 2022-06-15 06:49:55 +00:00
			`return dst_column;`
			`}`
			`};`
			`}`

			`#endif`