mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 23:52:03 +00:00
Add support for FixedString input to base64 functions
This commit is contained in:
parent
8ec4c77352
commit
14141c9071
@ -571,13 +571,13 @@ Similar to base58Decode, but returns an empty string in case of error.
|
||||
|
||||
## base64Encode(s)
|
||||
|
||||
Encodes ‘s’ string into base64
|
||||
Encodes ‘s’ FixedString or String into base64.
|
||||
|
||||
Alias: `TO_BASE64`.
|
||||
|
||||
## base64Decode(s)
|
||||
|
||||
Decode base64-encoded string ‘s’ into original string. In case of failure raises an exception.
|
||||
Decode base64-encoded FixedString or String ‘s’ into original string. In case of failure raises an exception.
|
||||
|
||||
Alias: `FROM_BASE64`.
|
||||
|
||||
|
@ -2,21 +2,19 @@
|
||||
#include "config.h"
|
||||
|
||||
#if USE_BASE64
|
||||
# include <Columns/ColumnConst.h>
|
||||
# include <Common/MemorySanitizer.h>
|
||||
# include <Columns/ColumnFixedString.h>
|
||||
# include <Columns/ColumnString.h>
|
||||
# include <DataTypes/DataTypeString.h>
|
||||
# include <Functions/FunctionFactory.h>
|
||||
# include <Functions/FunctionHelpers.h>
|
||||
# include <Functions/GatherUtils/Algorithms.h>
|
||||
# include <IO/WriteHelpers.h>
|
||||
# include <Functions/IFunction.h>
|
||||
# include <Interpreters/Context_fwd.h>
|
||||
# include <turbob64.h>
|
||||
# include <Common/MemorySanitizer.h>
|
||||
|
||||
# include <span>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
using namespace GatherUtils;
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
@ -25,33 +23,86 @@ namespace ErrorCodes
|
||||
extern const int INCORRECT_DATA;
|
||||
}
|
||||
|
||||
namespace Detail
|
||||
{
|
||||
inline size_t base64Decode(const std::span<const UInt8> src, UInt8 * dst)
|
||||
{
|
||||
# if defined(__aarch64__)
|
||||
return tb64sdec(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
|
||||
# else
|
||||
return _tb64d(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
|
||||
# endif
|
||||
}
|
||||
}
|
||||
|
||||
struct Base64Encode
|
||||
{
|
||||
static constexpr auto name = "base64Encode";
|
||||
static size_t getBufferSize(size_t string_length, size_t string_count)
|
||||
|
||||
static size_t getBufferSize(const size_t string_length, const size_t string_count)
|
||||
{
|
||||
return ((string_length - string_count) / 3 + string_count) * 4 + string_count;
|
||||
}
|
||||
|
||||
static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
|
||||
{
|
||||
/*
|
||||
* Some bug in sse arm64 implementation?
|
||||
* `base64Encode(repeat('a', 46))` returns wrong padding character
|
||||
*/
|
||||
# if defined(__aarch64__)
|
||||
return tb64senc(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
|
||||
# else
|
||||
return _tb64e(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
|
||||
# endif
|
||||
}
|
||||
};
|
||||
|
||||
struct Base64Decode
|
||||
{
|
||||
static constexpr auto name = "base64Decode";
|
||||
|
||||
static size_t getBufferSize(size_t string_length, size_t string_count)
|
||||
static size_t getBufferSize(const size_t string_length, const size_t string_count)
|
||||
{
|
||||
return ((string_length - string_count) / 4 + string_count) * 3 + string_count;
|
||||
}
|
||||
|
||||
static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
|
||||
{
|
||||
const auto outlen = Detail::base64Decode(src, dst);
|
||||
if (src.size() > 0 && !outlen)
|
||||
throw Exception(
|
||||
ErrorCodes::INCORRECT_DATA,
|
||||
"Failed to {} input '{}'",
|
||||
name,
|
||||
String(reinterpret_cast<const char *>(src.data()), src.size()));
|
||||
|
||||
return outlen;
|
||||
}
|
||||
};
|
||||
|
||||
struct TryBase64Decode
|
||||
{
|
||||
static constexpr auto name = "tryBase64Decode";
|
||||
|
||||
static size_t getBufferSize(size_t string_length, size_t string_count)
|
||||
static size_t getBufferSize(const size_t string_length, const size_t string_count)
|
||||
{
|
||||
return Base64Decode::getBufferSize(string_length, string_count);
|
||||
}
|
||||
|
||||
static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
|
||||
{
|
||||
if (src.empty())
|
||||
return 0;
|
||||
|
||||
const auto outlen = Detail::base64Decode(src, dst);
|
||||
// during decoding character array can be partially polluted
|
||||
// if fail, revert back and clean
|
||||
if (!outlen)
|
||||
*dst = 0;
|
||||
|
||||
return outlen;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Func>
|
||||
@ -71,99 +122,60 @@ public:
|
||||
if (arguments.size() != 1)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong number of arguments for function {}: 1 expected.", getName());
|
||||
|
||||
if (!WhichDataType(arguments[0].type).isString())
|
||||
if (!WhichDataType(arguments[0].type).isStringOrFixedString())
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of 1st argument of function {}. Must be String.",
|
||||
arguments[0].type->getName(), getName());
|
||||
"Illegal type {} of 1st argument of function {}. Must be FixedString or String.",
|
||||
arguments[0].type->getName(),
|
||||
getName());
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const override
|
||||
{
|
||||
const ColumnPtr column_string = arguments[0].column;
|
||||
const ColumnString * input = checkAndGetColumn<ColumnString>(column_string.get());
|
||||
const auto & input_column = arguments[0].column;
|
||||
if (const auto * src_column_as_fixed_string = checkAndGetColumn<ColumnFixedString>(*input_column))
|
||||
return execute(*src_column_as_fixed_string, input_rows_count);
|
||||
else if (const auto * src_column_as_string = checkAndGetColumn<ColumnString>(*input_column))
|
||||
return execute(*src_column_as_string, input_rows_count);
|
||||
|
||||
if (!input)
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Illegal column {} of first argument of function {}, must be of type String",
|
||||
arguments[0].column->getName(), getName());
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Illegal column {} of first argument of function {}, must be of type FixedString or String.",
|
||||
input_column->getName(),
|
||||
getName());
|
||||
}
|
||||
|
||||
private:
|
||||
static ColumnPtr execute(const ColumnString & src_column, const size_t src_row_count)
|
||||
{
|
||||
auto dst_column = ColumnString::create();
|
||||
auto & dst_data = dst_column->getChars();
|
||||
auto & dst_chars = dst_column->getChars();
|
||||
auto & dst_offsets = dst_column->getOffsets();
|
||||
|
||||
size_t reserve = Func::getBufferSize(input->getChars().size(), input->size());
|
||||
dst_data.resize(reserve);
|
||||
dst_offsets.resize(input_rows_count);
|
||||
const auto reserve = Func::getBufferSize(src_column.byteSize(), src_column.size());
|
||||
dst_chars.resize(reserve);
|
||||
dst_offsets.resize(src_row_count);
|
||||
|
||||
const ColumnString::Offsets & src_offsets = input->getOffsets();
|
||||
const auto & src_chars = src_column.getChars();
|
||||
const auto & src_offsets = src_column.getOffsets();
|
||||
|
||||
const auto * source = input->getChars().data();
|
||||
auto * dst = dst_data.data();
|
||||
auto * dst = dst_chars.data();
|
||||
auto * dst_pos = dst;
|
||||
const auto * src = src_chars.data();
|
||||
|
||||
size_t src_offset_prev = 0;
|
||||
|
||||
for (size_t row = 0; row < input_rows_count; ++row)
|
||||
for (size_t row = 0; row < src_row_count; ++row)
|
||||
{
|
||||
size_t srclen = src_offsets[row] - src_offset_prev - 1;
|
||||
size_t outlen = 0;
|
||||
|
||||
if constexpr (std::is_same_v<Func, Base64Encode>)
|
||||
{
|
||||
/*
|
||||
* Some bug in sse arm64 implementation?
|
||||
* `base64Encode(repeat('a', 46))` returns wrong padding character
|
||||
*/
|
||||
#if defined(__aarch64__)
|
||||
outlen = tb64senc(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
|
||||
#else
|
||||
outlen = _tb64e(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
|
||||
#endif
|
||||
}
|
||||
else if constexpr (std::is_same_v<Func, Base64Decode>)
|
||||
{
|
||||
if (srclen > 0)
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
outlen = tb64sdec(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
|
||||
#else
|
||||
outlen = _tb64d(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
|
||||
#endif
|
||||
|
||||
if (!outlen)
|
||||
throw Exception(
|
||||
ErrorCodes::INCORRECT_DATA,
|
||||
"Failed to {} input '{}'",
|
||||
getName(), String(reinterpret_cast<const char *>(source), srclen));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (srclen > 0)
|
||||
{
|
||||
// during decoding character array can be partially polluted
|
||||
// if fail, revert back and clean
|
||||
auto * savepoint = dst_pos;
|
||||
outlen = _tb64d(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
|
||||
if (!outlen)
|
||||
{
|
||||
outlen = 0;
|
||||
dst_pos = savepoint; //-V1048
|
||||
// clean the symbol
|
||||
dst_pos[0] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
const size_t src_length = src_offsets[row] - src_offset_prev - 1;
|
||||
const auto outlen = Func::performCoding({src, src_length}, dst_pos);
|
||||
|
||||
/// Base64 library is using AVX-512 with some shuffle operations.
|
||||
/// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle.
|
||||
__msan_unpoison(dst_pos, outlen);
|
||||
|
||||
source += srclen + 1;
|
||||
src += src_length + 1;
|
||||
dst_pos += outlen;
|
||||
*dst_pos = '\0';
|
||||
dst_pos += 1;
|
||||
@ -172,8 +184,44 @@ public:
|
||||
src_offset_prev = src_offsets[row];
|
||||
}
|
||||
|
||||
dst_data.resize(dst_pos - dst);
|
||||
dst_chars.resize(dst_pos - dst);
|
||||
return dst_column;
|
||||
}
|
||||
|
||||
static ColumnPtr execute(const ColumnFixedString & src_column, const size_t src_row_count)
|
||||
{
|
||||
auto dst_column = ColumnString::create();
|
||||
auto & dst_chars = dst_column->getChars();
|
||||
auto & dst_offsets = dst_column->getOffsets();
|
||||
|
||||
const auto reserve = Func::getBufferSize(src_column.byteSize(), src_column.size());
|
||||
dst_chars.resize(reserve);
|
||||
dst_offsets.resize(src_row_count);
|
||||
|
||||
const auto & src_chars = src_column.getChars();
|
||||
const auto & src_n = src_column.getN();
|
||||
|
||||
auto * dst = dst_chars.data();
|
||||
auto * dst_pos = dst;
|
||||
const auto * src = src_chars.data();
|
||||
|
||||
for (size_t row = 0; row < src_row_count; ++row)
|
||||
{
|
||||
const auto outlen = Func::performCoding({src, src_n}, dst_pos);
|
||||
|
||||
/// Base64 library is using AVX-512 with some shuffle operations.
|
||||
/// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle.
|
||||
__msan_unpoison(dst_pos, outlen);
|
||||
|
||||
src += src_n;
|
||||
dst_pos += outlen;
|
||||
*dst_pos = '\0';
|
||||
dst_pos += 1;
|
||||
|
||||
dst_offsets[row] = dst_pos - dst;
|
||||
}
|
||||
|
||||
dst_chars.resize(dst_pos - dst);
|
||||
return dst_column;
|
||||
}
|
||||
};
|
||||
|
@ -1,8 +1,7 @@
|
||||
#include <Functions/FunctionBase64Conversion.h>
|
||||
|
||||
#if USE_BASE64
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -15,4 +14,5 @@ REGISTER_FUNCTION(Base64Decode)
|
||||
factory.registerAlias("FROM_BASE64", "base64Decode", FunctionFactory::CaseInsensitive);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -1,10 +1,7 @@
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionBase64Conversion.h>
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#if USE_BASE64
|
||||
# include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -17,4 +14,5 @@ REGISTER_FUNCTION(Base64Encode)
|
||||
factory.registerAlias("TO_BASE64", "base64Encode", FunctionFactory::CaseInsensitive);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include <Functions/FunctionBase64Conversion.h>
|
||||
|
||||
#if USE_BASE64
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -10,4 +10,5 @@ REGISTER_FUNCTION(TryBase64Decode)
|
||||
factory.registerFunction<FunctionBase64Conversion<TryBase64Decode>>();
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user