ClickHouse/src/Functions/FunctionBase64Conversion.h

177 lines
5.4 KiB
C++
Raw Normal View History

2020-10-10 18:37:02 +00:00
#pragma once
#if !defined(ARCADIA_BUILD)
# include "config_functions.h"
#endif
2018-10-12 08:46:53 +00:00
#if USE_BASE64
# include <Columns/ColumnConst.h>
2020-09-19 19:43:26 +00:00
# include <Common/MemorySanitizer.h>
# include <Columns/ColumnString.h>
# include <DataTypes/DataTypeString.h>
# include <Functions/FunctionFactory.h>
# include <Functions/FunctionHelpers.h>
# include <Functions/GatherUtils/Algorithms.h>
# include <IO/WriteHelpers.h>
# include <turbob64.h>
2018-10-10 01:04:07 +00:00
namespace DB
{
using namespace GatherUtils;
/// Error codes thrown by the Base64 functions in this header.
/// They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
/// Thrown by executeImpl when the argument column is not a ColumnString.
extern const int ILLEGAL_COLUMN;
/// Thrown by getReturnTypeImpl when the argument type is not String.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
/// Thrown by base64Decode when a non-empty input cannot be decoded.
extern const int INCORRECT_DATA;
}
/// Traits of the `base64Encode` function: its name and an upper bound for the result buffer.
struct Base64Encode
{
    static constexpr auto name = "base64Encode";

    /** Upper bound, in bytes, of the buffer needed to hold all encoded strings of a column.
      *
      * @param string_length  size of the source character buffer. The caller passes the size of the
      *                       column's whole chars array, which holds one extra terminator byte per string.
      * @param string_count   number of strings in the column.
      * @return               number of bytes that is always sufficient for the encoded output.
      *
      * `string_length - string_count` is the payload without terminators. Base64 emits 4 characters
      * for every started group of 3 payload bytes: the integer division counts the complete groups
      * of the whole payload, and `+ string_count` accounts for at most one extra (partial) group per
      * string. The final `+ string_count` reserves the terminating zero written after every result.
      *
      * The caller must guarantee `string_length >= string_count`, otherwise the unsigned
      * subtraction wraps around.
      */
    static size_t getBufferSize(size_t string_length, size_t string_count)
    {
        return ((string_length - string_count) / 3 + string_count) * 4 + string_count;
    }
};
/// Traits of the `base64Decode` function: its name and an upper bound for the result buffer.
struct Base64Decode
{
    static constexpr auto name = "base64Decode";

    /** Upper bound, in bytes, of the buffer needed to hold all decoded strings of a column.
      *
      * @param string_length  size of the source character buffer. The caller passes the size of the
      *                       column's whole chars array, which holds one extra terminator byte per string.
      * @param string_count   number of strings in the column.
      * @return               number of bytes that is always sufficient for the decoded output.
      *
      * `string_length - string_count` is the payload without terminators. Every started group of
      * 4 Base64 characters decodes to at most 3 bytes: the integer division counts the complete
      * groups of the whole payload, and `+ string_count` accounts for at most one extra (partial)
      * group per string. The final `+ string_count` reserves the terminating zero written after
      * every result.
      *
      * The caller must guarantee `string_length >= string_count`, otherwise the unsigned
      * subtraction wraps around.
      */
    static size_t getBufferSize(size_t string_length, size_t string_count)
    {
        return ((string_length - string_count) / 4 + string_count) * 3 + string_count;
    }
};
2018-11-02 19:06:05 +00:00
/// Traits of the `tryBase64Decode` function: its name and an upper bound for the result buffer.
struct TryBase64Decode
{
    static constexpr auto name = "tryBase64Decode";

    /// Upper bound, in bytes, for the result buffer.
    /// Reuses the estimate of Base64Decode with the same arguments.
    static size_t getBufferSize(size_t string_length, size_t string_count)
    {
        const size_t decoded_upper_bound = Base64Decode::getBufferSize(string_length, string_count);
        return decoded_upper_bound;
    }
};
2018-10-10 01:04:07 +00:00
template <typename Func>
class FunctionBase64Conversion : public IFunction
{
public:
static constexpr auto name = Func::name;
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionBase64Conversion>();
}
String getName() const override
{
return Func::name;
}
size_t getNumberOfArguments() const override
{
return 1;
}
2018-10-11 16:22:50 +00:00
bool useDefaultImplementationForConstants() const override
{
return true;
}
2018-10-10 01:04:07 +00:00
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!WhichDataType(arguments[0].type).isString())
throw Exception(
"Illegal type " + arguments[0].type->getName() + " of 1 argument of function " + getName() + ". Must be String.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
2018-10-10 01:04:07 +00:00
{
2020-10-17 15:22:42 +00:00
const ColumnPtr column_string = arguments[0].column;
2018-10-10 01:04:07 +00:00
const ColumnString * input = checkAndGetColumn<ColumnString>(column_string.get());
if (!input)
throw Exception(
2020-10-17 15:22:42 +00:00
"Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(),
2018-10-10 01:04:07 +00:00
ErrorCodes::ILLEGAL_COLUMN);
auto dst_column = ColumnString::create();
auto & dst_data = dst_column->getChars();
auto & dst_offsets = dst_column->getOffsets();
2018-11-02 19:06:05 +00:00
size_t reserve = Func::getBufferSize(input->getChars().size(), input->size());
dst_data.resize(reserve);
dst_offsets.resize(input_rows_count);
2018-10-10 01:04:07 +00:00
const ColumnString::Offsets & src_offsets = input->getOffsets();
2020-10-17 15:22:42 +00:00
const auto * source = input->getChars().data();
auto * dst = dst_data.data();
auto * dst_pos = dst;
2018-10-10 01:04:07 +00:00
size_t src_offset_prev = 0;
for (size_t row = 0; row < input_rows_count; ++row)
{
size_t srclen = src_offsets[row] - src_offset_prev - 1;
size_t outlen = 0;
if constexpr (std::is_same_v<Func, Base64Encode>)
{
2020-02-22 08:44:23 +00:00
outlen = _tb64e(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
2018-10-10 01:04:07 +00:00
}
2018-11-02 19:06:05 +00:00
else if constexpr (std::is_same_v<Func, Base64Decode>)
2018-10-10 01:04:07 +00:00
{
2019-12-25 19:44:00 +00:00
if (srclen > 0)
2018-10-10 01:04:07 +00:00
{
2020-02-22 08:44:23 +00:00
outlen = _tb64d(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
2019-12-25 19:44:00 +00:00
if (!outlen)
throw Exception("Failed to " + getName() + " input '" + String(reinterpret_cast<const char *>(source), srclen) + "'", ErrorCodes::INCORRECT_DATA);
2018-10-10 01:04:07 +00:00
}
}
2018-11-02 19:06:05 +00:00
else
{
2019-12-25 19:44:00 +00:00
if (srclen > 0)
2018-11-02 19:06:05 +00:00
{
2019-12-25 19:44:00 +00:00
// during decoding character array can be partially polluted
// if fail, revert back and clean
2020-10-17 15:22:42 +00:00
auto * savepoint = dst_pos;
2020-02-22 08:44:23 +00:00
outlen = _tb64d(reinterpret_cast<const uint8_t *>(source), srclen, reinterpret_cast<uint8_t *>(dst_pos));
2019-12-25 19:44:00 +00:00
if (!outlen)
{
outlen = 0;
dst_pos = savepoint;
// clean the symbol
dst_pos[0] = 0;
}
2018-11-02 19:06:05 +00:00
}
}
2018-10-10 01:04:07 +00:00
2020-09-19 19:43:26 +00:00
/// Base64 library is using AVX-512 with some shuffle operations.
/// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle.
__msan_unpoison(dst_pos, outlen);
2018-10-10 01:04:07 +00:00
source += srclen + 1;
2020-12-17 02:20:56 +00:00
dst_pos += outlen;
*dst_pos = '\0';
dst_pos += 1;
2018-10-10 01:04:07 +00:00
dst_offsets[row] = dst_pos - dst;
src_offset_prev = src_offsets[row];
}
dst_data.resize(dst_pos - dst);
2018-10-10 01:04:07 +00:00
2020-10-17 15:22:42 +00:00
return dst_column;
2018-10-10 01:04:07 +00:00
}
};
}
2019-12-26 01:42:51 +00:00
2018-11-23 18:54:23 +00:00
#endif