2020-12-08 13:02:09 +00:00
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionStringToString.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/find_symbols.h>
|
2020-12-08 13:02:09 +00:00
|
|
|
|
2020-12-12 20:47:37 +00:00
|
|
|
|
2020-12-08 13:02:09 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
/// Error codes used in this file. Only declared here; the definition lives elsewhere in the project.
namespace ErrorCodes
{
    /// Thrown by vectorFixed() when the function is applied to a FixedString argument.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
2020-12-12 20:47:37 +00:00
|
|
|
/// Name tag passed to FunctionStringToString; `name` is the identifier the function is exposed under.
struct EncodeXMLComponentName
{
    static constexpr auto name = "encodeXMLComponent";
};
|
|
|
|
|
2020-12-12 20:47:37 +00:00
|
|
|
class FunctionEncodeXMLComponentImpl
|
2020-12-08 13:02:09 +00:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
static void vector(
|
|
|
|
const ColumnString::Chars & data,
|
|
|
|
const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
2020-12-12 00:09:11 +00:00
|
|
|
/// 6 is the maximum size amplification (the maximum length of encoded entity: ")
|
2020-12-09 08:55:03 +00:00
|
|
|
res_data.resize(data.size() * 6);
|
2020-12-08 13:02:09 +00:00
|
|
|
size_t size = offsets.size();
|
|
|
|
res_offsets.resize(size);
|
|
|
|
|
|
|
|
size_t prev_offset = 0;
|
|
|
|
size_t res_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
const char * src_data = reinterpret_cast<const char *>(&data[prev_offset]);
|
|
|
|
size_t src_size = offsets[i] - prev_offset;
|
|
|
|
size_t dst_size = execute(src_data, src_size, reinterpret_cast<char *>(res_data.data() + res_offset));
|
|
|
|
|
2020-12-09 08:55:03 +00:00
|
|
|
res_offset += dst_size;
|
2020-12-08 13:02:09 +00:00
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
res_data.resize(res_offset);
|
|
|
|
}
|
|
|
|
|
2020-12-09 04:40:11 +00:00
|
|
|
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
2020-12-08 13:02:09 +00:00
|
|
|
{
|
|
|
|
throw Exception("Function encodeXML cannot work with FixedString argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
static size_t execute(const char * src, size_t src_size, char * dst)
|
|
|
|
{
|
|
|
|
const char * src_prev_pos = src;
|
|
|
|
const char * src_curr_pos = src;
|
|
|
|
const char * src_end = src + src_size;
|
|
|
|
char * dst_pos = dst;
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
src_curr_pos = find_first_symbols<'<', '&', '>', '"', '\''>(src_curr_pos, src_end);
|
|
|
|
|
|
|
|
if (src_curr_pos == src_end)
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else if (*src_curr_pos == '<')
|
|
|
|
{
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
2020-12-12 00:05:35 +00:00
|
|
|
memcpy(dst_pos, "<", 4);
|
2020-12-08 13:02:09 +00:00
|
|
|
dst_pos += 4;
|
|
|
|
src_prev_pos = src_curr_pos + 1;
|
2020-12-12 00:06:09 +00:00
|
|
|
++src_curr_pos;
|
2020-12-08 13:02:09 +00:00
|
|
|
}
|
|
|
|
else if (*src_curr_pos == '&')
|
|
|
|
{
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
2020-12-12 00:05:35 +00:00
|
|
|
memcpy(dst_pos, "&", 5);
|
2020-12-08 13:02:09 +00:00
|
|
|
dst_pos += 5;
|
|
|
|
src_prev_pos = src_curr_pos + 1;
|
2020-12-12 00:06:09 +00:00
|
|
|
++src_curr_pos;
|
2020-12-08 13:02:09 +00:00
|
|
|
}
|
|
|
|
else if (*src_curr_pos == '>')
|
|
|
|
{
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
2020-12-12 00:05:35 +00:00
|
|
|
memcpy(dst_pos, ">", 4);
|
2020-12-08 13:02:09 +00:00
|
|
|
dst_pos += 4;
|
|
|
|
src_prev_pos = src_curr_pos + 1;
|
2020-12-12 00:06:09 +00:00
|
|
|
++src_curr_pos;
|
2020-12-08 13:02:09 +00:00
|
|
|
}
|
|
|
|
else if (*src_curr_pos == '"')
|
|
|
|
{
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
2020-12-12 00:05:35 +00:00
|
|
|
memcpy(dst_pos, """, 6);
|
2020-12-08 13:02:09 +00:00
|
|
|
dst_pos += 6;
|
|
|
|
src_prev_pos = src_curr_pos + 1;
|
2020-12-12 00:06:09 +00:00
|
|
|
++src_curr_pos;
|
2020-12-08 13:02:09 +00:00
|
|
|
}
|
|
|
|
else if (*src_curr_pos == '\'')
|
|
|
|
{
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
2020-12-12 00:05:35 +00:00
|
|
|
memcpy(dst_pos, "'", 6);
|
2020-12-08 13:02:09 +00:00
|
|
|
dst_pos += 6;
|
|
|
|
src_prev_pos = src_curr_pos + 1;
|
2020-12-12 00:06:09 +00:00
|
|
|
++src_curr_pos;
|
2020-12-08 13:02:09 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (src_prev_pos < src_curr_pos)
|
|
|
|
{
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
return dst_pos - dst;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-12-12 20:47:37 +00:00
|
|
|
/// The function class itself: the generic FunctionStringToString template instantiated with
/// the implementation (FunctionEncodeXMLComponentImpl) and the name tag (EncodeXMLComponentName).
using FunctionEncodeXMLComponent = FunctionStringToString<FunctionEncodeXMLComponentImpl, EncodeXMLComponentName>;
|
2020-12-08 13:02:09 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2022-07-04 07:01:39 +00:00
|
|
|
/// Registers FunctionEncodeXMLComponent in the function factory supplied by the REGISTER_FUNCTION macro.
REGISTER_FUNCTION(EncodeXMLComponent)
{
    factory.registerFunction<FunctionEncodeXMLComponent>();
}
|
|
|
|
}
|