add encodeXMLComponent function

This commit is contained in:
nauta 2020-12-08 21:02:09 +08:00
parent 8e615a1d8d
commit ab5f397022
5 changed files with 153 additions and 0 deletions

142
src/Functions/encodeXML.cpp Normal file
View File

@ -0,0 +1,142 @@
#include <Columns/ColumnString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <common/find_symbols.h>
#include <cstring>
namespace DB
{
/// Error codes thrown from this file. They are only declared here (extern);
/// the definitions are not part of this translation unit.
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
{
/// Tag type carrying the function name; it is passed as the name parameter
/// of FunctionStringToString when the function class is assembled.
struct EncodeXMLName
{
    static constexpr const char * name = "encodeXMLComponent";
};
/// Implementation of encodeXMLComponent.
/// Replaces the five characters that are special in XML with the predefined entities:
///   <  ->  &lt;     &  ->  &amp;     >  ->  &gt;     "  ->  &quot;     '  ->  &apos;
/// All other bytes are copied to the result unchanged.
class FunctionEncodeXMLImpl
{
public:
    /// Encodes every string stored in (data, offsets) and writes the result into (res_data, res_offsets).
    /// offsets[i] is the end position of the i-th string inside data; res_offsets is filled the same way
    /// for res_data. Previous contents of the result buffers are overwritten.
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        /// A single source byte can grow up to max_amplification bytes (&quot; and &apos;),
        /// so allocate for the worst case; the buffer is trimmed to the real size at the end.
        /// Sizing it as data.size() would let execute() write past the end of the buffer.
        res_data.resize(data.size() * max_amplification);

        const size_t size = offsets.size();
        res_offsets.resize(size);

        size_t prev_offset = 0;
        size_t res_offset = 0;

        for (size_t i = 0; i < size; ++i)
        {
            const char * src_data = reinterpret_cast<const char *>(data.data() + prev_offset);
            const size_t src_size = offsets[i] - prev_offset;

            res_offset += execute(src_data, src_size, reinterpret_cast<char *>(res_data.data() + res_offset));

            res_offsets[i] = res_offset;
            prev_offset = offsets[i];
        }

        res_data.resize(res_offset);
    }

    /// FixedString arguments are not supported: always throws ILLEGAL_TYPE_OF_ARGUMENT.
    static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
    {
        throw Exception("Function encodeXMLComponent cannot work with FixedString argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

private:
    /// Length of the longest entity (&quot; / &apos;) that replaces one source byte.
    static constexpr size_t max_amplification = 6;

    /// Encodes src_size bytes starting at src into dst and returns the number of bytes written.
    /// dst must have room for src_size * max_amplification bytes.
    /// NOTE(review): memcpySmallAllowReadWriteOverflow15 is assumed (from its name) to touch up to
    /// 15 bytes past the requested range, so both buffers are expected to be padded — confirm
    /// against the definitions of ColumnString::Chars and the helper.
    static size_t execute(const char * src, size_t src_size, char * dst)
    {
        const char * src_prev_pos = src;    /// Start of the not yet copied plain run.
        const char * src_curr_pos = src;    /// Current search position.
        const char * const src_end = src + src_size;
        char * dst_pos = dst;

        while (true)
        {
            src_curr_pos = find_first_symbols<'<', '&', '>', '"', '\''>(src_curr_pos, src_end);
            if (src_curr_pos >= src_end)
                break;

            const char * entity;
            size_t entity_size;
            switch (*src_curr_pos)
            {
                case '<':
                    entity = "&lt;";
                    entity_size = 4;
                    break;
                case '&':
                    entity = "&amp;";
                    entity_size = 5;
                    break;
                case '>':
                    entity = "&gt;";
                    entity_size = 4;
                    break;
                case '"':
                    entity = "&quot;";
                    entity_size = 6;
                    break;
                default:
                    /// '\'' is the only other symbol the search above can stop at.
                    entity = "&apos;";
                    entity_size = 6;
                    break;
            }

            /// Flush the plain bytes accumulated before the special character.
            const size_t bytes_to_copy = src_curr_pos - src_prev_pos;
            memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
            dst_pos += bytes_to_copy;

            /// The entity comes from a short string literal, which has no padding after it,
            /// so it is copied with plain memcpy instead of the over-reading helper.
            memcpy(dst_pos, entity, entity_size);
            dst_pos += entity_size;

            ++src_curr_pos;
            src_prev_pos = src_curr_pos;
        }

        /// Copy the tail that follows the last special character (or the whole string if there was none).
        if (src_prev_pos < src_end)
        {
            const size_t bytes_to_copy = src_end - src_prev_pos;
            memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
            dst_pos += bytes_to_copy;
        }

        return dst_pos - dst;
    }
};
/// The function class itself: FunctionStringToString instantiated with the
/// implementation (FunctionEncodeXMLImpl) and the name tag (EncodeXMLName).
using FunctionEncodeXML = FunctionStringToString<FunctionEncodeXMLImpl, EncodeXMLName>;
}
/// Registers FunctionEncodeXML (the encodeXMLComponent function) in the given factory.
void registerFunctionEncodeXML(FunctionFactory & factory)
{
factory.registerFunction<FunctionEncodeXML>();
}
}

View File

@ -33,6 +33,7 @@ void registerFunctionRegexpQuoteMeta(FunctionFactory &);
void registerFunctionNormalizeQuery(FunctionFactory &);
void registerFunctionNormalizedQueryHash(FunctionFactory &);
void registerFunctionCountMatches(FunctionFactory &);
void registerFunctionEncodeXML(FunctionFactory & factory);
#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
@ -68,6 +69,7 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionNormalizeQuery(factory);
registerFunctionNormalizedQueryHash(factory);
registerFunctionCountMatches(factory);
registerFunctionEncodeXML(factory);
#if USE_BASE64
registerFunctionBase64Encode(factory);
registerFunctionBase64Decode(factory);

View File

@ -224,6 +224,7 @@ SRCS(
dumpColumnStructure.cpp
e.cpp
empty.cpp
encodeXML.cpp
encrypt.cpp
endsWith.cpp
equals.cpp

View File

@ -0,0 +1,4 @@
Hello, &quot;world&quot;!
&lt;123&gt;
&amp;clickhouse
&apos;foo&apos;

View File

@ -0,0 +1,4 @@
-- Each query exercises one group of characters replaced by encodeXMLComponent.
-- Double quotes become &quot;
SELECT encodeXMLComponent('Hello, "world"!');
-- Angle brackets become &lt; and &gt;
SELECT encodeXMLComponent('<123>');
-- Ampersand becomes &amp;
SELECT encodeXMLComponent('&clickhouse');
-- Single quotes become &apos;
SELECT encodeXMLComponent('\'foo\'');