#include #include #include #include namespace DB { namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; } namespace { struct DecodeXMLComponentName { static constexpr auto name = "decodeXMLComponent"; }; class FunctionDecodeXMLComponentImpl { public: static void vector( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { res_data.resize(data.size() * 6); size_t size = offsets.size(); res_offsets.resize(size); size_t prev_offset = 0; size_t res_offset = 0; for (size_t i = 0; i < size; ++i) { const char * src_data = reinterpret_cast(&data[prev_offset]); size_t src_size = offsets[i] - prev_offset; size_t dst_size = execute(src_data, src_size, reinterpret_cast(res_data.data() + res_offset)); res_offset += dst_size; res_offsets[i] = res_offset; prev_offset = offsets[i]; } res_data.resize(res_offset); } [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) { throw Exception("Function decodeXMLComponent cannot work with FixedString argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } private: static size_t execute(const char * src, size_t src_size, char * dst) { const char * src_prev_pos = src; const char * src_curr_pos = src; const char * src_next_pos = src; const char * src_end = src + src_size; char * dst_pos = dst; while (true) { src_curr_pos = find_first_symbols<'&'>(src_curr_pos, src_end); if (src_curr_pos == src_end) { break; } else if (*src_curr_pos == '&') { src_next_pos = find_first_symbols<';'>(src_curr_pos, src_end); if (src_next_pos == src_end || src_next_pos - src_curr_pos < 3) { src_curr_pos = src_end; break; } else if (src_next_pos - src_curr_pos == 3) { if (strncmp(src_curr_pos, "<", 3) == 0) { size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; *dst_pos = '<'; ++dst_pos; src_prev_pos = src_curr_pos + 4; } else if (strncmp(src_curr_pos, ">", 3) == 0) { size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; *dst_pos = '>'; ++dst_pos; src_prev_pos = src_curr_pos + 4; } else { src_curr_pos = src_next_pos + 1; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; src_prev_pos = src_curr_pos; } src_curr_pos += 4; } else if (src_next_pos - src_curr_pos == 4) { if (strncmp(src_curr_pos, "&", 4) == 0) { size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; *dst_pos = '&'; ++dst_pos; src_prev_pos = src_curr_pos + 5; } else { src_curr_pos = src_next_pos + 1; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; src_prev_pos = src_curr_pos; } src_curr_pos += 5; } else if (src_next_pos - src_curr_pos == 5) { if (strncmp(src_curr_pos, """, 5) == 0) { size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; *dst_pos = '"'; ++dst_pos; src_prev_pos = src_curr_pos + 6; } else if (strncmp(src_curr_pos, "&apos", 5) == 0) { size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; *dst_pos = '\''; ++dst_pos; src_prev_pos = src_curr_pos + 6; } else { src_curr_pos = src_next_pos + 1; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; src_prev_pos = src_curr_pos; } src_curr_pos += 6; } else { src_curr_pos = src_next_pos + 1; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; src_prev_pos = src_curr_pos; } } } if (src_prev_pos < src_curr_pos) { size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; } return dst_pos - dst; } }; using FunctionDecodeXMLComponent = FunctionStringToString; } void registerFunctionDecodeXMLComponent(FunctionFactory & factory) { factory.registerFunction(); } }