From b413b450f5d7a6746a90fb4737c293174720c51f Mon Sep 17 00:00:00 2001 From: nauta <870284156@qq.com> Date: Sat, 26 Dec 2020 16:00:35 +0800 Subject: [PATCH 01/16] add decode XML function --- contrib/openssl | 1 + contrib/ryu | 1 + src/Functions/decodeXMLComponent.cpp | 191 ++++++++++++++++++ src/Functions/registerFunctionsString.cpp | 3 + src/Functions/ya.make | 1 + .../0_stateless/01621_decode_XML.reference | 4 + .../queries/0_stateless/01621_decode_XML.sql | 4 + 7 files changed, 205 insertions(+) create mode 160000 contrib/openssl create mode 160000 contrib/ryu create mode 100644 src/Functions/decodeXMLComponent.cpp create mode 100644 tests/queries/0_stateless/01621_decode_XML.reference create mode 100644 tests/queries/0_stateless/01621_decode_XML.sql diff --git a/contrib/openssl b/contrib/openssl new file mode 160000 index 00000000000..237260dd6a4 --- /dev/null +++ b/contrib/openssl @@ -0,0 +1 @@ +Subproject commit 237260dd6a4bca5cb5a321d366a8a9c807957455 diff --git a/contrib/ryu b/contrib/ryu new file mode 160000 index 00000000000..5b4a853534b --- /dev/null +++ b/contrib/ryu @@ -0,0 +1 @@ +Subproject commit 5b4a853534b47438b4d97935370f6b2397137c2b diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp new file mode 100644 index 00000000000..e546a8ba075 --- /dev/null +++ b/src/Functions/decodeXMLComponent.cpp @@ -0,0 +1,191 @@ +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ + struct DecodeXMLComponentName + { + static constexpr auto name = "decodeXMLComponent"; + }; + + class FunctionDecodeXMLComponentImpl + { + public: + static void vector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.resize(data.size() * 6); + size_t size = offsets.size(); + res_offsets.resize(size); + + size_t prev_offset = 0; + size_t res_offset = 0; + + for (size_t i = 0; i < size; ++i) + { + const char * src_data = reinterpret_cast(&data[prev_offset]); + size_t src_size = offsets[i] - prev_offset; + size_t dst_size = execute(src_data, src_size, reinterpret_cast(res_data.data() + res_offset)); + + res_offset += dst_size; + res_offsets[i] = res_offset; + prev_offset = offsets[i]; + } + + res_data.resize(res_offset); + } + + [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) + { + throw Exception("Function decodeXMLComponent cannot work with FixedString argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + private: + static size_t execute(const char * src, size_t src_size, char * dst) + { + const char * src_prev_pos = src; + const char * src_curr_pos = src; + const char * src_next_pos = src; + const char * src_end = src + src_size; + char * dst_pos = dst; + + while (true) + { + src_curr_pos = find_first_symbols<'&'>(src_curr_pos, src_end); + + if (src_curr_pos == src_end) + { + break; + } + else if (*src_curr_pos == '&') + { + src_next_pos = find_first_symbols<';'>(src_curr_pos, src_end); + if (src_next_pos == src_end || src_next_pos - src_curr_pos < 3) + { + src_curr_pos = src_end; + break; + } + else if (src_next_pos - src_curr_pos == 3) + { + if (strncmp(src_curr_pos, "<", 3) == 0) + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + *dst_pos = '<'; + ++dst_pos; + src_prev_pos = src_curr_pos + 4; + } + else if (strncmp(src_curr_pos, ">", 3) == 0) + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + *dst_pos = '>'; + ++dst_pos; + src_prev_pos = src_curr_pos + 4; + } + else + { + src_curr_pos = src_next_pos + 1; + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos; + } + src_curr_pos += 4; + } + else if (src_next_pos - src_curr_pos == 4) + { + if (strncmp(src_curr_pos, "&", 4) == 0) + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + *dst_pos = '&'; + ++dst_pos; + src_prev_pos = src_curr_pos + 5; + } + else + { + src_curr_pos = src_next_pos + 1; + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos; + } + src_curr_pos += 5; + } + else if (src_next_pos - src_curr_pos == 5) + { + if (strncmp(src_curr_pos, """, 5) == 0) + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + *dst_pos = '"'; + ++dst_pos; + src_prev_pos = src_curr_pos + 6; + } + else if (strncmp(src_curr_pos, "&apos", 5) == 0) + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + *dst_pos = '\''; + ++dst_pos; + src_prev_pos = src_curr_pos + 6; + } + else + { + src_curr_pos = src_next_pos + 1; + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos; + } + src_curr_pos += 6; + } + else + { + src_curr_pos = src_next_pos + 1; + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos; + } + } + } + + if (src_prev_pos < src_curr_pos) + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + } + + return dst_pos - dst; + } + }; + + using FunctionDecodeXMLComponent = FunctionStringToString; + +} // namespace + +void registerFunctionDecodeXMLComponent(FunctionFactory & factory) +{ + factory.registerFunction(); +} +} // namespace DB diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index 426cc8f8d56..5cf30dd83a6 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -34,6 +34,8 @@ void registerFunctionNormalizeQuery(FunctionFactory &); void registerFunctionNormalizedQueryHash(FunctionFactory &); void registerFunctionCountMatches(FunctionFactory &); void registerFunctionEncodeXMLComponent(FunctionFactory & factory); +void registerFunctionDecodeXMLComponent(FunctionFactory & factory); + #if USE_BASE64 void registerFunctionBase64Encode(FunctionFactory &); @@ -70,6 +72,7 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionNormalizedQueryHash(factory); registerFunctionCountMatches(factory); registerFunctionEncodeXMLComponent(factory); + registerFunctionDecodeXMLComponent(factory); #if USE_BASE64 registerFunctionBase64Encode(factory); registerFunctionBase64Decode(factory); diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 7e64deef64d..9ebebab6658 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -220,6 +220,7 @@ SRCS( currentUser.cpp dateDiff.cpp date_trunc.cpp + decodeXMLComponent.cpp decrypt.cpp defaultValueOfArgumentType.cpp defaultValueOfTypeName.cpp diff --git a/tests/queries/0_stateless/01621_decode_XML.reference b/tests/queries/0_stateless/01621_decode_XML.reference new file mode 100644 index 00000000000..dab7a2d14e1 --- /dev/null +++ b/tests/queries/0_stateless/01621_decode_XML.reference @@ -0,0 +1,4 @@ +Hello, "world"! +<123> +&clickhouse +\'foo\' diff --git a/tests/queries/0_stateless/01621_decode_XML.sql b/tests/queries/0_stateless/01621_decode_XML.sql new file mode 100644 index 00000000000..4c9404e6925 --- /dev/null +++ b/tests/queries/0_stateless/01621_decode_XML.sql @@ -0,0 +1,4 @@ +SELECT decodeXMLComponent('Hello, "world"!'); +SELECT decodeXMLComponent('<123>'); +SELECT decodeXMLComponent('&clickhouse'); +SELECT decodeXMLComponent(''foo''); \ No newline at end of file From 9e5864f69441074bb49f6058b3bf48f71b5dc24b Mon Sep 17 00:00:00 2001 From: nauta <870284156@qq.com> Date: Sat, 26 Dec 2020 16:31:05 +0800 Subject: [PATCH 02/16] fix --- contrib/openssl | 1 - contrib/ryu | 1 - 2 files changed, 2 deletions(-) delete mode 160000 contrib/openssl delete mode 160000 contrib/ryu diff --git a/contrib/openssl b/contrib/openssl deleted file mode 160000 index 237260dd6a4..00000000000 --- a/contrib/openssl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 237260dd6a4bca5cb5a321d366a8a9c807957455 diff --git a/contrib/ryu b/contrib/ryu deleted file mode 160000 index 5b4a853534b..00000000000 --- a/contrib/ryu +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5b4a853534b47438b4d97935370f6b2397137c2b From 839a4bfbd8572ab3ec14a729628431ab87a805b5 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 26 Dec 2020 15:01:14 +0300 Subject: [PATCH 03/16] Update decodeXMLComponent.cpp --- src/Functions/decodeXMLComponent.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index e546a8ba075..8060c112268 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -182,10 +182,10 @@ namespace using FunctionDecodeXMLComponent = FunctionStringToString; -} // namespace +} void registerFunctionDecodeXMLComponent(FunctionFactory & factory) { factory.registerFunction(); } -} // namespace DB +} From 6cdab36a7d2888280c808b01d7486e6a9c5652e6 Mon Sep 17 00:00:00 2001 From: nauta <870284156@qq.com> Date: Sat, 26 Dec 2020 20:18:48 +0800 Subject: [PATCH 04/16] fix bug --- src/Functions/decodeXMLComponent.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index 8060c112268..1cae1e2956c 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -26,7 +26,7 @@ namespace ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - res_data.resize(data.size() * 6); + res_data.resize(data.size()); size_t size = offsets.size(); res_offsets.resize(size); From ec8e3f2409c2979c9e687c775a7dcb91496bc5b7 Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Tue, 5 Jan 2021 21:08:36 +0800 Subject: [PATCH 05/16] add decode numeric entities --- src/Functions/decodeXMLComponent.cpp | 70 +++++++++++++++++-- .../0_stateless/01621_decode_XML.reference | 20 ++++++ .../queries/0_stateless/01621_decode_XML.sql | 24 ++++++- 3 files changed, 107 insertions(+), 7 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index 1cae1e2956c..b2f42eabd02 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -3,6 +3,7 @@ #include #include +#include namespace DB { namespace ErrorCodes @@ -53,6 +54,8 @@ namespace } private: + static const int min_XML_number = 32; + static const int max_XML_number = 126; static size_t execute(const char * src, size_t src_size, char * dst) { const char * src_prev_pos = src; @@ -72,7 +75,7 @@ namespace else if (*src_curr_pos == '&') { src_next_pos = find_first_symbols<';'>(src_curr_pos, src_end); - if (src_next_pos == src_end || src_next_pos - src_curr_pos < 3) + if (src_next_pos == src_end) { src_curr_pos = src_end; break; @@ -99,11 +102,12 @@ namespace } else { - src_curr_pos = src_next_pos + 1; + ++src_curr_pos; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; src_prev_pos = src_curr_pos; + continue; } src_curr_pos += 4; } @@ -118,13 +122,34 @@ namespace ++dst_pos; src_prev_pos = src_curr_pos + 5; } + else if (*(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3))) + { + char numeric_character = decodeNumberPart(src_curr_pos + 2); + if (numeric_character == '\0') + { + size_t bytes_to_copy = src_next_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos + 5; + } + else + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + *dst_pos = '\0' + numeric_character; + ++dst_pos; + src_prev_pos = src_curr_pos + 5; + } + } else { - src_curr_pos = src_next_pos + 1; + ++src_curr_pos; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; src_prev_pos = src_curr_pos; + continue; } src_curr_pos += 5; } @@ -148,19 +173,42 @@ namespace ++dst_pos; src_prev_pos = src_curr_pos + 6; } + else if ( + *(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3)) + && isdigit(*(src_curr_pos + 4))) + { + char numeric_character = decodeNumberPart(src_curr_pos + 2); + if (numeric_character == '\0') + { + size_t bytes_to_copy = src_next_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos + 6; + } + else + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + *dst_pos = '\0' + numeric_character; + ++dst_pos; + src_prev_pos = src_curr_pos + 6; + } + } else { - src_curr_pos = src_next_pos + 1; + ++src_curr_pos; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; src_prev_pos = src_curr_pos; + continue; } src_curr_pos += 6; } else { - src_curr_pos = src_next_pos + 1; + ++src_curr_pos; size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; @@ -178,6 +226,16 @@ namespace return dst_pos - dst; } + + static inline char decodeNumberPart(const char * src) + { + auto ans = strtol(src, nullptr, 10); + if (ans >= min_XML_number && ans <= max_XML_number) + { + return '\0' + ans; + } + return '\0'; + } }; using FunctionDecodeXMLComponent = FunctionStringToString; @@ -188,4 +246,4 @@ void registerFunctionDecodeXMLComponent(FunctionFactory & factory) { factory.registerFunction(); } -} +} \ No newline at end of file diff --git a/tests/queries/0_stateless/01621_decode_XML.reference b/tests/queries/0_stateless/01621_decode_XML.reference index dab7a2d14e1..854f453ca10 100644 --- a/tests/queries/0_stateless/01621_decode_XML.reference +++ b/tests/queries/0_stateless/01621_decode_XML.reference @@ -2,3 +2,23 @@ Hello, "world"! <123> &clickhouse \'foo\' +Hello, && world +Hello, &;& world +Hello, &a;& world +Hello, <t;& world +Hello, <t& world +Hello, &t;& world +� +  + !"#$%&\'()*+,-./012 +)*+,-./0123456789:;< +=>?@ABCDEFGHIJKLMNOP +QRSTUVWXYZ[\\]^_`abcd +efghijklmnopqrstuvwx +yz{|}~€‚ƒ„…†‡ˆ‰Š‹Œ +Ž‘’“”•–—˜™š›œžŸ  +¡¢£¤¥¦§¨©ª«¬­®¯°±²³´ +µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈ +ÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜ +ÝÞßàáâãäåæçèéêëìíîïð +ñòóôõö÷øùúûüýþÿĀ diff --git a/tests/queries/0_stateless/01621_decode_XML.sql b/tests/queries/0_stateless/01621_decode_XML.sql index 4c9404e6925..04319ad1759 100644 --- a/tests/queries/0_stateless/01621_decode_XML.sql +++ b/tests/queries/0_stateless/01621_decode_XML.sql @@ -1,4 +1,26 @@ SELECT decodeXMLComponent('Hello, "world"!'); SELECT decodeXMLComponent('<123>'); SELECT decodeXMLComponent('&clickhouse'); -SELECT decodeXMLComponent(''foo''); \ No newline at end of file +SELECT decodeXMLComponent(''foo''); +SELECT decodeXMLComponent('Hello, && world'); +SELECT decodeXMLComponent('Hello, &;& world'); +SELECT decodeXMLComponent('Hello, &a;& world'); +SELECT decodeXMLComponent('Hello, <t;& world'); +SELECT decodeXMLComponent('Hello, <t& world'); +SELECT decodeXMLComponent('Hello, &t;& world'); + +--decode numeric entities +SELECT decodeXMLComponent('� '); +SELECT decodeXMLComponent(' '); +SELECT decodeXMLComponent(' !"#$%&'()*+,-./012'); +SELECT decodeXMLComponent(')*+,-./0123456789:;<'); +SELECT decodeXMLComponent('=>?@ABCDEFGHIJKLMNOP'); +SELECT decodeXMLComponent('QRSTUVWXYZ[\]^_`abcd'); +SELECT decodeXMLComponent('efghijklmnopqrstuvwx'); +SELECT decodeXMLComponent('yz{|}~€‚ƒ„…†‡ˆ‰Š‹Œ'); +SELECT decodeXMLComponent('Ž‘’“”•–—˜™š›œžŸ '); +SELECT decodeXMLComponent('¡¢£¤¥¦§¨©ª«¬­®¯°±²³´'); +SELECT decodeXMLComponent('µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈ'); +SELECT decodeXMLComponent('ÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜ'); +SELECT decodeXMLComponent('ÝÞßàáâãäåæçèéêëìíîïð'); +SELECT decodeXMLComponent('ñòóôõö÷øùúûüýþÿĀ'); \ No newline at end of file From 48ac1ce81ac4454e4affe49211ab6b30098b624b Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Tue, 5 Jan 2021 21:41:18 +0800 Subject: [PATCH 06/16] fix --- src/Functions/decodeXMLComponent.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index b2f42eabd02..2fdefa8a6c7 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -246,4 +246,4 @@ void registerFunctionDecodeXMLComponent(FunctionFactory & factory) { factory.registerFunction(); } -} \ No newline at end of file +} From 709556832fac124ff712944ae28061317f0a5f3f Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Tue, 5 Jan 2021 22:28:09 +0800 Subject: [PATCH 07/16] fix typos --- src/Functions/decodeXMLComponent.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index 2fdefa8a6c7..7aef3445614 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -229,10 +229,10 @@ namespace static inline char decodeNumberPart(const char * src) { - auto ans = strtol(src, nullptr, 10); - if (ans >= min_XML_number && ans <= max_XML_number) + auto numberic_ans = strtol(src, nullptr, 10); + if (numberic_ans >= min_XML_number && ans <= max_XML_number) { - return '\0' + ans; + return '\0' + numberic_ans; } return '\0'; } From 6ecf505f48d53ce0814eb1d5d4482e903a104f3d Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Tue, 5 Jan 2021 23:08:36 +0800 Subject: [PATCH 08/16] fix typos --- src/Functions/decodeXMLComponent.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index 7aef3445614..a4cad7834b2 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -230,7 +230,7 @@ namespace static inline char decodeNumberPart(const char * src) { auto numberic_ans = strtol(src, nullptr, 10); - if (numberic_ans >= min_XML_number && ans <= max_XML_number) + if (numberic_ans >= min_XML_number && numberic_ans <= max_XML_number) { return '\0' + numberic_ans; } From 8359289283f98ba32d88ce87fba2e38156a0af80 Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Sat, 9 Jan 2021 19:08:19 +0800 Subject: [PATCH 09/16] add unicode decode --- src/Functions/decodeXMLComponent.cpp | 189 +++++++++++++----- .../0_stateless/01621_decode_XML.reference | 17 +- .../queries/0_stateless/01621_decode_XML.sql | 18 +- 3 files changed, 150 insertions(+), 74 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index a4cad7834b2..8c46976a718 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -1,9 +1,10 @@ #include #include #include +#include #include -#include +#include namespace DB { namespace ErrorCodes @@ -54,8 +55,8 @@ namespace } private: - static const int min_XML_number = 32; - static const int max_XML_number = 126; + static const int max_legal_unicode_value = 0x10FFFF; + static const int max_legal_unicode_bits = 7; static size_t execute(const char * src, size_t src_size, char * dst) { const char * src_prev_pos = src; @@ -80,6 +81,32 @@ namespace src_curr_pos = src_end; break; } + else if (isValidNumeric(src_curr_pos, src_next_pos)) + { + std::vector decodeNumericChars; + decodeNumericPart(src_curr_pos + 2, src_next_pos, decodeNumericChars); + if (decodeNumericChars.empty()) + { + ++src_curr_pos; + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos; + } + else + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + for (size_t i = 0; i < decodeNumericChars.size(); i++) + { + *dst_pos = decodeNumericChars[i]; + ++dst_pos; + } + src_prev_pos = src_next_pos + 1; + } + src_curr_pos = src_next_pos + 1; + } else if (src_next_pos - src_curr_pos == 3) { if (strncmp(src_curr_pos, "<", 3) == 0) @@ -122,26 +149,6 @@ namespace ++dst_pos; src_prev_pos = src_curr_pos + 5; } - else if (*(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3))) - { - char numeric_character = decodeNumberPart(src_curr_pos + 2); - if (numeric_character == '\0') - { - size_t bytes_to_copy = src_next_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos + 5; - } - else - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '\0' + numeric_character; - ++dst_pos; - src_prev_pos = src_curr_pos + 5; - } - } else { ++src_curr_pos; @@ -173,28 +180,6 @@ namespace ++dst_pos; src_prev_pos = src_curr_pos + 6; } - else if ( - *(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3)) - && isdigit(*(src_curr_pos + 4))) - { - char numeric_character = decodeNumberPart(src_curr_pos + 2); - if (numeric_character == '\0') - { - size_t bytes_to_copy = src_next_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos + 6; - } - else - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '\0' + numeric_character; - ++dst_pos; - src_prev_pos = src_curr_pos + 6; - } - } else { ++src_curr_pos; @@ -227,14 +212,118 @@ namespace return dst_pos - dst; } - static inline char decodeNumberPart(const char * src) + static void decodeNumericPart(const char * src, const char * end, std::vector & decodeNumericChars) { - auto numberic_ans = strtol(src, nullptr, 10); - if (numberic_ans >= min_XML_number && numberic_ans <= max_XML_number) + int numeric_ans; + if (*src == 'x' || *src == 'X') { - return '\0' + numberic_ans; + numeric_ans = hexOrDecStrToInt(src + 1, end, 16); + } + else + { + numeric_ans = hexOrDecStrToInt(src, end, 10); + } + const auto num_bits = numBitsCount(numeric_ans); + if (num_bits <= 7) + { + decodeNumericChars.push_back('\0' + (numeric_ans & 0x7F)); + } + else if (num_bits <= 11) + { + decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x1F) + 0xC0); + decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + } + else if (num_bits <= 16) + { + decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x0F) + 0xE0); + decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80); + decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + } + else if ((num_bits <= 21) && (numeric_ans <= max_legal_unicode_value)) + { + decodeNumericChars.push_back('\0' + ((numeric_ans >> 18) & 0x07) + 0xF0); + decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x3F) + 0x80); + decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80); + decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + } + } + + static int hexOrDecStrToInt(const char * src, const char * end, int base) + { + int numeric_ans = 0; + int pos = 0; + if (base == 16) + { + while (src + pos != end) + { + if (isNumericASCII(*(src + pos))) + { + numeric_ans = numeric_ans * base + (*(src + pos) - '0'); + } + else if (*(src + pos) >= 'a' && *(src + pos) <= 'f') + { + numeric_ans = numeric_ans * base + (*(src + pos) - 'a' + 10); + } + else if (*(src + pos) >= 'A' && *(src + pos) <= 'F') + { + numeric_ans = numeric_ans * base + (*(src + pos) - 'A' + 10); + } + ++pos; + } + } + else + { + while (src + pos != end) + { + numeric_ans = numeric_ans * base + (*(src + pos) - '0'); + ++pos; + } + } + return numeric_ans; + } + static int numBitsCount(int integer) + { + size_t num_bits = 0; + while (integer > 0) + { + ++num_bits; + integer >>= 1; + } + return num_bits; + } + static bool isValidNumeric(const char * src, const char * end) + { + int pos; + if (*src != '&' || *(src + 1) != '#' || (end - (src + 2) > max_legal_unicode_bits)) + { + return false; + } + if (*(src + 2) == 'x' || *(src + 2) == 'X') + { + pos = 3; + while (src + pos != end) + { + if (!isHexDigit(*(src + pos))) + { + return false; + } + ++pos; + } + return true; + } + else + { + pos = 2; + while (src + pos != end) + { + if (!isNumericASCII(*(src + pos))) + { + return false; + } + ++pos; + } + return true; } - return '\0'; } }; diff --git a/tests/queries/0_stateless/01621_decode_XML.reference b/tests/queries/0_stateless/01621_decode_XML.reference index 854f453ca10..3463fa0788c 100644 --- a/tests/queries/0_stateless/01621_decode_XML.reference +++ b/tests/queries/0_stateless/01621_decode_XML.reference @@ -8,17 +8,10 @@ Hello, &a;& world Hello, <t;& world Hello, <t& world Hello, &t;& world -� -  - !"#$%&\'()*+,-./012 + !"#$%&\'()*+,-./012 )*+,-./0123456789:;< =>?@ABCDEFGHIJKLMNOP -QRSTUVWXYZ[\\]^_`abcd -efghijklmnopqrstuvwx -yz{|}~€‚ƒ„…†‡ˆ‰Š‹Œ -Ž‘’“”•–—˜™š›œžŸ  -¡¢£¤¥¦§¨©ª«¬­®¯°±²³´ -µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈ -ÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜ -ÝÞßàáâãäåæçèéêëìíîïð -ñòóôõö÷øùúûüýþÿĀ +为什么 +为什么 +�\'123 +ЦЦЮЮЫㄱ diff --git a/tests/queries/0_stateless/01621_decode_XML.sql b/tests/queries/0_stateless/01621_decode_XML.sql index 04319ad1759..b111520db4c 100644 --- a/tests/queries/0_stateless/01621_decode_XML.sql +++ b/tests/queries/0_stateless/01621_decode_XML.sql @@ -10,17 +10,11 @@ SELECT decodeXMLComponent('Hello, <t& world'); SELECT decodeXMLComponent('Hello, &t;& world'); --decode numeric entities -SELECT decodeXMLComponent('� '); -SELECT decodeXMLComponent(' '); -SELECT decodeXMLComponent(' !"#$%&'()*+,-./012'); + +SELECT decodeXMLComponent(' !"#$%&'()*+,-./012'); SELECT decodeXMLComponent(')*+,-./0123456789:;<'); SELECT decodeXMLComponent('=>?@ABCDEFGHIJKLMNOP'); -SELECT decodeXMLComponent('QRSTUVWXYZ[\]^_`abcd'); -SELECT decodeXMLComponent('efghijklmnopqrstuvwx'); -SELECT decodeXMLComponent('yz{|}~€‚ƒ„…†‡ˆ‰Š‹Œ'); -SELECT decodeXMLComponent('Ž‘’“”•–—˜™š›œžŸ '); -SELECT decodeXMLComponent('¡¢£¤¥¦§¨©ª«¬­®¯°±²³´'); -SELECT decodeXMLComponent('µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈ'); -SELECT decodeXMLComponent('ÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜ'); -SELECT decodeXMLComponent('ÝÞßàáâãäåæçèéêëìíîïð'); -SELECT decodeXMLComponent('ñòóôõö÷øùúûüýþÿĀ'); \ No newline at end of file +SELECT decodeXMLComponent('为'); +SELECT decodeXMLComponent('为'); +SELECT decodeXMLComponent('�'123'); +SELECT decodeXMLComponent('ЦЦЮЮЫㄱ'); \ No newline at end of file From 33996d41f5d19cfc1cc66b1bf46c6b20843802ba Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Sat, 9 Jan 2021 19:15:09 +0800 Subject: [PATCH 10/16] add test cases reference --- tests/queries/0_stateless/01621_decode_XML.reference | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01621_decode_XML.reference b/tests/queries/0_stateless/01621_decode_XML.reference index 3463fa0788c..d4fa75bbf94 100644 --- a/tests/queries/0_stateless/01621_decode_XML.reference +++ b/tests/queries/0_stateless/01621_decode_XML.reference @@ -11,7 +11,7 @@ Hello, &t;& world !"#$%&\'()*+,-./012 )*+,-./0123456789:;< =>?@ABCDEFGHIJKLMNOP -为什么 -为什么 +为 +为 �\'123 ЦЦЮЮЫㄱ From 445bc1c65cb859e661065e174bfc3fbb8166a85e Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Sat, 9 Jan 2021 22:39:25 +0800 Subject: [PATCH 11/16] fix bug --- src/Functions/decodeXMLComponent.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index 8c46976a718..ff25db76290 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -83,9 +83,9 @@ namespace } else if (isValidNumeric(src_curr_pos, src_next_pos)) { - std::vector decodeNumericChars; - decodeNumericPart(src_curr_pos + 2, src_next_pos, decodeNumericChars); - if (decodeNumericChars.empty()) + std::vector decode_numeric_chars; + decodeNumericPart(src_curr_pos + 2, src_next_pos, decode_numeric_chars); + if (decode_numeric_chars.empty()) { ++src_curr_pos; size_t bytes_to_copy = src_curr_pos - src_prev_pos; @@ -98,9 +98,9 @@ namespace size_t bytes_to_copy = src_curr_pos - src_prev_pos; memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); dst_pos += bytes_to_copy; - for (size_t i = 0; i < decodeNumericChars.size(); i++) + for (auto cur_char : decode_numeric_chars) { - *dst_pos = decodeNumericChars[i]; + *dst_pos = cur_char; ++dst_pos; } src_prev_pos = src_next_pos + 1; From d7456f8ddd8b92cfe3f1a28bc49450c5df9b2caa Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Sun, 10 Jan 2021 16:18:26 +0800 Subject: [PATCH 12/16] update decodeXMLComponent.cpp --- src/Functions/decodeXMLComponent.cpp | 88 +++++++++++----------------- 1 file changed, 35 insertions(+), 53 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index ff25db76290..603aed08f53 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -2,9 +2,9 @@ #include #include #include +#include #include -#include namespace DB { namespace ErrorCodes @@ -83,28 +83,29 @@ namespace } else if (isValidNumeric(src_curr_pos, src_next_pos)) { - std::vector decode_numeric_chars; - decodeNumericPart(src_curr_pos + 2, src_next_pos, decode_numeric_chars); - if (decode_numeric_chars.empty()) + int numeric_entity; + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + if (*(src_curr_pos + 2) == 'x' || *(src_curr_pos + 2) == 'X') { - ++src_curr_pos; - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos; + numeric_entity = hexOrDecStrToInt(src_curr_pos + 3, src_next_pos, 0x10); } else { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - for (auto cur_char : decode_numeric_chars) - { - *dst_pos = cur_char; - ++dst_pos; - } - src_prev_pos = src_next_pos + 1; + numeric_entity = hexOrDecStrToInt(src_curr_pos + 2, src_next_pos, 10); } + if (numeric_entity > max_legal_unicode_value) + { + bytes_to_copy = src_next_pos - src_curr_pos + 1; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_curr_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + } + else + { + decodeNumericPart(numeric_entity, dst_pos); + } + src_prev_pos = src_next_pos + 1; src_curr_pos = src_next_pos + 1; } else if (src_next_pos - src_curr_pos == 3) @@ -212,39 +213,31 @@ namespace return dst_pos - dst; } - static void decodeNumericPart(const char * src, const char * end, std::vector & decodeNumericChars) + static void decodeNumericPart(int numeric_entity, char *& dst_pos) { - int numeric_ans; - if (*src == 'x' || *src == 'X') - { - numeric_ans = hexOrDecStrToInt(src + 1, end, 16); - } - else - { - numeric_ans = hexOrDecStrToInt(src, end, 10); - } - const auto num_bits = numBitsCount(numeric_ans); + const auto num_bits = numBitsCount(numeric_entity); if (num_bits <= 7) { - decodeNumericChars.push_back('\0' + (numeric_ans & 0x7F)); + *(dst_pos++) = '\0' + (numeric_entity & 0x7F); } else if (num_bits <= 11) { - decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x1F) + 0xC0); - decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + *(dst_pos++) = '\0' + ((numeric_entity >> 6) & 0x1F) + 0xC0; + + *(dst_pos++) = '\0' + (numeric_entity & 0x3F) + 0x80; } else if (num_bits <= 16) { - decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x0F) + 0xE0); - decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80); - decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + *(dst_pos++) = '\0' + ((numeric_entity >> 12) & 0x0F) + 0xE0; + *(dst_pos++) = '\0' + ((numeric_entity >> 6) & 0x3F) + 0x80; + *(dst_pos++) = '\0' + (numeric_entity & 0x3F) + 0x80; } - else if ((num_bits <= 21) && (numeric_ans <= max_legal_unicode_value)) + else { - decodeNumericChars.push_back('\0' + ((numeric_ans >> 18) & 0x07) + 0xF0); - decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x3F) + 0x80); - decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80); - decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + *(dst_pos++) = '\0' + ((numeric_entity >> 18) & 0x07) + 0xF0; + *(dst_pos++) = '\0' + ((numeric_entity >> 12) & 0x3F) + 0x80; + *(dst_pos++) = '\0' + ((numeric_entity >> 6) & 0x3F) + 0x80; + *(dst_pos++) = '\0' + (numeric_entity & 0x3F) + 0x80; } } @@ -252,22 +245,11 @@ namespace { int numeric_ans = 0; int pos = 0; - if (base == 16) + if (base == 0x10) { while (src + pos != end) { - if (isNumericASCII(*(src + pos))) - { - numeric_ans = numeric_ans * base + (*(src + pos) - '0'); - } - else if (*(src + pos) >= 'a' && *(src + pos) <= 'f') - { - numeric_ans = numeric_ans * base + (*(src + pos) - 'a' + 10); - } - else if (*(src + pos) >= 'A' && *(src + pos) <= 'F') - { - numeric_ans = numeric_ans * base + (*(src + pos) - 'A' + 10); - } + numeric_ans = numeric_ans * 0x10 + static_cast(unhex(*(src + pos))); ++pos; } } From a0982bfd554bad1dda0d082f676f37074bf09f65 Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Sun, 10 Jan 2021 16:21:21 +0800 Subject: [PATCH 13/16] update decodeXMLComponent.cpp --- src/Functions/decodeXMLComponent.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index 603aed08f53..ffc10d3a7bb 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -223,7 +223,6 @@ namespace else if (num_bits <= 11) { *(dst_pos++) = '\0' + ((numeric_entity >> 6) & 0x1F) + 0xC0; - *(dst_pos++) = '\0' + (numeric_entity & 0x3F) + 0x80; } else if (num_bits <= 16) From 9db1299f4135831c6cbc5cf9dddd88596bb9d91b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Jan 2021 00:25:19 +0300 Subject: [PATCH 14/16] Improve code --- src/Functions/decodeXMLComponent.cpp | 307 ++++++++++----------------- 1 file changed, 113 insertions(+), 194 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index ffc10d3a7bb..9b9cf5a081d 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -5,6 +5,7 @@ #include #include + namespace DB { namespace ErrorCodes @@ -28,7 +29,11 @@ namespace ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { + /// The size of result is always not more than the size of source. + /// Because entities decodes to the shorter byte sequence. + /// Example: &#xx... &#xx... will decode to UTF-8 byte sequence not longer than 4 bytes. res_data.resize(data.size()); + size_t size = offsets.size(); res_offsets.resize(size); @@ -56,255 +61,169 @@ namespace private: static const int max_legal_unicode_value = 0x10FFFF; - static const int max_legal_unicode_bits = 7; + static const int max_decimal_length_of_unicode_point = 7; /// 1114111 + static size_t execute(const char * src, size_t src_size, char * dst) { - const char * src_prev_pos = src; - const char * src_curr_pos = src; - const char * src_next_pos = src; + const char * src_pos = src; const char * src_end = src + src_size; char * dst_pos = dst; while (true) { - src_curr_pos = find_first_symbols<'&'>(src_curr_pos, src_end); + const char * entity_pos = find_first_symbols<'&'>(src_pos, src_end); - if (src_curr_pos == src_end) - { + if (entity_pos + strlen("lt;") >= src_end) break; - } - else if (*src_curr_pos == '&') + + /// Copy text between entities. + size_t bytes_to_copy = entity_pos - src_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_pos = entity_pos; + + ++entity_pos; + + const char * entity_end = find_first_symbols<';'>(entity_pos, src_end); + if (entity_end == src_end) + break; + + bool parsed = false; + + /// &#NNNN; or &#xNNNN; + uint32_t code_point = 0; + if (isValidNumericEntity(entity_pos, entity_end, code_point)) { - src_next_pos = find_first_symbols<';'>(src_curr_pos, src_end); - if (src_next_pos == src_end) + codePointToUTF8(code_point, dst_pos); + parsed = true; + } + else if (entity_end - entity_pos == 2) + { + if (memcmp(entity_pos, "lt", 2) == 0) { - src_curr_pos = src_end; - break; + *dst_pos = '<'; + ++dst_pos; + parsed = true; } - else if (isValidNumeric(src_curr_pos, src_next_pos)) + else if (memcmp(entity_pos, "gt", 2) == 0) { - int numeric_entity; - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - if (*(src_curr_pos + 2) == 'x' || *(src_curr_pos + 2) == 'X') - { - numeric_entity = hexOrDecStrToInt(src_curr_pos + 3, src_next_pos, 0x10); - } - else - { - numeric_entity = hexOrDecStrToInt(src_curr_pos + 2, src_next_pos, 10); - } - if (numeric_entity > max_legal_unicode_value) - { - bytes_to_copy = src_next_pos - src_curr_pos + 1; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_curr_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - } - else - { - decodeNumericPart(numeric_entity, dst_pos); - } - src_prev_pos = src_next_pos + 1; - src_curr_pos = src_next_pos + 1; + *dst_pos = '>'; + ++dst_pos; + parsed = true; } - else if (src_next_pos - src_curr_pos == 3) + } + else if (entity_end - entity_pos == 3) + { + if (memcmp(entity_pos, "amp", 3) == 0) { - if (strncmp(src_curr_pos, "<", 3) == 0) - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '<'; - ++dst_pos; - src_prev_pos = src_curr_pos + 4; - } - else if (strncmp(src_curr_pos, ">", 3) == 0) - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '>'; - ++dst_pos; - src_prev_pos = src_curr_pos + 4; - } - else - { - ++src_curr_pos; - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos; - continue; - } - src_curr_pos += 4; + *dst_pos = '&'; + ++dst_pos; + parsed = true; } - else if (src_next_pos - src_curr_pos == 4) + } + else if (entity_end - entity_pos == 4) + { + if (memcmp(entity_pos, "quot", 4) == 0) { - if (strncmp(src_curr_pos, "&", 4) == 0) - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '&'; - ++dst_pos; - src_prev_pos = src_curr_pos + 5; - } - else - { - ++src_curr_pos; - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos; - continue; - } - src_curr_pos += 5; + *dst_pos = '"'; + ++dst_pos; + parsed = true; } - else if (src_next_pos - src_curr_pos == 5) + else if (memcmp(entity_pos, "apos", 4) == 0) { - if (strncmp(src_curr_pos, """, 5) == 0) - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '"'; - ++dst_pos; - src_prev_pos = src_curr_pos + 6; - } - else if (strncmp(src_curr_pos, "&apos", 5) == 0) - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '\''; - ++dst_pos; - src_prev_pos = src_curr_pos + 6; - } - else - { - ++src_curr_pos; - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos; - continue; - } - src_curr_pos += 6; - } - else - { - ++src_curr_pos; - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos; + *dst_pos = '\''; + ++dst_pos; + parsed = true; } } + + if (parsed) + { + /// Skip the parsed entity. + src_pos = entity_end + 1; + } + else + { + /// Copy one byte as is and skip it. + *dst_pos = *src_pos; + ++dst_pos; + ++src_pos; + } } - if (src_prev_pos < src_curr_pos) + /// Copy the rest of the string. + if (src_pos < src_end) { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + size_t bytes_to_copy = src_end - src_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_pos, bytes_to_copy); dst_pos += bytes_to_copy; } return dst_pos - dst; } - static void decodeNumericPart(int numeric_entity, char *& dst_pos) + static void codePointToUTF8(uint32_t code_point, char *& dst_pos) { - const auto num_bits = numBitsCount(numeric_entity); - if (num_bits <= 7) + if (code_point < (1 << 7)) { - *(dst_pos++) = '\0' + (numeric_entity & 0x7F); + dst_pos[0] = (code_point & 0x7F); + ++dst_pos; } - else if (num_bits <= 11) + else if (code_point < (1 << 11)) { - *(dst_pos++) = '\0' + ((numeric_entity >> 6) & 0x1F) + 0xC0; - *(dst_pos++) = '\0' + (numeric_entity & 0x3F) + 0x80; + dst_pos[0] = ((code_point >> 6) & 0x1F) + 0xC0; + dst_pos[1] = (code_point & 0x3F) + 0x80; + dst_pos += 2; } - else if (num_bits <= 16) + else if (code_point < (1 << 16)) { - *(dst_pos++) = '\0' + ((numeric_entity >> 12) & 0x0F) + 0xE0; - *(dst_pos++) = '\0' + ((numeric_entity >> 6) & 0x3F) + 0x80; - *(dst_pos++) = '\0' + (numeric_entity & 0x3F) + 0x80; + dst_pos[0] = ((code_point >> 12) & 0x0F) + 0xE0; + dst_pos[1] = ((code_point >> 6) & 0x3F) + 0x80; + dst_pos[2] = (code_point & 0x3F) + 0x80; + dst_pos += 3; } else { - *(dst_pos++) = '\0' + ((numeric_entity >> 18) & 0x07) + 0xF0; - *(dst_pos++) = '\0' + ((numeric_entity >> 12) & 0x3F) + 0x80; - *(dst_pos++) = '\0' + ((numeric_entity >> 6) & 0x3F) + 0x80; - *(dst_pos++) = '\0' + (numeric_entity & 0x3F) + 0x80; + dst_pos[0] = ((code_point >> 18) & 0x07) + 0xF0; + dst_pos[1] = ((code_point >> 12) & 0x3F) + 0x80; + dst_pos[2] = ((code_point >> 6) & 0x3F) + 0x80; + dst_pos[3] = (code_point & 0x3F) + 0x80; + dst_pos += 4; } } - static int hexOrDecStrToInt(const char * src, const char * end, int base) + static bool isValidNumericEntity(const char * src, const char * end, uint32_t & code_point) { - int numeric_ans = 0; - int pos = 0; - if (base == 0x10) - { - while (src + pos != end) - { - numeric_ans = numeric_ans * 0x10 + static_cast(unhex(*(src + pos))); - ++pos; - } - } - else - { - while (src + pos != end) - { - numeric_ans = numeric_ans * base + (*(src + pos) - '0'); - ++pos; - } - } - return numeric_ans; - } - static int numBitsCount(int integer) - { - size_t num_bits = 0; - while (integer > 0) - { - ++num_bits; - integer >>= 1; - } - return num_bits; - } - static bool isValidNumeric(const char * src, const char * end) - { - int pos; - if (*src != '&' || *(src + 1) != '#' || (end - (src + 2) > max_legal_unicode_bits)) - { + if (src + strlen("#") >= end) return false; - } - if (*(src + 2) == 'x' || *(src + 2) == 'X') + + if (src[0] != '#' || (end - src > 1 + max_decimal_length_of_unicode_point)) + return false; + + if (src + 2 < end && (src[1] == 'x' || src[1] == 'X')) { - pos = 3; - while (src + pos != end) + src += 2; + for (; src < end; ++src) { - if (!isHexDigit(*(src + pos))) - { + if (!isHexDigit(*src)) return false; - } - ++pos; + code_point *= 16; + code_point += unhex(*src); } - return true; } else { - pos = 2; - while (src + pos != end) + src += 1; + for (; src < end; ++src) { - if (!isNumericASCII(*(src + pos))) - { + if (!isNumericASCII(*src)) return false; - } - ++pos; + code_point *= 10; + code_point += *src - '0'; } - return true; } + + return code_point <= max_legal_unicode_value; } }; From 8fe85b5fcbcc8a7427a7cc438845d3697ef00fa3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Jan 2021 00:33:54 +0300 Subject: [PATCH 15/16] Add stateful test --- tests/queries/1_stateful/00160_decode_xml_component.reference | 1 + tests/queries/1_stateful/00160_decode_xml_component.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/1_stateful/00160_decode_xml_component.reference create mode 100644 tests/queries/1_stateful/00160_decode_xml_component.sql diff --git a/tests/queries/1_stateful/00160_decode_xml_component.reference b/tests/queries/1_stateful/00160_decode_xml_component.reference new file mode 100644 index 00000000000..96234a2262e --- /dev/null +++ b/tests/queries/1_stateful/00160_decode_xml_component.reference @@ -0,0 +1 @@ +10601114492838968014 diff --git a/tests/queries/1_stateful/00160_decode_xml_component.sql b/tests/queries/1_stateful/00160_decode_xml_component.sql new file mode 100644 index 00000000000..0194eb330a9 --- /dev/null +++ b/tests/queries/1_stateful/00160_decode_xml_component.sql @@ -0,0 +1 @@ +SELECT sum(DISTINCT sipHash64(decodeXMLComponent(Title) AS decoded)) FROM test.hits WHERE Title != decoded; From f6f7ef65a2e07a3f0ff507f8bdb3eb18abb7bec3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Jan 2021 00:34:53 +0300 Subject: [PATCH 16/16] Add perf test --- tests/performance/url_hits.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/performance/url_hits.xml b/tests/performance/url_hits.xml index f0ad6a786e0..a699ef6ba97 100644 --- a/tests/performance/url_hits.xml +++ b/tests/performance/url_hits.xml @@ -32,6 +32,7 @@ extractURLParameters extractURLParameterNames decodeURLComponent + decodeXMLComponent cutWWW cutQueryString cutQueryStringAndFragment