From 8359289283f98ba32d88ce87fba2e38156a0af80 Mon Sep 17 00:00:00 2001 From: nautaa <870284156@qq.com> Date: Sat, 9 Jan 2021 19:08:19 +0800 Subject: [PATCH] add unicode decode --- src/Functions/decodeXMLComponent.cpp | 189 +++++++++++++----- .../0_stateless/01621_decode_XML.reference | 17 +- .../queries/0_stateless/01621_decode_XML.sql | 18 +- 3 files changed, 150 insertions(+), 74 deletions(-) diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index a4cad7834b2..8c46976a718 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -1,9 +1,10 @@ #include #include #include +#include #include -#include +#include namespace DB { namespace ErrorCodes @@ -54,8 +55,8 @@ namespace } private: - static const int min_XML_number = 32; - static const int max_XML_number = 126; + static const int max_legal_unicode_value = 0x10FFFF; + static const int max_legal_unicode_bits = 7; static size_t execute(const char * src, size_t src_size, char * dst) { const char * src_prev_pos = src; @@ -80,6 +81,32 @@ namespace src_curr_pos = src_end; break; } + else if (isValidNumeric(src_curr_pos, src_next_pos)) + { + std::vector decodeNumericChars; + decodeNumericPart(src_curr_pos + 2, src_next_pos, decodeNumericChars); + if (decodeNumericChars.empty()) + { + ++src_curr_pos; + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + src_prev_pos = src_curr_pos; + } + else + { + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + for (size_t i = 0; i < decodeNumericChars.size(); i++) + { + *dst_pos = decodeNumericChars[i]; + ++dst_pos; + } + src_prev_pos = src_next_pos + 1; + } + src_curr_pos = src_next_pos + 1; + } else if (src_next_pos - src_curr_pos == 3) { if (strncmp(src_curr_pos, "<", 3) == 0) @@ -122,26 +149,6 @@ namespace ++dst_pos; src_prev_pos = src_curr_pos + 5; } - else if (*(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3))) - { - char numeric_character = decodeNumberPart(src_curr_pos + 2); - if (numeric_character == '\0') - { - size_t bytes_to_copy = src_next_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos + 5; - } - else - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '\0' + numeric_character; - ++dst_pos; - src_prev_pos = src_curr_pos + 5; - } - } else { ++src_curr_pos; @@ -173,28 +180,6 @@ namespace ++dst_pos; src_prev_pos = src_curr_pos + 6; } - else if ( - *(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3)) - && isdigit(*(src_curr_pos + 4))) - { - char numeric_character = decodeNumberPart(src_curr_pos + 2); - if (numeric_character == '\0') - { - size_t bytes_to_copy = src_next_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - src_prev_pos = src_curr_pos + 6; - } - else - { - size_t bytes_to_copy = src_curr_pos - src_prev_pos; - memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); - dst_pos += bytes_to_copy; - *dst_pos = '\0' + numeric_character; - ++dst_pos; - src_prev_pos = src_curr_pos + 6; - } - } else { ++src_curr_pos; @@ -227,14 +212,118 @@ namespace return dst_pos - dst; } - static inline char decodeNumberPart(const char * src) + static void decodeNumericPart(const char * src, const char * end, std::vector & decodeNumericChars) { - auto numberic_ans = strtol(src, nullptr, 10); - if (numberic_ans >= min_XML_number && numberic_ans <= max_XML_number) + int numeric_ans; + if (*src == 'x' || *src == 'X') { - return '\0' + numberic_ans; + numeric_ans = hexOrDecStrToInt(src + 1, end, 16); + } + else + { + numeric_ans = hexOrDecStrToInt(src, end, 10); + } + const auto num_bits = numBitsCount(numeric_ans); + if (num_bits <= 7) + { + decodeNumericChars.push_back('\0' + (numeric_ans & 0x7F)); + } + else if (num_bits <= 11) + { + decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x1F) + 0xC0); + decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + } + else if (num_bits <= 16) + { + decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x0F) + 0xE0); + decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80); + decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + } + else if ((num_bits <= 21) && (numeric_ans <= max_legal_unicode_value)) + { + decodeNumericChars.push_back('\0' + ((numeric_ans >> 18) & 0x07) + 0xF0); + decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x3F) + 0x80); + decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80); + decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80); + } + } + + static int hexOrDecStrToInt(const char * src, const char * end, int base) + { + int numeric_ans = 0; + int pos = 0; + if (base == 16) + { + while (src + pos != end) + { + if (isNumericASCII(*(src + pos))) + { + numeric_ans = numeric_ans * base + (*(src + pos) - '0'); + } + else if (*(src + pos) >= 'a' && *(src + pos) <= 'f') + { + numeric_ans = numeric_ans * base + (*(src + pos) - 'a' + 10); + } + else if (*(src + pos) >= 'A' && *(src + pos) <= 'F') + { + numeric_ans = numeric_ans * base + (*(src + pos) - 'A' + 10); + } + ++pos; + } + } + else + { + while (src + pos != end) + { + numeric_ans = numeric_ans * base + (*(src + pos) - '0'); + ++pos; + } + } + return numeric_ans; + } + static int numBitsCount(int integer) + { + size_t num_bits = 0; + while (integer > 0) + { + ++num_bits; + integer >>= 1; + } + return num_bits; + } + static bool isValidNumeric(const char * src, const char * end) + { + int pos; + if (*src != '&' || *(src + 1) != '#' || (end - (src + 2) > max_legal_unicode_bits)) + { + return false; + } + if (*(src + 2) == 'x' || *(src + 2) == 'X') + { + pos = 3; + while (src + pos != end) + { + if (!isHexDigit(*(src + pos))) + { + return false; + } + ++pos; + } + return true; + } + else + { + pos = 2; + while (src + pos != end) + { + if (!isNumericASCII(*(src + pos))) + { + return false; + } + ++pos; + } + return true; } - return '\0'; } }; diff --git a/tests/queries/0_stateless/01621_decode_XML.reference b/tests/queries/0_stateless/01621_decode_XML.reference index 854f453ca10..3463fa0788c 100644 --- a/tests/queries/0_stateless/01621_decode_XML.reference +++ b/tests/queries/0_stateless/01621_decode_XML.reference @@ -8,17 +8,10 @@ Hello, &a;& world Hello, <t;& world Hello, <t& world Hello, &t;& world -� -  - !"#$%&\'()*+,-./012 + !"#$%&\'()*+,-./012 )*+,-./0123456789:;< =>?@ABCDEFGHIJKLMNOP -QRSTUVWXYZ[\\]^_`abcd -efghijklmnopqrstuvwx -yz{|}~€‚ƒ„…†‡ˆ‰Š‹Œ -Ž‘’“”•–—˜™š›œžŸ  -¡¢£¤¥¦§¨©ª«¬­®¯°±²³´ -µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈ -ÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜ -ÝÞßàáâãäåæçèéêëìíîïð -ñòóôõö÷øùúûüýþÿĀ +为什么 +为什么 +�\'123 +ЦЦЮЮЫㄱ diff --git a/tests/queries/0_stateless/01621_decode_XML.sql b/tests/queries/0_stateless/01621_decode_XML.sql index 04319ad1759..b111520db4c 100644 --- a/tests/queries/0_stateless/01621_decode_XML.sql +++ b/tests/queries/0_stateless/01621_decode_XML.sql @@ -10,17 +10,11 @@ SELECT decodeXMLComponent('Hello, <t& world'); SELECT decodeXMLComponent('Hello, &t;& world'); --decode numeric entities -SELECT decodeXMLComponent('� '); -SELECT decodeXMLComponent(' '); -SELECT decodeXMLComponent(' !"#$%&'()*+,-./012'); + +SELECT decodeXMLComponent(' !"#$%&'()*+,-./012'); SELECT decodeXMLComponent(')*+,-./0123456789:;<'); SELECT decodeXMLComponent('=>?@ABCDEFGHIJKLMNOP'); -SELECT decodeXMLComponent('QRSTUVWXYZ[\]^_`abcd'); -SELECT decodeXMLComponent('efghijklmnopqrstuvwx'); -SELECT decodeXMLComponent('yz{|}~€‚ƒ„…†‡ˆ‰Š‹Œ'); -SELECT decodeXMLComponent('Ž‘’“”•–—˜™š›œžŸ '); -SELECT decodeXMLComponent('¡¢£¤¥¦§¨©ª«¬­®¯°±²³´'); -SELECT decodeXMLComponent('µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈ'); -SELECT decodeXMLComponent('ÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜ'); -SELECT decodeXMLComponent('ÝÞßàáâãäåæçèéêëìíîïð'); -SELECT decodeXMLComponent('ñòóôõö÷øùúûüýþÿĀ'); \ No newline at end of file +SELECT decodeXMLComponent('为'); +SELECT decodeXMLComponent('为'); +SELECT decodeXMLComponent('�'123'); +SELECT decodeXMLComponent('ЦЦЮЮЫㄱ'); \ No newline at end of file