add unicode decode

This commit is contained in:
nautaa 2021-01-09 19:08:19 +08:00
parent 6ecf505f48
commit 8359289283
3 changed files with 150 additions and 74 deletions

View File

@ -1,9 +1,10 @@
#include <Columns/ColumnString.h> #include <Columns/ColumnString.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h> #include <Functions/FunctionStringToString.h>
#include <Common/StringUtils/StringUtils.h>
#include <common/find_symbols.h> #include <common/find_symbols.h>
#include <cstdio> #include <vector>
namespace DB namespace DB
{ {
namespace ErrorCodes namespace ErrorCodes
@ -54,8 +55,8 @@ namespace
} }
private: private:
static const int min_XML_number = 32; static const int max_legal_unicode_value = 0x10FFFF;
static const int max_XML_number = 126; static const int max_legal_unicode_bits = 7;
static size_t execute(const char * src, size_t src_size, char * dst) static size_t execute(const char * src, size_t src_size, char * dst)
{ {
const char * src_prev_pos = src; const char * src_prev_pos = src;
@ -80,6 +81,32 @@ namespace
src_curr_pos = src_end; src_curr_pos = src_end;
break; break;
} }
else if (isValidNumeric(src_curr_pos, src_next_pos))
{
std::vector<char> decodeNumericChars;
decodeNumericPart(src_curr_pos + 2, src_next_pos, decodeNumericChars);
if (decodeNumericChars.empty())
{
++src_curr_pos;
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
dst_pos += bytes_to_copy;
src_prev_pos = src_curr_pos;
}
else
{
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
dst_pos += bytes_to_copy;
for (size_t i = 0; i < decodeNumericChars.size(); i++)
{
*dst_pos = decodeNumericChars[i];
++dst_pos;
}
src_prev_pos = src_next_pos + 1;
}
src_curr_pos = src_next_pos + 1;
}
else if (src_next_pos - src_curr_pos == 3) else if (src_next_pos - src_curr_pos == 3)
{ {
if (strncmp(src_curr_pos, "&lt", 3) == 0) if (strncmp(src_curr_pos, "&lt", 3) == 0)
@ -122,26 +149,6 @@ namespace
++dst_pos; ++dst_pos;
src_prev_pos = src_curr_pos + 5; src_prev_pos = src_curr_pos + 5;
} }
else if (*(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3)))
{
char numeric_character = decodeNumberPart(src_curr_pos + 2);
if (numeric_character == '\0')
{
size_t bytes_to_copy = src_next_pos - src_prev_pos;
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
dst_pos += bytes_to_copy;
src_prev_pos = src_curr_pos + 5;
}
else
{
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
dst_pos += bytes_to_copy;
*dst_pos = '\0' + numeric_character;
++dst_pos;
src_prev_pos = src_curr_pos + 5;
}
}
else else
{ {
++src_curr_pos; ++src_curr_pos;
@ -173,28 +180,6 @@ namespace
++dst_pos; ++dst_pos;
src_prev_pos = src_curr_pos + 6; src_prev_pos = src_curr_pos + 6;
} }
else if (
*(src_curr_pos + 1) == '#' && isdigit(*(src_curr_pos + 2)) && isdigit(*(src_curr_pos + 3))
&& isdigit(*(src_curr_pos + 4)))
{
char numeric_character = decodeNumberPart(src_curr_pos + 2);
if (numeric_character == '\0')
{
size_t bytes_to_copy = src_next_pos - src_prev_pos;
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
dst_pos += bytes_to_copy;
src_prev_pos = src_curr_pos + 6;
}
else
{
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
dst_pos += bytes_to_copy;
*dst_pos = '\0' + numeric_character;
++dst_pos;
src_prev_pos = src_curr_pos + 6;
}
}
else else
{ {
++src_curr_pos; ++src_curr_pos;
@ -227,14 +212,118 @@ namespace
return dst_pos - dst; return dst_pos - dst;
} }
static inline char decodeNumberPart(const char * src) static void decodeNumericPart(const char * src, const char * end, std::vector<char> & decodeNumericChars)
{ {
auto numberic_ans = strtol(src, nullptr, 10); int numeric_ans;
if (numberic_ans >= min_XML_number && numberic_ans <= max_XML_number) if (*src == 'x' || *src == 'X')
{ {
return '\0' + numberic_ans; numeric_ans = hexOrDecStrToInt(src + 1, end, 16);
}
else
{
numeric_ans = hexOrDecStrToInt(src, end, 10);
}
const auto num_bits = numBitsCount(numeric_ans);
if (num_bits <= 7)
{
decodeNumericChars.push_back('\0' + (numeric_ans & 0x7F));
}
else if (num_bits <= 11)
{
decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x1F) + 0xC0);
decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80);
}
else if (num_bits <= 16)
{
decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x0F) + 0xE0);
decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80);
decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80);
}
else if ((num_bits <= 21) && (numeric_ans <= max_legal_unicode_value))
{
decodeNumericChars.push_back('\0' + ((numeric_ans >> 18) & 0x07) + 0xF0);
decodeNumericChars.push_back('\0' + ((numeric_ans >> 12) & 0x3F) + 0x80);
decodeNumericChars.push_back('\0' + ((numeric_ans >> 6) & 0x3F) + 0x80);
decodeNumericChars.push_back('\0' + (numeric_ans & 0x3F) + 0x80);
}
}
/// Converts the characters in [src, end) to a non-negative integer.
///
/// @param src   first character of the digit sequence (after "&#" or "&#x").
/// @param end   one past the last digit (the position of the terminating ';').
/// @param base  16 for hexadecimal (digits 0-9, a-f, A-F); any other value is
///              treated as a decimal-style radix that accepts only 0-9. Callers
///              in this file pass 10 or 16.
/// @return the parsed value; 0 for an empty range; the largest `int` if the
///         value does not fit.
///
/// Characters that are not digits of the requested base are skipped in both
/// modes. Previously only the hex path skipped them, and the decimal path
/// folded arbitrary bytes into the result.
/// The accumulation saturates instead of overflowing `int`, so the function is
/// well-defined even when it is called without a prior length check.
static int hexOrDecStrToInt(const char * src, const char * end, int base)
{
    /// Largest value representable in `int`, computed without extra headers.
    constexpr int saturated = static_cast<int>(~0U >> 1);

    /// Value of `c` as a digit of `base`, or -1 if it is not one.
    const auto digit_value = [base](char c) -> int
    {
        if (c >= '0' && c <= '9')
            return c - '0';
        if (base == 16 && c >= 'a' && c <= 'f')
            return c - 'a' + 10;
        if (base == 16 && c >= 'A' && c <= 'F')
            return c - 'A' + 10;
        return -1;
    };

    /// Accumulate in a wider type so the overflow check itself cannot overflow.
    long long numeric_ans = 0;
    for (const char * pos = src; pos != end; ++pos)
    {
        const int digit = digit_value(*pos);
        if (digit < 0)
            continue;

        numeric_ans = numeric_ans * base + digit;
        if (numeric_ans > saturated)
            return saturated;
    }
    return static_cast<int>(numeric_ans);
}
/// Returns how many binary digits are needed to write `integer`, i.e. the
/// 1-based index of its highest set bit. Zero and negative inputs yield 0.
static int numBitsCount(int integer)
{
    int width = 0;
    /// Halving a positive value drops its lowest bit; count the steps to zero.
    for (int rest = integer; rest > 0; rest /= 2)
        ++width;
    return width;
}
static bool isValidNumeric(const char * src, const char * end)
{
int pos;
if (*src != '&' || *(src + 1) != '#' || (end - (src + 2) > max_legal_unicode_bits))
{
return false;
}
if (*(src + 2) == 'x' || *(src + 2) == 'X')
{
pos = 3;
while (src + pos != end)
{
if (!isHexDigit(*(src + pos)))
{
return false;
}
++pos;
}
return true;
}
else
{
pos = 2;
while (src + pos != end)
{
if (!isNumericASCII(*(src + pos)))
{
return false;
}
++pos;
}
return true;
} }
return '\0';
} }
}; };

View File

@ -8,17 +8,10 @@ Hello, &a;& world
Hello, &ltt;& world Hello, &ltt;& world
Hello, &ltt& world Hello, &ltt& world
Hello, &t;& world Hello, &t;& world
&#0;&#1;&#2;&#3;&#4;&#5;&#6;&#7;&#8;&#9;&#10 !"#$%&\'()*+,-./012
&#11&#12&#13&#14&#15&#16&#17&#18&#19&#20&#21&#22&#23&#24&#25&#26&#27&#28&#29&#30
&#31 !"#$%&\'()*+,-./012
)*+,-./0123456789:;< )*+,-./0123456789:;<
=>?@ABCDEFGHIJKLMNOP =>?@ABCDEFGHIJKLMNOP
QRSTUVWXYZ[\\]^_`abcd 为什么
efghijklmnopqrstuvwx 为什么
yz{|}~&#127&#128&#129&#130&#131&#132&#133&#134&#135&#136&#137&#138&#139&#140 &#12345678;\'123
&#141&#142&#143&#144&#145&#146&#147&#148&#149&#150&#151&#152&#153&#154&#155&#156&#157&#158&#159&#160 ЦЦЮЮЫㄱ
&#161&#162&#163&#164&#165&#166&#167&#168&#169&#170&#171&#172&#173&#174&#175&#176&#177&#178&#179&#180
&#181&#182&#183&#184&#185&#186&#187&#188&#189&#190&#191&#192&#193&#194&#195&#196&#197&#198&#199&#200
&#201&#202&#203&#204&#205&#206&#207&#208&#209&#210&#211&#212&#213&#214&#215&#216&#217&#218&#219&#220
&#221&#222&#223&#224&#225&#226&#227&#228&#229&#230&#231&#232&#233&#234&#235&#236&#237&#238&#239&#240
&#241&#242&#243&#244&#245&#246&#247&#248&#249&#250&#251&#252&#253&#254&#255&#256

View File

@ -10,17 +10,11 @@ SELECT decodeXMLComponent('Hello, &ltt&amp; world');
SELECT decodeXMLComponent('Hello, &t;&amp; world'); SELECT decodeXMLComponent('Hello, &t;&amp; world');
--decode numeric entities --decode numeric entities
SELECT decodeXMLComponent('&#0;&#1;&#2;&#3;&#4;&#5;&#6;&#7;&#8;&#9;&#10;');
SELECT decodeXMLComponent('&#11;&#12;&#13;&#14;&#15;&#16;&#17;&#18;&#19;&#20;&#21;&#22;&#23;&#24;&#25;&#26;&#27;&#28;&#29;&#30;'); SELECT decodeXMLComponent('&#32;&#33;&#34;&#35;&#36;&#37;&#38;&#39;&#40;&#41;&#42;&#43;&#44;&#45;&#46;&#47;&#48;&#49;&#50;');
SELECT decodeXMLComponent('&#31;&#32;&#33;&#34;&#35;&#36;&#37;&#38;&#39;&#40;&#41;&#42;&#43;&#44;&#45;&#46;&#47;&#48;&#49;&#50;');
SELECT decodeXMLComponent('&#41;&#42;&#43;&#44;&#45;&#46;&#47;&#48;&#49;&#50;&#51;&#52;&#53;&#54;&#55;&#56;&#57;&#58;&#59;&#60;'); SELECT decodeXMLComponent('&#41;&#42;&#43;&#44;&#45;&#46;&#47;&#48;&#49;&#50;&#51;&#52;&#53;&#54;&#55;&#56;&#57;&#58;&#59;&#60;');
SELECT decodeXMLComponent('&#61;&#62;&#63;&#64;&#65;&#66;&#67;&#68;&#69;&#70;&#71;&#72;&#73;&#74;&#75;&#76;&#77;&#78;&#79;&#80;'); SELECT decodeXMLComponent('&#61;&#62;&#63;&#64;&#65;&#66;&#67;&#68;&#69;&#70;&#71;&#72;&#73;&#74;&#75;&#76;&#77;&#78;&#79;&#80;');
SELECT decodeXMLComponent('&#81;&#82;&#83;&#84;&#85;&#86;&#87;&#88;&#89;&#90;&#91;&#92;&#93;&#94;&#95;&#96;&#97;&#98;&#99;&#100;'); SELECT decodeXMLComponent('&#20026;');
SELECT decodeXMLComponent('&#101;&#102;&#103;&#104;&#105;&#106;&#107;&#108;&#109;&#110;&#111;&#112;&#113;&#114;&#115;&#116;&#117;&#118;&#119;&#120;'); SELECT decodeXMLComponent('&#x4e3a;');
SELECT decodeXMLComponent('&#121;&#122;&#123;&#124;&#125;&#126;&#127;&#128;&#129;&#130;&#131;&#132;&#133;&#134;&#135;&#136;&#137;&#138;&#139;&#140;'); SELECT decodeXMLComponent('&#12345678;&apos;123');
SELECT decodeXMLComponent('&#141;&#142;&#143;&#144;&#145;&#146;&#147;&#148;&#149;&#150;&#151;&#152;&#153;&#154;&#155;&#156;&#157;&#158;&#159;&#160;'); SELECT decodeXMLComponent('&#x0426;&#X0426;&#x042E;&#X042e;&#x042B;&#x3131;');
SELECT decodeXMLComponent('&#161;&#162;&#163;&#164;&#165;&#166;&#167;&#168;&#169;&#170;&#171;&#172;&#173;&#174;&#175;&#176;&#177;&#178;&#179;&#180;');
SELECT decodeXMLComponent('&#181;&#182;&#183;&#184;&#185;&#186;&#187;&#188;&#189;&#190;&#191;&#192;&#193;&#194;&#195;&#196;&#197;&#198;&#199;&#200;');
SELECT decodeXMLComponent('&#201;&#202;&#203;&#204;&#205;&#206;&#207;&#208;&#209;&#210;&#211;&#212;&#213;&#214;&#215;&#216;&#217;&#218;&#219;&#220;');
SELECT decodeXMLComponent('&#221;&#222;&#223;&#224;&#225;&#226;&#227;&#228;&#229;&#230;&#231;&#232;&#233;&#234;&#235;&#236;&#237;&#238;&#239;&#240;');
SELECT decodeXMLComponent('&#241;&#242;&#243;&#244;&#245;&#246;&#247;&#248;&#249;&#250;&#251;&#252;&#253;&#254;&#255;&#256;');