mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
implement unquoteUrl
This commit is contained in:
parent
396ccccfce
commit
99cec8b165
@ -2,6 +2,10 @@
|
||||
|
||||
#include <DB/Common/StringView.h>
|
||||
|
||||
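/// Percent-decodes url data: %XX escapes with a value below 127 are replaced by the byte they encode; all other text is kept as is.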
|
||||
std::string decodeUrl(const StringView& url);
|
||||
|
||||
|
||||
/// Extracts the scheme from the given url.
|
||||
StringView getUrlScheme(const StringView& url);
|
||||
|
||||
|
@ -971,6 +971,47 @@ struct CutSubstringImpl
|
||||
};
|
||||
|
||||
|
||||
struct UrlDecodeImpl
|
||||
{
|
||||
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
||||
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
||||
{
|
||||
res_data.reserve(data.size());
|
||||
size_t size = offsets.size();
|
||||
res_offsets.resize(size);
|
||||
|
||||
size_t prev_offset = 0;
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
std::string url = decodeUrl(StringView(current, offsets[i] - prev_offset - 1));
|
||||
|
||||
res_data.resize(res_data.size() + url.size() + 1);
|
||||
memcpy(&res_data[res_offset], url.data(), url.size());
|
||||
res_offset += url.size() + 1;
|
||||
res_data[res_offset - 1] = 0;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
prev_offset = offsets[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void constant(const std::string & data,
|
||||
std::string & res_data)
|
||||
{
|
||||
res_data = decodeUrl(data);
|
||||
}
|
||||
|
||||
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
|
||||
ColumnString::Chars_t & res_data)
|
||||
{
|
||||
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct NameProtocol { static constexpr auto name = "protocol"; };
|
||||
struct NameDomain { static constexpr auto name = "domain"; };
|
||||
struct NameDomainWithoutWWW { static constexpr auto name = "domainWithoutWWW"; };
|
||||
@ -981,6 +1022,7 @@ struct NamePathFull { static constexpr auto name = "pathFull"; };
|
||||
struct NameQueryString { static constexpr auto name = "queryString"; };
|
||||
struct NameFragment { static constexpr auto name = "fragment"; };
|
||||
struct NameQueryStringAndFragment { static constexpr auto name = "queryStringAndFragment"; };
|
||||
/// Name tag for the `unquoteUrl` function; supplied to FunctionStringToString by FunctionUnquoteUrl.
struct NameUnquoteUrl { static constexpr auto name = "unquoteUrl"; };
|
||||
|
||||
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
|
||||
|
||||
@ -1002,6 +1044,7 @@ using FunctionPathFull = FunctionStringToString<ExtractSubstringImpl<ExtractPath
|
||||
using FunctionQueryString = FunctionStringToString<ExtractSubstringImpl<ExtractQueryString<true> >, NameQueryString> ;
|
||||
using FunctionFragment = FunctionStringToString<ExtractSubstringImpl<ExtractFragment<true> >, NameFragment> ;
|
||||
using FunctionQueryStringAndFragment = FunctionStringToString<ExtractSubstringImpl<ExtractQueryStringAndFragment<true> >, NameQueryStringAndFragment>;
|
||||
/// The `unquoteUrl` function: applies UrlDecodeImpl (percent-decoding) through the FunctionStringToString adaptor.
using FunctionUnquoteUrl = FunctionStringToString<UrlDecodeImpl, NameUnquoteUrl>;
|
||||
|
||||
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;
|
||||
|
||||
|
@ -1,6 +1,59 @@
|
||||
#include <DB/Common/StringUtils.h>
|
||||
#include <DB/Common/UrlUtils.h>
|
||||
|
||||
/// Maps a byte (used as an index, 0..255) to the value of the hex digit it represents:
/// '0'-'9' -> 0..9, 'A'-'F' and 'a'-'f' -> 10..15. Every other byte maps to 0xFF ("not a hex digit").
/// One row below covers 16 consecutive byte values; the row's first byte value is given in the comment.
const char* const char2DigitTable =
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x00
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x10
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x20
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x30: '0'-'9'
    "\xFF\x0A\x0B\x0C\x0D\x0E\x0F\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x40: 'A'-'F'
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x50
    "\xFF\x0A\x0B\x0C\x0D\x0E\x0F\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x60: 'a'-'f'
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x70
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x80
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0x90
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0xA0
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0xB0
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0xC0
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0xD0
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"  // 0xE0
    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"; // 0xF0
|
||||
|
||||
std::string decodeUrl(const StringView& url)
|
||||
{
|
||||
const char* p = url.data();
|
||||
const char* st = url.data();
|
||||
const char* end = url.data() + url.size();
|
||||
std::string result;
|
||||
|
||||
for (; p < end; ++p)
|
||||
{
|
||||
if (*p != '%' || end - p < 3)
|
||||
continue;
|
||||
|
||||
unsigned char h = char2DigitTable[static_cast<unsigned char>(p[1])];
|
||||
unsigned char l = char2DigitTable[static_cast<unsigned char>(p[2])];
|
||||
|
||||
if (h != 0xFF && l != 0xFF)
|
||||
{
|
||||
unsigned char digit = (h << 4) + l;
|
||||
|
||||
if (digit < 127) {
|
||||
result.append(st, p - st + 1);
|
||||
result.back() = digit;
|
||||
st = p + 3;
|
||||
}
|
||||
}
|
||||
|
||||
p += 2;
|
||||
}
|
||||
|
||||
if (st == url.data())
|
||||
return std::string(url.data(), url.size());
|
||||
if (st < p)
|
||||
result.append(st, p - st);
|
||||
return result;
|
||||
}
|
||||
|
||||
StringView getUrlScheme(const StringView& url)
|
||||
{
|
||||
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
|
||||
|
@ -27,6 +27,7 @@ void registerFunctionsURL(FunctionFactory & factory)
|
||||
factory.registerFunction<FunctionCutFragment>();
|
||||
factory.registerFunction<FunctionCutQueryStringAndFragment>();
|
||||
factory.registerFunction<FunctionCutURLParameter>();
|
||||
factory.registerFunction<FunctionUnquoteUrl>();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -12,3 +12,4 @@ com
|
||||
|
||||
ru
|
||||
ru
|
||||
http://127.0.0.1/?query=hello world+foo+bar
|
||||
|
@ -14,3 +14,5 @@ SELECT topLevelDomain('http://paul@www.example.com:80/') AS Domain;
|
||||
SELECT topLevelDomain('http://127.0.0.1:443/') AS Domain;
|
||||
SELECT topLevelDomain('svn+ssh://example.ru?q=hello%20world') AS Domain;
|
||||
SELECT topLevelDomain('svn+ssh://example.ru.?q=hello%20world') AS Domain;
|
||||
|
||||
SELECT unquoteUrl('http://127.0.0.1/?query=hello%20world+foo%2Bbar') AS Url;
|
||||
|
Loading…
Reference in New Issue
Block a user