implement unquoteUrl

This commit is contained in:
artpaul 2016-12-11 02:04:58 +05:00
parent 396ccccfce
commit 99cec8b165
6 changed files with 104 additions and 0 deletions

View File

@ -2,6 +2,10 @@
#include <DB/Common/StringView.h>
/// Percent decode of url data: every "%XX" escape, where XX are two hexadecimal digits
/// (either case) and the encoded value is below 127, is replaced by the byte it encodes.
/// All other input, including malformed or truncated escapes, is returned unchanged.
std::string decodeUrl(const StringView& url);
/// Extracts scheme from given url.
StringView getUrlScheme(const StringView& url);

View File

@ -971,6 +971,47 @@ struct CutSubstringImpl
};
/** Implementation struct that applies decodeUrl() to string data.
  * Provides the vector / constant / vector_fixed entry points used by FunctionStringToString.
  */
struct UrlDecodeImpl
{
    /** Decodes every row of a string column.
      * data / offsets         - source characters and the end offset of each row
      *                          (each row is followed by a terminating zero byte);
      * res_data / res_offsets - receive the decoded rows in the same layout.
      */
    static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
        ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
    {
        const size_t rows = offsets.size();

        /// The decoded text is never longer than the source.
        res_data.reserve(data.size());
        res_offsets.resize(rows);

        size_t src_begin = 0;   /// where the current source row starts
        size_t dst_end = 0;     /// where the next decoded row will be written

        for (size_t row = 0; row < rows; ++row)
        {
            const size_t src_end = offsets[row];
            const char * encoded = reinterpret_cast<const char *>(&data[src_begin]);

            /// "- 1" drops the terminating zero byte of the source row.
            const std::string decoded = decodeUrl(StringView(encoded, src_end - src_begin - 1));

            /// Make room for the decoded bytes plus a terminating zero.
            res_data.resize(res_data.size() + decoded.size() + 1);
            memcpy(&res_data[dst_end], decoded.data(), decoded.size());

            dst_end += decoded.size();
            res_data[dst_end] = 0;
            ++dst_end;

            res_offsets[row] = dst_end;
            src_begin = src_end;
        }
    }

    /// Decodes a single constant string.
    static void constant(const std::string & data,
        std::string & res_data)
    {
        res_data = decodeUrl(data);
    }

    /// FixedString columns are not handled: always throws.
    static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
        ColumnString::Chars_t & res_data)
    {
        throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
    }
};
/// Name tag holding the string "protocol".
struct NameProtocol
{
    static constexpr auto name = "protocol";
};

/// Name tag holding the string "domain".
struct NameDomain
{
    static constexpr auto name = "domain";
};

/// Name tag holding the string "domainWithoutWWW".
struct NameDomainWithoutWWW
{
    static constexpr auto name = "domainWithoutWWW";
};
@ -981,6 +1022,7 @@ struct NamePathFull { static constexpr auto name = "pathFull"; };
/// Name tag holding the string "queryString".
struct NameQueryString
{
    static constexpr auto name = "queryString";
};

/// Name tag holding the string "fragment".
struct NameFragment
{
    static constexpr auto name = "fragment";
};

/// Name tag holding the string "queryStringAndFragment".
struct NameQueryStringAndFragment
{
    static constexpr auto name = "queryStringAndFragment";
};

/// Name tag holding the string "unquoteUrl".
struct NameUnquoteUrl
{
    static constexpr auto name = "unquoteUrl";
};

/// Name tag holding the string "cutToFirstSignificantSubdomain".
struct NameCutToFirstSignificantSubdomain
{
    static constexpr auto name = "cutToFirstSignificantSubdomain";
};
@ -1002,6 +1044,7 @@ using FunctionPathFull = FunctionStringToString<ExtractSubstringImpl<ExtractPath
/// Each alias below instantiates FunctionStringToString with an implementation struct
/// (first template argument) and a name tag struct (second template argument).
using FunctionQueryString = FunctionStringToString<ExtractSubstringImpl<ExtractQueryString<true> >, NameQueryString> ;
using FunctionFragment = FunctionStringToString<ExtractSubstringImpl<ExtractFragment<true> >, NameFragment> ;
using FunctionQueryStringAndFragment = FunctionStringToString<ExtractSubstringImpl<ExtractQueryStringAndFragment<true> >, NameQueryStringAndFragment>;
/// Percent-decoding function: implemented by UrlDecodeImpl, named by NameUnquoteUrl.
using FunctionUnquoteUrl = FunctionStringToString<UrlDecodeImpl, NameUnquoteUrl>;
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;

View File

@ -1,6 +1,59 @@
#include <DB/Common/StringUtils.h>
#include <DB/Common/UrlUtils.h>
/// Lookup table with one entry per byte value (16 rows of 16 entries).
/// The entry for a hexadecimal digit character is its numeric value (0x00..0x0f);
/// the entry for every other byte is 0xff, which marks "not a hex digit".
const char* const char2DigitTable = ("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff" // '0'-'9'
"\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" // 'A'-'F' only; 'G'-'O' are not hex digits
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" // 'a'-'f' only; 'g'-'o' are not hex digits
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff");
/** Percent-decodes url data.
  *
  * Every "%XX" escape, where XX are two hexadecimal digits (either case) and the
  * encoded value is below 127, is replaced by the byte it encodes. Escapes with a
  * value of 127 or above, malformed escapes and truncated escapes at the end of
  * the input are copied to the result unchanged.
  *
  * url - the text to decode.
  * Returns the decoded text; when nothing was decoded it is a plain copy of the input.
  */
std::string decodeUrl(const StringView& url)
{
    const char* const begin = url.data();
    const char* const end = begin + url.size();

    const char* p = begin;
    /// Start of the part of the input that has not been copied to the result yet.
    const char* st = begin;

    std::string result;

    for (; p < end; ++p)
    {
        /// An escape needs '%' plus two more characters.
        if (*p != '%' || end - p < 3)
            continue;

        unsigned char h = char2DigitTable[static_cast<unsigned char>(p[1])];
        unsigned char l = char2DigitTable[static_cast<unsigned char>(p[2])];

        /// Not a valid escape. Advance by a single character only: one of the next two
        /// characters may itself be a '%' that starts a valid escape (e.g. "%%41").
        if (h == 0xFF || l == 0xFF)
            continue;

        unsigned char digit = (h << 4) + l;
        if (digit < 127)
        {
            /// Flush the literal text preceding the escape, then the decoded byte.
            result.append(st, p - st);
            result.push_back(static_cast<char>(digit));
            st = p + 3;
        }

        /// Both following characters are hex digits, so neither of them can start an escape.
        p += 2;
    }

    /// Nothing was decoded: return the input as is.
    if (st == begin)
        return std::string(begin, url.size());

    /// Copy whatever follows the last decoded escape.
    if (st < end)
        result.append(st, end - st);

    return result;
}
StringView getUrlScheme(const StringView& url)
{
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )

View File

@ -27,6 +27,7 @@ void registerFunctionsURL(FunctionFactory & factory)
factory.registerFunction<FunctionCutFragment>();
factory.registerFunction<FunctionCutQueryStringAndFragment>();
factory.registerFunction<FunctionCutURLParameter>();
factory.registerFunction<FunctionUnquoteUrl>();
}
}

View File

@ -12,3 +12,4 @@ com
ru
ru
http://127.0.0.1/?query=hello world+foo+bar

View File

@ -14,3 +14,5 @@ SELECT topLevelDomain('http://paul@www.example.com:80/') AS Domain;
SELECT topLevelDomain('http://127.0.0.1:443/') AS Domain;
SELECT topLevelDomain('svn+ssh://example.ru?q=hello%20world') AS Domain;
SELECT topLevelDomain('svn+ssh://example.ru.?q=hello%20world') AS Domain;
SELECT unquoteUrl('http://127.0.0.1/?query=hello%20world+foo%2Bbar') AS Url;