2016-12-15 18:56:31 +00:00
|
|
|
#include <DB/Common/hex.h>
|
2014-08-22 00:57:20 +00:00
|
|
|
#include <DB/Functions/FunctionFactory.h>
|
|
|
|
#include <DB/Functions/FunctionsURL.h>
|
2017-03-10 17:52:36 +00:00
|
|
|
#include <DB/Functions/FunctionsStringSearch.h>
|
2016-12-15 17:55:12 +00:00
|
|
|
#include <common/find_first_symbols.h>
|
2014-08-22 00:57:20 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-12-15 19:35:32 +00:00
|
|
|
/// We assume that size of the dst buf isn't less than src_size.
|
|
|
|
static size_t decodeURL(const char * src, size_t src_size, char * dst)
|
2016-12-15 12:05:05 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
const char * src_prev_pos = src;
|
|
|
|
const char * src_curr_pos = src;
|
|
|
|
const char * const src_end = src + src_size;
|
|
|
|
char * dst_pos = dst;
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
src_curr_pos = find_first_symbols<'%'>(src_curr_pos, src_end);
|
|
|
|
|
|
|
|
if (src_curr_pos == src_end)
|
|
|
|
break;
|
|
|
|
else if (src_end - src_curr_pos < 3)
|
|
|
|
{
|
|
|
|
src_curr_pos = src_end;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
unsigned char high = char_to_digit_table[static_cast<unsigned char>(src_curr_pos[1])];
|
|
|
|
unsigned char low = char_to_digit_table[static_cast<unsigned char>(src_curr_pos[2])];
|
|
|
|
|
|
|
|
if (high != 0xFF && low != 0xFF)
|
|
|
|
{
|
|
|
|
unsigned char octet = (high << 4) + low;
|
|
|
|
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
|
|
|
|
|
|
|
*dst_pos = octet;
|
|
|
|
++dst_pos;
|
|
|
|
|
|
|
|
src_prev_pos = src_curr_pos + 3;
|
|
|
|
}
|
|
|
|
|
|
|
|
src_curr_pos += 3;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (src_prev_pos < src_curr_pos)
|
|
|
|
{
|
|
|
|
size_t bytes_to_copy = src_curr_pos - src_prev_pos;
|
|
|
|
memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy);
|
|
|
|
dst_pos += bytes_to_copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
return dst_pos - dst;
|
2016-12-15 12:05:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
size_t ExtractProtocol::getReserveLengthForElement()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
return makeStringView("https").size() + 1;
|
2016-12-15 12:05:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Extracts the scheme part of the URL stored in [data, data + size).
  * On success res_data/res_size describe the scheme (without the ':').
  * Otherwise the result is an empty range that starts at data.
  */
void ExtractProtocol::execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
    /// Start with the "nothing found" answer and widen it only on success.
    res_data = data;
    res_size = 0;

    StringView scheme = getURLScheme(StringView(data, size));
    if (scheme.empty())
        return;

    const Pos scheme_end = data + scheme.size();
    const Pos url_end = data + size;

    /// At least four bytes must follow the scheme.
    /// NOTE(review): presumably room for "://" plus one more character - only ':' is checked here.
    if (url_end - scheme_end < 4)
        return;

    if (*scheme_end == ':')
        res_size = scheme_end - data;
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Applies decodeURL to every element of a string column.
  * data/offsets describe the source: offsets[i] is the end position of the i-th element in data.
  * res_data/res_offsets receive the decoded column in the same layout.
  */
void DecodeURLComponentImpl::vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
    ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
    const size_t rows = offsets.size();

    /// Decoding never grows a string, so the source size is enough; trimmed at the end.
    res_data.resize(data.size());
    res_offsets.resize(rows);

    size_t src_begin = 0;
    size_t dst_written = 0;

    for (size_t row = 0; row < rows; ++row)
    {
        const size_t src_finish = offsets[row];

        const char * input = reinterpret_cast<const char *>(&data[src_begin]);
        char * output = reinterpret_cast<char *>(res_data.data() + dst_written);

        dst_written += decodeURL(input, src_finish - src_begin, output);
        res_offsets[row] = dst_written;

        src_begin = src_finish;
    }

    res_data.resize(dst_written);
}
|
|
|
|
|
|
|
|
|
2016-12-15 19:35:32 +00:00
|
|
|
void DecodeURLComponentImpl::constant(const std::string & str,
|
2017-04-01 07:20:54 +00:00
|
|
|
std::string & res_data)
|
2016-12-15 12:05:05 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
ColumnString src;
|
|
|
|
ColumnString dst;
|
|
|
|
src.insert(str);
|
2016-12-15 20:33:35 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
vector(src.getChars(), src.getOffsets(), dst.getChars(), dst.getOffsets());
|
2016-12-15 20:33:35 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
res_data = dst[0].get<String>();
|
2016-12-15 12:05:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void DecodeURLComponentImpl::vector_fixed(const ColumnString::Chars_t & data, size_t n,
|
2017-04-01 07:20:54 +00:00
|
|
|
ColumnString::Chars_t & res_data)
|
2016-12-15 12:05:05 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
|
2016-12-15 12:05:05 +00:00
|
|
|
}
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Name tags: each struct carries the SQL-visible name of one URL function
/// and is passed as a template argument to the function type declared for it.

/// Extraction of URL parts.
struct NameProtocol
{
    static constexpr const char * name = "protocol";
};

struct NameDomain
{
    static constexpr const char * name = "domain";
};

struct NameDomainWithoutWWW
{
    static constexpr const char * name = "domainWithoutWWW";
};

struct NameFirstSignificantSubdomain
{
    static constexpr const char * name = "firstSignificantSubdomain";
};

struct NameTopLevelDomain
{
    static constexpr const char * name = "topLevelDomain";
};

struct NamePath
{
    static constexpr const char * name = "path";
};

struct NamePathFull
{
    static constexpr const char * name = "pathFull";
};

struct NameQueryString
{
    static constexpr const char * name = "queryString";
};

struct NameFragment
{
    static constexpr const char * name = "fragment";
};

struct NameQueryStringAndFragment
{
    static constexpr const char * name = "queryStringAndFragment";
};

struct NameDecodeURLComponent
{
    static constexpr const char * name = "decodeURLComponent";
};

/// Removal of URL parts.
struct NameCutToFirstSignificantSubdomain
{
    static constexpr const char * name = "cutToFirstSignificantSubdomain";
};

struct NameCutWWW
{
    static constexpr const char * name = "cutWWW";
};

struct NameCutQueryString
{
    static constexpr const char * name = "cutQueryString";
};

struct NameCutFragment
{
    static constexpr const char * name = "cutFragment";
};

struct NameCutQueryStringAndFragment
{
    static constexpr const char * name = "cutQueryStringAndFragment";
};

/// Functions that take the parameter name as a second argument.
struct NameExtractURLParameter
{
    static constexpr const char * name = "extractURLParameter";
};

struct NameCutURLParameter
{
    static constexpr const char * name = "cutURLParameter";
};
|
2017-03-10 17:52:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
using FunctionProtocol = FunctionStringToString<ExtractSubstringImpl<ExtractProtocol>, NameProtocol> ;
|
|
|
|
using FunctionDomain = FunctionStringToString<ExtractSubstringImpl<ExtractDomain<false> >, NameDomain> ;
|
|
|
|
using FunctionDomainWithoutWWW = FunctionStringToString<ExtractSubstringImpl<ExtractDomain<true> >, NameDomainWithoutWWW>;
|
2017-03-10 17:52:36 +00:00
|
|
|
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, NameFirstSignificantSubdomain>;
|
2017-04-01 07:20:54 +00:00
|
|
|
using FunctionTopLevelDomain = FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain>, NameTopLevelDomain> ;
|
|
|
|
using FunctionPath = FunctionStringToString<ExtractSubstringImpl<ExtractPath>, NamePath> ;
|
|
|
|
using FunctionPathFull = FunctionStringToString<ExtractSubstringImpl<ExtractPathFull>, NamePathFull> ;
|
|
|
|
using FunctionQueryString = FunctionStringToString<ExtractSubstringImpl<ExtractQueryString<true> >, NameQueryString> ;
|
|
|
|
using FunctionFragment = FunctionStringToString<ExtractSubstringImpl<ExtractFragment<true> >, NameFragment> ;
|
2017-03-10 17:52:36 +00:00
|
|
|
using FunctionQueryStringAndFragment = FunctionStringToString<ExtractSubstringImpl<ExtractQueryStringAndFragment<true> >, NameQueryStringAndFragment>;
|
|
|
|
using FunctionDecodeURLComponent = FunctionStringToString<DecodeURLComponentImpl, NameDecodeURLComponent>;
|
|
|
|
|
|
|
|
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
using FunctionCutWWW = FunctionStringToString<CutSubstringImpl<ExtractWWW>, NameCutWWW> ;
|
|
|
|
using FunctionCutQueryString = FunctionStringToString<CutSubstringImpl<ExtractQueryString<false> >, NameCutQueryString> ;
|
|
|
|
using FunctionCutFragment = FunctionStringToString<CutSubstringImpl<ExtractFragment<false> >, NameCutFragment> ;
|
2017-03-10 17:52:36 +00:00
|
|
|
using FunctionCutQueryStringAndFragment = FunctionStringToString<CutSubstringImpl<ExtractQueryStringAndFragment<false> >, NameCutQueryStringAndFragment>;
|
|
|
|
|
|
|
|
using FunctionExtractURLParameter = FunctionsStringSearchToString<ExtractURLParameterImpl, NameExtractURLParameter>;
|
|
|
|
using FunctionCutURLParameter = FunctionsStringSearchToString<CutURLParameterImpl, NameCutURLParameter>;
|
|
|
|
using FunctionExtractURLParameters = FunctionTokens<ExtractURLParametersImpl>;
|
|
|
|
using FunctionExtractURLParameters = FunctionTokens<ExtractURLParametersImpl>;
|
|
|
|
using FunctionURLHierarchy = FunctionTokens<URLHierarchyImpl>;
|
|
|
|
using FunctionURLPathHierarchy = FunctionTokens<URLPathHierarchyImpl>;
|
|
|
|
using FunctionExtractURLParameterNames = FunctionTokens<ExtractURLParameterNamesImpl>;
|
|
|
|
|
2016-12-15 12:05:05 +00:00
|
|
|
|
2014-08-22 00:57:20 +00:00
|
|
|
/// Registers every URL function declared in this file in the given factory.
/// The registration order is kept exactly as it was.
void registerFunctionsURL(FunctionFactory & factory)
{
    /// Parts of the URL.
    factory.registerFunction<FunctionProtocol>();
    factory.registerFunction<FunctionDomain>();
    factory.registerFunction<FunctionDomainWithoutWWW>();
    factory.registerFunction<FunctionFirstSignificantSubdomain>();
    factory.registerFunction<FunctionTopLevelDomain>();
    factory.registerFunction<FunctionPath>();
    factory.registerFunction<FunctionPathFull>();
    factory.registerFunction<FunctionQueryString>();
    factory.registerFunction<FunctionFragment>();
    factory.registerFunction<FunctionQueryStringAndFragment>();

    /// URL parameters and hierarchies.
    factory.registerFunction<FunctionExtractURLParameter>();
    factory.registerFunction<FunctionExtractURLParameters>();
    factory.registerFunction<FunctionExtractURLParameterNames>();
    factory.registerFunction<FunctionURLHierarchy>();
    factory.registerFunction<FunctionURLPathHierarchy>();

    /// The URL with a part removed.
    factory.registerFunction<FunctionCutToFirstSignificantSubdomain>();
    factory.registerFunction<FunctionCutWWW>();
    factory.registerFunction<FunctionCutQueryString>();
    factory.registerFunction<FunctionCutFragment>();
    factory.registerFunction<FunctionCutQueryStringAndFragment>();
    factory.registerFunction<FunctionCutURLParameter>();

    /// Percent-decoding.
    factory.registerFunction<FunctionDecodeURLComponent>();
}
|
|
|
|
|
|
|
|
}
|