2012-07-16 03:42:36 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <DataTypes/DataTypeString.h>
|
|
|
|
#include <Columns/ColumnString.h>
|
2018-01-15 19:07:47 +00:00
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2017-07-13 20:58:19 +00:00
|
|
|
#include <Common/typeid_cast.h>
|
2018-11-25 00:08:50 +00:00
|
|
|
#include <common/find_symbols.h>
|
2018-08-26 00:26:51 +00:00
|
|
|
#include <common/StringRef.h>
|
2017-07-21 06:35:58 +00:00
|
|
|
#include <Functions/FunctionHelpers.h>
|
2018-09-09 23:47:56 +00:00
|
|
|
#include <Functions/FunctionStringToString.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Functions/FunctionsStringArray.h>
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-03-12 11:09:25 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-10-20 05:21:49 +00:00
|
|
|
/** URL processing functions.
|
|
|
|
* These functions do not strictly follow the RFC; instead, they are maximally simplified for performance reasons.
|
2012-07-16 03:42:36 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Functions for extraction parts of URL.
|
|
|
|
* If the URL has no such part, an empty string is returned.
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2012-07-16 03:42:36 +00:00
|
|
|
* domain
|
|
|
|
* domainWithoutWWW
|
|
|
|
* topLevelDomain
|
|
|
|
* protocol
|
|
|
|
* path
|
|
|
|
* queryString
|
|
|
|
* fragment
|
|
|
|
* queryStringAndFragment
|
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Functions, removing parts from URL.
|
|
|
|
* If the URL has no such part, it is returned unchanged.
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2012-07-16 03:42:36 +00:00
|
|
|
* cutWWW
|
|
|
|
* cutFragment
|
|
|
|
* cutQueryString
|
|
|
|
* cutQueryStringAndFragment
|
2012-07-21 03:45:48 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
|
|
|
|
* If there are many parameters with same name - return value of first one. Value is not %-decoded.
|
2012-07-21 03:45:48 +00:00
|
|
|
*
|
2013-03-18 10:27:45 +00:00
|
|
|
* extractURLParameter(URL, name)
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Extract all parameters from URL in form of array of strings name=value.
|
2013-03-18 10:27:45 +00:00
|
|
|
* extractURLParameters(URL)
|
2013-08-02 13:55:43 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Extract names of all parameters from URL in form of array of strings.
|
2013-08-05 08:40:56 +00:00
|
|
|
* extractURLParameterNames(URL)
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Remove specified parameter from URL.
|
2013-03-18 10:27:45 +00:00
|
|
|
* cutURLParameter(URL, name)
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Get array of URL 'hierarchy' as in Yandex.Metrica tree-like reports. See docs.
|
2013-03-18 10:27:45 +00:00
|
|
|
* URLHierarchy(URL)
|
2012-07-16 03:42:36 +00:00
|
|
|
*/
|
|
|
|
|
2016-05-28 10:35:44 +00:00
|
|
|
using Pos = const char *;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2016-12-15 12:05:05 +00:00
|
|
|
|
|
|
|
/// Extracts scheme from given url.
|
2018-08-26 00:26:51 +00:00
|
|
|
inline StringRef getURLScheme(const char * data, size_t size)
|
2012-07-16 03:42:36 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
|
2018-08-26 00:26:51 +00:00
|
|
|
const char * pos = data;
|
|
|
|
const char * end = data + size;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
if (isAlphaASCII(*pos))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
for (++pos; pos < end; ++pos)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
if (!(isAlphaNumericASCII(*pos) || *pos == '+' || *pos == '-' || *pos == '.'))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
return StringRef(data, pos - data);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
return {};
|
2016-12-15 12:05:05 +00:00
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
|
|
|
|
|
2016-12-15 12:05:05 +00:00
|
|
|
/// Extracts host from given url.
|
2018-08-26 00:26:51 +00:00
|
|
|
inline StringRef getURLHost(const char * data, size_t size)
|
2016-12-15 12:05:05 +00:00
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
Pos pos = data;
|
|
|
|
Pos end = data + size;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (end == (pos = find_first_symbols<'/'>(pos, end)))
|
2018-08-26 00:26:51 +00:00
|
|
|
return {};
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
if (pos != data)
|
2018-02-19 11:49:49 +00:00
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
StringRef scheme = getURLScheme(data, size);
|
|
|
|
Pos scheme_end = data + scheme.size;
|
2018-02-19 11:49:49 +00:00
|
|
|
|
|
|
|
// Colon must follows after scheme.
|
2018-02-22 03:10:51 +00:00
|
|
|
if (pos - scheme_end != 1 || *scheme_end != ':')
|
2018-08-26 00:26:51 +00:00
|
|
|
return {};
|
2018-02-19 11:49:49 +00:00
|
|
|
}
|
|
|
|
|
2018-02-22 03:10:51 +00:00
|
|
|
if (end - pos < 2 || *(pos) != '/' || *(pos + 1) != '/')
|
2018-08-26 00:26:51 +00:00
|
|
|
return {};
|
|
|
|
pos += 2;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
const char * start_of_host = pos;
|
2018-02-19 11:49:49 +00:00
|
|
|
for (; pos < end; ++pos)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-02-19 11:49:49 +00:00
|
|
|
if (*pos == '@')
|
2018-02-22 03:10:51 +00:00
|
|
|
start_of_host = pos + 1;
|
2018-04-19 04:25:08 +00:00
|
|
|
else if (*pos == ':' || *pos == '/' || *pos == '?' || *pos == '#')
|
2017-04-01 07:20:54 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
return (pos == start_of_host) ? StringRef{} : StringRef(start_of_host, pos - start_of_host);
|
2016-12-15 12:05:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
struct ExtractProtocol
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement();
|
2016-12-15 12:05:05 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size);
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
template <bool without_www>
|
|
|
|
struct ExtractDomain
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 15; }
|
|
|
|
|
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
StringRef host = getURLHost(data, size);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
if (host.size == 0)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
|
|
|
|
host = { host.data + 4, host.size - 4 };
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
res_data = host.data;
|
|
|
|
res_size = host.size;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
2014-10-27 15:16:11 +00:00
|
|
|
struct ExtractFirstSignificantSubdomain
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 10; }
|
|
|
|
|
|
|
|
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
|
|
|
|
|
|
|
Pos tmp;
|
|
|
|
size_t domain_length;
|
|
|
|
ExtractDomain<true>::execute(data, size, tmp, domain_length);
|
|
|
|
|
|
|
|
if (domain_length == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (out_domain_end)
|
|
|
|
*out_domain_end = tmp + domain_length;
|
|
|
|
|
|
|
|
/// cut useless dot
|
|
|
|
if (tmp[domain_length - 1] == '.')
|
|
|
|
--domain_length;
|
|
|
|
|
|
|
|
res_data = tmp;
|
|
|
|
res_size = domain_length;
|
|
|
|
|
|
|
|
auto begin = tmp;
|
|
|
|
auto end = begin + domain_length;
|
|
|
|
const char * last_3_periods[3]{};
|
|
|
|
|
2018-08-27 13:56:53 +00:00
|
|
|
auto pos = find_first_symbols<'.'>(begin, end);
|
|
|
|
while (pos < end)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
last_3_periods[2] = last_3_periods[1];
|
|
|
|
last_3_periods[1] = last_3_periods[0];
|
|
|
|
last_3_periods[0] = pos;
|
2018-08-26 00:26:51 +00:00
|
|
|
pos = find_first_symbols<'.'>(pos + 1, end);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!last_3_periods[0])
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!last_3_periods[1])
|
|
|
|
{
|
|
|
|
res_size = last_3_periods[0] - begin;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!last_3_periods[2])
|
|
|
|
last_3_periods[2] = begin - 1;
|
|
|
|
|
2018-11-21 00:46:06 +00:00
|
|
|
size_t size_of_second_subdomain_plus_period = last_3_periods[0] - last_3_periods[1];
|
|
|
|
if (size_of_second_subdomain_plus_period == 4 || size_of_second_subdomain_plus_period == 3)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-11-21 00:46:06 +00:00
|
|
|
/// We will key by four bytes that are either ".xyz" or ".xy.".
|
|
|
|
UInt32 key = unalignedLoad<UInt32>(last_3_periods[1]);
|
|
|
|
|
|
|
|
/// NOTE: assuming little endian.
|
|
|
|
/// NOTE: does the compiler generate SIMD code?
|
|
|
|
/// NOTE: for larger amount of cases we can use a perfect hash table (see 'gperf' as an example).
|
|
|
|
if ( key == '.' + 'c' * 0x100U + 'o' * 0x10000U + 'm' * 0x1000000U
|
|
|
|
|| key == '.' + 'n' * 0x100U + 'e' * 0x10000U + 't' * 0x1000000U
|
|
|
|
|| key == '.' + 'o' * 0x100U + 'r' * 0x10000U + 'g' * 0x1000000U
|
|
|
|
|| key == '.' + 'b' * 0x100U + 'i' * 0x10000U + 'z' * 0x1000000U
|
|
|
|
|| key == '.' + 'g' * 0x100U + 'o' * 0x10000U + 'v' * 0x1000000U
|
|
|
|
|| key == '.' + 'm' * 0x100U + 'i' * 0x10000U + 'l' * 0x1000000U
|
|
|
|
|| key == '.' + 'e' * 0x100U + 'd' * 0x10000U + 'u' * 0x1000000U
|
|
|
|
|| key == '.' + 'c' * 0x100U + 'o' * 0x10000U + '.' * 0x1000000U)
|
|
|
|
{
|
|
|
|
res_data += last_3_periods[2] + 1 - begin;
|
|
|
|
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
|
|
|
return;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
res_data += last_3_periods[1] + 1 - begin;
|
|
|
|
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
|
|
|
}
|
2014-10-27 15:16:11 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct CutToFirstSignificantSubdomain
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 15; }
|
2014-10-27 15:16:11 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
2014-10-27 15:16:11 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos tmp_data;
|
|
|
|
size_t tmp_length;
|
|
|
|
Pos domain_end;
|
|
|
|
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
|
2014-10-27 15:16:11 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (tmp_length == 0)
|
|
|
|
return;
|
2014-10-27 15:16:11 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
res_data = tmp_data;
|
|
|
|
res_size = domain_end - tmp_data;
|
|
|
|
}
|
2014-10-27 15:16:11 +00:00
|
|
|
};
|
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
struct ExtractTopLevelDomain
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 5; }
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
StringRef host = getURLHost(data, size);
|
2016-12-09 22:49:21 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2018-08-26 00:26:51 +00:00
|
|
|
if (host.size != 0)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-08-26 00:26:51 +00:00
|
|
|
if (host.data[host.size - 1] == '.')
|
|
|
|
host.size -= 1;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
auto host_end = host.data + host.size;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
Pos last_dot = find_last_symbols_or_null<'.'>(host.data, host_end);
|
2017-04-01 07:20:54 +00:00
|
|
|
if (!last_dot)
|
|
|
|
return;
|
2018-11-25 00:08:50 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// For IPv4 addresses select nothing.
|
2017-04-01 07:20:54 +00:00
|
|
|
if (last_dot[1] <= '9')
|
|
|
|
return;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
res_data = last_dot + 1;
|
2018-11-25 00:08:50 +00:00
|
|
|
res_size = host_end - res_data;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct ExtractPath
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 25; }
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos pos = data;
|
|
|
|
Pos end = pos + size;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (end != (pos = find_first_symbols<'/'>(pos, end)) && pos[1] == '/' && end != (pos = find_first_symbols<'/'>(pos + 2, end)))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
Pos query_string_or_fragment = find_first_symbols<'?', '#'>(pos, end);
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
res_data = pos;
|
2018-11-25 00:08:50 +00:00
|
|
|
res_size = query_string_or_fragment - res_data;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
2014-12-05 13:31:48 +00:00
|
|
|
struct ExtractPathFull
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 30; }
|
|
|
|
|
|
|
|
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
|
|
|
|
|
|
|
Pos pos = data;
|
|
|
|
Pos end = pos + size;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (end != (pos = find_first_symbols<'/'>(pos, end)) && pos[1] == '/' && end != (pos = find_first_symbols<'/'>(pos + 2, end)))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
res_data = pos;
|
|
|
|
res_size = end - res_data;
|
|
|
|
}
|
|
|
|
}
|
2014-12-05 13:31:48 +00:00
|
|
|
};
|
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
template <bool without_leading_char>
|
|
|
|
struct ExtractQueryString
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 10; }
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos pos = data;
|
|
|
|
Pos end = pos + size;
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (end != (pos = find_first_symbols<'?'>(pos, end)))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
Pos fragment = find_first_symbols<'#'>(pos, end);
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
res_data = pos + (without_leading_char ? 1 : 0);
|
2018-11-25 00:08:50 +00:00
|
|
|
res_size = fragment - res_data;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
template <bool without_leading_char>
|
|
|
|
struct ExtractFragment
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 10; }
|
|
|
|
|
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
|
|
|
|
|
|
|
Pos pos = data;
|
|
|
|
Pos end = pos + size;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (end != (pos = find_first_symbols<'#'>(pos, end)))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
res_data = pos + (without_leading_char ? 1 : 0);
|
|
|
|
res_size = end - res_data;
|
|
|
|
}
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
template <bool without_leading_char>
|
|
|
|
struct ExtractQueryStringAndFragment
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getReserveLengthForElement() { return 20; }
|
|
|
|
|
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
|
|
|
|
|
|
|
Pos pos = data;
|
|
|
|
Pos end = pos + size;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (end != (pos = find_first_symbols<'?'>(pos, end)))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
res_data = pos + (without_leading_char ? 1 : 0);
|
|
|
|
res_size = end - res_data;
|
|
|
|
}
|
2018-11-25 00:08:50 +00:00
|
|
|
else if (end != (pos = find_first_symbols<'#'>(pos, end)))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
res_data = pos;
|
|
|
|
res_size = end - res_data;
|
|
|
|
}
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// With dot at the end.
|
2012-07-16 03:42:36 +00:00
|
|
|
struct ExtractWWW
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
|
|
{
|
|
|
|
res_data = data;
|
|
|
|
res_size = 0;
|
|
|
|
|
|
|
|
Pos pos = data;
|
|
|
|
Pos end = pos + size;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (end != (pos = find_first_symbols<'/'>(pos, end)))
|
2018-02-20 01:34:50 +00:00
|
|
|
{
|
|
|
|
if (pos != data)
|
|
|
|
{
|
|
|
|
Pos tmp;
|
|
|
|
size_t protocol_length;
|
|
|
|
ExtractProtocol::execute(data, size, tmp, protocol_length);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-02-20 01:34:50 +00:00
|
|
|
if (pos != data + protocol_length + 1)
|
|
|
|
return;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-02-22 03:10:51 +00:00
|
|
|
if (end - pos < 2 || *(pos) != '/' || *(pos + 1) != '/')
|
2018-02-20 01:34:50 +00:00
|
|
|
return;
|
|
|
|
|
2018-02-22 03:10:51 +00:00
|
|
|
const char *start_of_host = (pos += 2);
|
2018-02-20 01:34:50 +00:00
|
|
|
for (; pos < end; ++pos)
|
|
|
|
{
|
|
|
|
if (*pos == '@')
|
2018-02-22 03:10:51 +00:00
|
|
|
start_of_host = pos + 1;
|
2018-04-19 04:25:08 +00:00
|
|
|
else if (*pos == ':' || *pos == '/' || *pos == '?' || *pos == '#')
|
2018-02-20 01:34:50 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-02-22 03:10:51 +00:00
|
|
|
if (start_of_host + 4 < end && !strncmp(start_of_host, "www.", 4))
|
2018-02-20 01:34:50 +00:00
|
|
|
{
|
2018-02-22 03:10:51 +00:00
|
|
|
res_data = start_of_host;
|
2018-02-20 01:34:50 +00:00
|
|
|
res_size = 4;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2013-03-05 12:12:47 +00:00
|
|
|
struct ExtractURLParameterImpl
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data,
|
|
|
|
const ColumnString::Offsets & offsets,
|
|
|
|
std::string pattern,
|
|
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
res_data.reserve(data.size() / 5);
|
2017-04-01 07:20:54 +00:00
|
|
|
res_offsets.resize(offsets.size());
|
|
|
|
|
|
|
|
pattern += '=';
|
|
|
|
const char * param_str = pattern.c_str();
|
|
|
|
size_t param_len = pattern.size();
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
ColumnString::Offset prev_offset = 0;
|
|
|
|
ColumnString::Offset res_offset = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
ColumnString::Offset cur_offset = offsets[i];
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
const char * str = reinterpret_cast<const char *>(&data[prev_offset]);
|
2018-11-25 00:08:50 +00:00
|
|
|
const char * end = reinterpret_cast<const char *>(&data[cur_offset]);
|
|
|
|
|
|
|
|
/// Find query string or fragment identifier.
|
|
|
|
/// Note that we support parameters in fragment identifier in the same way as in query string.
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
const char * const query_string_begin = find_first_symbols<'?', '#'>(str, end);
|
|
|
|
|
|
|
|
/// Will point to the beginning of "name=value" pair. Then it will be reassigned to the beginning of "value".
|
|
|
|
const char * param_begin = nullptr;
|
|
|
|
|
|
|
|
if (query_string_begin + 1 < end)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
param_begin = query_string_begin + 1;
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (true)
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
param_begin = strstr(param_begin, param_str);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (!param_begin)
|
2017-04-01 07:20:54 +00:00
|
|
|
break;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (param_begin[-1] != '?' && param_begin[-1] != '#' && param_begin[-1] != '&')
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
/// Parameter name is different but has the same suffix.
|
|
|
|
param_begin += param_len;
|
2017-04-01 07:20:54 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
param_begin += param_len;
|
2017-04-01 07:20:54 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (param_begin)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
const char * param_end = find_first_symbols<'&', '#'>(param_begin, end);
|
|
|
|
if (param_end == end)
|
|
|
|
param_end = param_begin + strlen(param_begin);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
size_t param_size = param_end - param_begin;
|
|
|
|
|
|
|
|
res_data.resize(res_offset + param_size + 1);
|
|
|
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], param_begin, param_size);
|
|
|
|
res_offset += param_size;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
/// No parameter found, put empty string in result.
|
2017-04-01 07:20:54 +00:00
|
|
|
res_data.resize(res_offset + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
res_data[res_offset] = 0;
|
|
|
|
++res_offset;
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
|
|
|
|
prev_offset = cur_offset;
|
|
|
|
}
|
|
|
|
}
|
2013-03-05 12:12:47 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2013-03-18 10:27:45 +00:00
|
|
|
struct CutURLParameterImpl
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data,
|
2017-12-15 21:32:25 +00:00
|
|
|
const ColumnString::Offsets & offsets,
|
2017-04-01 07:20:54 +00:00
|
|
|
std::string pattern,
|
2018-11-25 00:08:50 +00:00
|
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
res_data.reserve(data.size());
|
|
|
|
res_offsets.resize(offsets.size());
|
|
|
|
|
|
|
|
pattern += '=';
|
|
|
|
const char * param_str = pattern.c_str();
|
|
|
|
size_t param_len = pattern.size();
|
|
|
|
|
|
|
|
size_t prev_offset = 0;
|
|
|
|
size_t res_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
|
|
{
|
|
|
|
size_t cur_offset = offsets[i];
|
|
|
|
|
|
|
|
const char * url_begin = reinterpret_cast<const char *>(&data[prev_offset]);
|
|
|
|
const char * url_end = reinterpret_cast<const char *>(&data[cur_offset]) - 1;
|
|
|
|
const char * begin_pos = url_begin;
|
|
|
|
const char * end_pos = begin_pos;
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
const char * query_string_begin = find_first_symbols<'?', '#'>(url_begin, url_end);
|
|
|
|
if (query_string_begin == url_end)
|
2017-04-01 07:20:54 +00:00
|
|
|
break;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
const char * pos = strstr(query_string_begin + 1, param_str);
|
2017-04-01 07:20:54 +00:00
|
|
|
if (pos == nullptr)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (pos[-1] != '?' && pos[-1] != '#' && pos[-1] != '&')
|
|
|
|
{
|
|
|
|
pos = nullptr;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
begin_pos = pos;
|
|
|
|
end_pos = begin_pos + param_len;
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Skip the value.
|
2017-04-01 07:20:54 +00:00
|
|
|
while (*end_pos && *end_pos != '&' && *end_pos != '#')
|
|
|
|
++end_pos;
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Capture '&' before or after the parameter.
|
2017-04-01 07:20:54 +00:00
|
|
|
if (*end_pos == '&')
|
|
|
|
++end_pos;
|
|
|
|
else if (begin_pos[-1] == '&')
|
|
|
|
--begin_pos;
|
|
|
|
} while (false);
|
|
|
|
|
|
|
|
size_t cut_length = (url_end - url_begin) - (end_pos - begin_pos);
|
|
|
|
res_data.resize(res_offset + cut_length + 1);
|
|
|
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], url_begin, begin_pos - url_begin);
|
|
|
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset] + (begin_pos - url_begin), end_pos, url_end - end_pos);
|
|
|
|
res_offset += cut_length + 1;
|
|
|
|
res_data[res_offset - 1] = 0;
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
|
|
|
|
prev_offset = cur_offset;
|
|
|
|
}
|
|
|
|
}
|
2013-03-18 10:27:45 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2013-03-05 13:30:23 +00:00
|
|
|
class ExtractURLParametersImpl
|
|
|
|
{
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
bool first;
|
2014-06-26 00:58:14 +00:00
|
|
|
|
2013-03-05 13:30:23 +00:00
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr auto name = "extractURLParameters";
|
|
|
|
static String getName() { return name; }
|
|
|
|
|
|
|
|
static size_t getNumberOfArguments() { return 1; }
|
|
|
|
|
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[0]))
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
2017-12-02 02:47:12 +00:00
|
|
|
void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument that is the column of rows
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t getStringsArgumentPosition()
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Called for each next string.
|
2017-04-01 07:20:54 +00:00
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
first = true;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
2017-04-01 07:20:54 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
|
|
|
if (pos == nullptr)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (first)
|
|
|
|
{
|
|
|
|
first = false;
|
2018-11-25 00:08:50 +00:00
|
|
|
pos = find_first_symbols<'?', '#'>(pos, end);
|
|
|
|
if (pos == end)
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
++pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
token_begin = pos;
|
2018-11-25 00:08:50 +00:00
|
|
|
pos = find_first_symbols<'=', '&', '#', '?'>(pos, end);
|
|
|
|
if (pos == end)
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (*pos == '?')
|
|
|
|
{
|
|
|
|
++pos;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*pos == '&' || *pos == '#')
|
|
|
|
{
|
|
|
|
token_end = pos++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
++pos;
|
2018-11-25 00:08:50 +00:00
|
|
|
pos = find_first_symbols<'&', '#'>(pos, end);
|
|
|
|
if (pos == end)
|
2017-04-01 07:20:54 +00:00
|
|
|
token_end = end;
|
|
|
|
else
|
|
|
|
token_end = pos++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
2013-03-06 09:00:58 +00:00
|
|
|
};
|
|
|
|
|
2013-08-02 13:55:43 +00:00
|
|
|
class ExtractURLParameterNamesImpl
|
|
|
|
{
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
bool first;
|
2013-08-02 13:55:43 +00:00
|
|
|
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr auto name = "extractURLParameterNames";
|
|
|
|
static String getName() { return name; }
|
|
|
|
|
|
|
|
static size_t getNumberOfArguments() { return 1; }
|
|
|
|
|
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[0]))
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument that is the column of rows
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t getStringsArgumentPosition()
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-12-02 02:47:12 +00:00
|
|
|
void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Called for each next string.
|
2017-04-01 07:20:54 +00:00
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
first = true;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
2017-04-01 07:20:54 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
|
|
|
if (pos == nullptr)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (first)
|
|
|
|
{
|
|
|
|
first = false;
|
2018-11-25 00:08:50 +00:00
|
|
|
pos = find_first_symbols<'?', '#'>(pos, end);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
else
|
2018-11-25 00:08:50 +00:00
|
|
|
pos = find_first_symbols<'&', '#'>(pos, end);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
if (pos == end)
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
token_begin = pos;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
pos = find_first_symbols<'=', '&', '#', '?'>(pos, end);
|
|
|
|
if (pos == end)
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
else
|
|
|
|
token_end = pos;
|
|
|
|
|
|
|
|
if (*pos == '?')
|
|
|
|
{
|
|
|
|
++pos;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
2013-08-02 13:55:43 +00:00
|
|
|
};
|
2013-03-06 09:00:58 +00:00
|
|
|
|
|
|
|
class URLHierarchyImpl
|
|
|
|
{
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos begin;
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
2014-06-26 00:58:14 +00:00
|
|
|
|
2013-03-06 09:00:58 +00:00
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr auto name = "URLHierarchy";
|
|
|
|
static String getName() { return name; }
|
|
|
|
|
|
|
|
static size_t getNumberOfArguments() { return 1; }
|
|
|
|
|
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[0]))
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
2017-12-02 02:47:12 +00:00
|
|
|
void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument that is the column of rows
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t getStringsArgumentPosition()
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Called for each next string.
|
2017-04-01 07:20:54 +00:00
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
begin = pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
2017-04-01 07:20:54 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Code from URLParser.
|
2017-04-01 07:20:54 +00:00
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (pos == begin)
|
|
|
|
{
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Let's parse everything that goes before the path
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Assume that the protocol has already been changed to lowercase.
|
2017-04-01 07:20:54 +00:00
|
|
|
while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9')))
|
|
|
|
++pos;
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.
|
|
|
|
* (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there
|
|
|
|
* For the rest, simply return the full URL as the only element of the hierarchy.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
|
|
|
|
{
|
|
|
|
pos = end;
|
|
|
|
token_begin = begin;
|
|
|
|
token_end = end;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// The domain for simplicity is everything that after the protocol and two slashes, until the next slash or `?` or `#`
|
2017-04-01 07:20:54 +00:00
|
|
|
while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
if (pos != end)
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
token_begin = begin;
|
|
|
|
token_end = pos;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// We go to the next `/` or `?` or `#`, skipping all those at the beginning.
|
2017-04-01 07:20:54 +00:00
|
|
|
while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#'))
|
|
|
|
++pos;
|
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
if (pos != end)
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
token_begin = begin;
|
|
|
|
token_end = pos;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
2013-03-05 13:30:23 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2014-02-11 19:18:38 +00:00
|
|
|
class URLPathHierarchyImpl
|
|
|
|
{
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos begin;
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
Pos start;
|
2014-02-11 19:18:38 +00:00
|
|
|
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr auto name = "URLPathHierarchy";
|
|
|
|
static String getName() { return name; }
|
|
|
|
|
|
|
|
static size_t getNumberOfArguments() { return 1; }
|
|
|
|
|
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[0]))
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
2017-12-02 02:47:12 +00:00
|
|
|
void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument that is the column of rows
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t getStringsArgumentPosition()
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Called for each next string.
|
2017-04-01 07:20:54 +00:00
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
begin = pos = pos_;
|
|
|
|
start = begin;
|
|
|
|
end = end_;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
2017-04-01 07:20:54 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Code from URLParser.
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (pos == begin)
|
|
|
|
{
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Let's parse everything that goes before the path
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Assume that the protocol has already been changed to lowercase.
|
2017-04-01 07:20:54 +00:00
|
|
|
while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9')))
|
|
|
|
++pos;
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.
|
|
|
|
* (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there.
|
|
|
|
* For the rest, just return an empty array.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
|
|
|
|
{
|
|
|
|
pos = end;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// The domain for simplicity is everything that after the protocol and the two slashes, until the next slash or `?` or `#`
|
2017-04-01 07:20:54 +00:00
|
|
|
while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
start = pos;
|
|
|
|
|
|
|
|
if (pos != end)
|
|
|
|
++pos;
|
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// We go to the next `/` or `?` or `#`, skipping all those at the beginning.
|
2017-04-01 07:20:54 +00:00
|
|
|
while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#'))
|
|
|
|
++pos;
|
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
if (pos != end)
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
token_begin = start;
|
|
|
|
token_end = pos;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
2014-02-11 19:18:38 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/** Select part of string using the Extractor.
|
2012-07-16 03:42:36 +00:00
|
|
|
*/
|
|
|
|
template <typename Extractor>
|
|
|
|
struct ExtractSubstringImpl
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
size_t size = offsets.size();
|
|
|
|
res_offsets.resize(size);
|
|
|
|
res_data.reserve(size * Extractor::getReserveLengthForElement());
|
|
|
|
|
|
|
|
size_t prev_offset = 0;
|
|
|
|
size_t res_offset = 0;
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Matched part.
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos start;
|
|
|
|
size_t length;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
Extractor::execute(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length);
|
|
|
|
|
|
|
|
res_data.resize(res_data.size() + length + 1);
|
|
|
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
|
|
|
|
res_offset += length + 1;
|
|
|
|
res_data[res_offset - 1] = 0;
|
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void constant(const std::string & data,
|
|
|
|
std::string & res_data)
|
|
|
|
{
|
|
|
|
Pos start;
|
|
|
|
size_t length;
|
|
|
|
Extractor::execute(data.data(), data.size(), start, length);
|
|
|
|
res_data.assign(start, length);
|
|
|
|
}
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/** Delete part of string using the Extractor.
|
2012-07-16 03:42:36 +00:00
|
|
|
*/
|
|
|
|
template <typename Extractor>
|
|
|
|
struct CutSubstringImpl
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
res_data.reserve(data.size());
|
|
|
|
size_t size = offsets.size();
|
|
|
|
res_offsets.resize(size);
|
|
|
|
|
|
|
|
size_t prev_offset = 0;
|
|
|
|
size_t res_offset = 0;
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Matched part.
|
2017-04-01 07:20:54 +00:00
|
|
|
Pos start;
|
|
|
|
size_t length;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
|
|
|
|
Extractor::execute(current, offsets[i] - prev_offset - 1, start, length);
|
2018-09-02 03:00:04 +00:00
|
|
|
size_t start_index = start - reinterpret_cast<const char *>(data.data());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
res_data.resize(res_data.size() + offsets[i] - prev_offset - length);
|
|
|
|
memcpySmallAllowReadWriteOverflow15(
|
|
|
|
&res_data[res_offset], current, start - current);
|
|
|
|
memcpySmallAllowReadWriteOverflow15(
|
|
|
|
&res_data[res_offset + start - current], start + length, offsets[i] - start_index - length);
|
|
|
|
res_offset += offsets[i] - prev_offset - length;
|
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void constant(const std::string & data,
|
|
|
|
std::string & res_data)
|
|
|
|
{
|
|
|
|
Pos start;
|
|
|
|
size_t length;
|
|
|
|
Extractor::execute(data.data(), data.size(), start, length);
|
|
|
|
res_data.reserve(data.size() - length);
|
|
|
|
res_data.append(data.data(), start);
|
|
|
|
res_data.append(start + length, data.data() + data.size());
|
|
|
|
}
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
|
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2016-12-15 12:05:05 +00:00
|
|
|
/// Percent decode of url data.
|
2016-12-12 06:09:00 +00:00
|
|
|
struct DecodeURLComponentImpl
|
2016-12-10 21:04:58 +00:00
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets);
|
2016-12-10 21:04:58 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
static void constant(const std::string & data,
|
|
|
|
std::string & res_data);
|
2016-12-10 21:04:58 +00:00
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector_fixed(const ColumnString::Chars & data, size_t n,
|
|
|
|
ColumnString::Chars & res_data);
|
2016-12-10 21:04:58 +00:00
|
|
|
};
|
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
}
|