ClickHouse/dbms/include/DB/Functions/FunctionsURL.h

1073 lines
28 KiB
C++
Raw Normal View History

2012-07-16 03:42:36 +00:00
#pragma once
#include <DB/DataTypes/DataTypeString.h>
#include <DB/Columns/ColumnString.h>
#include <DB/Columns/ColumnConst.h>
#include <DB/Common/StringUtils.h>
#include <DB/Common/StringView.h>
2012-07-16 03:42:36 +00:00
#include <DB/Functions/FunctionsString.h>
#include <DB/Functions/FunctionsStringSearch.h>
#include <DB/Functions/FunctionsStringArray.h>
2012-07-16 03:42:36 +00:00
#ifdef __APPLE__
#include <common/apple_memrchr.h>
#endif
2012-07-16 03:42:36 +00:00
namespace DB
{
/** URL processing functions.
* These functions do not strictly follow the RFC; instead they are maximally simplified for performance reasons.
2012-07-16 03:42:36 +00:00
*
* Functions for extracting parts of a URL.
* If the URL does not contain the requested part, an empty string is returned.
*
2012-07-16 03:42:36 +00:00
* domain
* domainWithoutWWW
* topLevelDomain
* protocol
* path
* queryString
* fragment
* queryStringAndFragment
*
* Functions that remove parts of a URL.
* If the URL does not contain the corresponding part, it is returned unchanged.
*
2012-07-16 03:42:36 +00:00
* cutWWW
* cutFragment
* cutQueryString
* cutQueryStringAndFragment
2012-07-21 03:45:48 +00:00
*
* Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
* If there are many parameters with same name - return value of first one. Value is not %-decoded.
2012-07-21 03:45:48 +00:00
*
* extractURLParameter(URL, name)
*
* Extract all parameters from URL in form of array of strings name=value.
* extractURLParameters(URL)
*
* Extract names of all parameters from URL in form of array of strings.
* extractURLParameterNames(URL)
*
* Remove specified parameter from URL.
* cutURLParameter(URL, name)
*
* Get array of URL 'hierarchy' as in Yandex.Metrica tree-like reports. See docs.
* URLHierarchy(URL)
2012-07-16 03:42:36 +00:00
*/
/// Pointer to a character inside a string buffer; used throughout this file for positions within a URL.
using Pos = const char *;
2012-07-16 03:42:36 +00:00
/// Extracts scheme from given url.
2016-12-15 12:33:50 +00:00
inline StringView getURLScheme(const StringView & url)
2012-07-16 03:42:36 +00:00
{
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
2016-12-15 18:59:07 +00:00
const char * p = url.data();
const char * end = url.data() + url.size();
if (isAlphaASCII(*p))
2012-07-16 03:42:36 +00:00
{
for (++p; p < end; ++p)
{
if (!(isAlphaNumericASCII(*p) || *p == '+' || *p == '-' || *p == '.'))
{
break;
}
}
return StringView(url.data(), p - url.data());
}
return StringView();
}
2012-07-16 03:42:36 +00:00
/// Extracts host from given url.
2016-12-15 12:33:50 +00:00
inline StringView getURLHost(const StringView & url)
{
2016-12-15 12:33:50 +00:00
StringView scheme = getURLScheme(url);
2016-12-15 18:59:07 +00:00
const char * p = url.data() + scheme.size();
const char * end = url.data() + url.size();
// Colon must follows after scheme.
if (p == end || *p != ':')
return StringView();
// Authority component must starts with "//".
if (end - p < 2 || (p[1] != '/' || p[2] != '/'))
return StringView();
else
p += 3;
2016-12-15 18:59:07 +00:00
const char * st = p;
for (; p < end; ++p)
{
if (*p == '@')
{
st = p + 1;
}
else if (*p == ':' || *p == '/' || *p == '?' || *p == '#')
{
break;
}
2012-07-16 03:42:36 +00:00
}
return (p == st) ? StringView() : StringView(st, p - st);
}
/// Extractor for the protocol part of a URL. Only declared here; the definitions are not visible in this part of the file.
struct ExtractProtocol
{
/// Per-row size hint; ExtractSubstringImpl multiplies it by the row count to pre-reserve the result buffer.
static size_t getReserveLengthForElement();
/// Takes the URL as [data, data + size) and reports the extracted range through res_data / res_size.
/// NOTE(review): ExtractWWW skips `res_size + 3` bytes to get past "://", so the reported range
/// presumably excludes the "://" separator - confirm against the definition.
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size);
2012-07-16 03:42:36 +00:00
};
template <bool without_www>
struct ExtractDomain
{
static size_t getReserveLengthForElement() { return 15; }
2012-07-16 03:42:36 +00:00
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
2016-12-15 12:33:50 +00:00
StringView host = getURLHost(StringView(data, size));
if (host.empty())
{
res_data = data;
res_size = 0;
}
else
{
if (without_www && host.size() > 4 && !strncmp(host.data(), "www.", 4))
host = host.substr(4);
res_data = host.data();
res_size = host.size();
}
2012-07-16 03:42:36 +00:00
}
};
struct ExtractFirstSignificantSubdomain
{
static size_t getReserveLengthForElement() { return 10; }
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{
res_data = data;
res_size = 0;
Pos tmp;
size_t domain_length;
ExtractDomain<true>::execute(data, size, tmp, domain_length);
if (domain_length == 0)
return;
if (out_domain_end)
*out_domain_end = tmp + domain_length;
/// cut useless dot
if (tmp[domain_length - 1] == '.')
--domain_length;
res_data = tmp;
res_size = domain_length;
auto begin = tmp;
auto end = begin + domain_length;
const char * last_3_periods[3]{};
auto pos = static_cast<const char *>(memchr(begin, '.', domain_length));
while (pos)
{
last_3_periods[2] = last_3_periods[1];
last_3_periods[1] = last_3_periods[0];
last_3_periods[0] = pos;
pos = static_cast<const char *>(memchr(pos + 1, '.', end - pos - 1));
}
if (!last_3_periods[0])
return;
if (!last_3_periods[1])
{
res_size = last_3_periods[0] - begin;
return;
}
if (!last_3_periods[2])
last_3_periods[2] = begin - 1;
if (!strncmp(last_3_periods[1] + 1, "com.", 4) /// Note that in ColumnString every value has zero byte after it.
|| !strncmp(last_3_periods[1] + 1, "net.", 4)
|| !strncmp(last_3_periods[1] + 1, "org.", 4)
|| !strncmp(last_3_periods[1] + 1, "co.", 3))
{
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
return;
}
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
};
struct CutToFirstSignificantSubdomain
{
static size_t getReserveLengthForElement() { return 15; }
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos tmp_data;
size_t tmp_length;
Pos domain_end;
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0)
return;
res_data = tmp_data;
res_size = domain_end - tmp_data;
}
};
2012-07-16 03:42:36 +00:00
struct ExtractTopLevelDomain
{
static size_t getReserveLengthForElement() { return 5; }
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
2016-12-15 12:33:50 +00:00
StringView host = getURLHost(StringView(data, size));
2016-12-09 22:49:21 +00:00
2012-07-16 03:42:36 +00:00
res_data = data;
res_size = 0;
2016-12-09 22:49:21 +00:00
if (!host.empty())
{
if (host.back() == '.')
host = StringView(host.data(), host.size() - 1);
2012-07-16 03:42:36 +00:00
2016-12-09 22:49:21 +00:00
Pos last_dot = reinterpret_cast<Pos>(memrchr(host.data(), '.', host.size()));
2012-07-16 03:42:36 +00:00
2016-12-09 22:49:21 +00:00
if (!last_dot)
return;
/// Для IPv4-адресов не выделяем ничего.
if (last_dot[1] <= '9')
return;
2012-07-16 03:42:36 +00:00
2016-12-09 22:49:21 +00:00
res_data = last_dot + 1;
res_size = (host.data() + host.size()) - res_data;
}
2012-07-16 03:42:36 +00:00
}
};
struct ExtractPath
{
static size_t getReserveLengthForElement() { return 25; }
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos pos = data;
Pos end = pos + size;
2014-04-08 07:47:51 +00:00
if (nullptr != (pos = strchr(data, '/')) && pos[1] == '/' && nullptr != (pos = strchr(pos + 2, '/')))
2012-07-16 03:42:36 +00:00
{
Pos query_string_or_fragment = strpbrk(pos, "?#");
res_data = pos;
res_size = (query_string_or_fragment ? query_string_or_fragment : end) - res_data;
}
}
};
struct ExtractPathFull
{
static size_t getReserveLengthForElement() { return 30; }
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos pos = data;
Pos end = pos + size;
if (nullptr != (pos = strchr(data, '/')) && pos[1] == '/' && nullptr != (pos = strchr(pos + 2, '/')))
{
res_data = pos;
res_size = end - res_data;
}
}
};
2012-07-16 03:42:36 +00:00
template <bool without_leading_char>
struct ExtractQueryString
{
static size_t getReserveLengthForElement() { return 10; }
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos pos = data;
Pos end = pos + size;
2014-04-08 07:47:51 +00:00
if (nullptr != (pos = strchr(data, '?')))
2012-07-16 03:42:36 +00:00
{
Pos fragment = strchr(pos, '#');
res_data = pos + (without_leading_char ? 1 : 0);
res_size = (fragment ? fragment : end) - res_data;
}
}
};
template <bool without_leading_char>
struct ExtractFragment
{
static size_t getReserveLengthForElement() { return 10; }
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos pos = data;
Pos end = pos + size;
2014-04-08 07:47:51 +00:00
if (nullptr != (pos = strchr(data, '#')))
2012-07-16 03:42:36 +00:00
{
res_data = pos + (without_leading_char ? 1 : 0);
res_size = end - res_data;
}
}
};
template <bool without_leading_char>
struct ExtractQueryStringAndFragment
{
static size_t getReserveLengthForElement() { return 20; }
2012-07-16 03:42:36 +00:00
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos pos = data;
Pos end = pos + size;
2014-04-08 07:47:51 +00:00
if (nullptr != (pos = strchr(data, '?')))
2012-07-16 03:42:36 +00:00
{
res_data = pos + (without_leading_char ? 1 : 0);
res_size = end - res_data;
}
else if (nullptr != (pos = strchr(data, '#')))
{
res_data = pos;
res_size = end - res_data;
}
2012-07-16 03:42:36 +00:00
}
};
/// С точкой на конце.
struct ExtractWWW
{
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos pos = data;
Pos end = pos + size;
Pos tmp;
size_t protocol_length;
ExtractProtocol::execute(data, size, tmp, protocol_length);
pos += protocol_length + 3;
if (pos >= end || pos[-1] != '/' || pos[-2] != '/')
2012-07-16 03:42:36 +00:00
return;
if (pos + 4 < end && !strncmp(pos, "www.", 4))
{
res_data = pos;
res_size = 4;
}
}
};
struct ExtractURLParameterImpl
{
2013-09-15 05:51:43 +00:00
static void vector(const ColumnString::Chars_t & data,
const ColumnString::Offsets_t & offsets,
std::string pattern,
2013-09-15 05:51:43 +00:00
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.reserve(data.size() / 5);
res_offsets.resize(offsets.size());
pattern += '=';
const char * param_str = pattern.c_str();
size_t param_len = pattern.size();
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t cur_offset = offsets[i];
const char * str = reinterpret_cast<const char *>(&data[prev_offset]);
const char * pos = nullptr;
const char * begin = strpbrk(str, "?#");
if (begin != nullptr)
{
pos = begin + 1;
while (true)
{
pos = strstr(pos, param_str);
if (pos == nullptr)
break;
if (pos[-1] != '?' && pos[-1] != '#' && pos[-1] != '&')
{
pos += param_len;
continue;
}
else
{
pos += param_len;
break;
}
}
}
2014-04-08 07:31:51 +00:00
if (pos != nullptr)
{
const char * end = strpbrk(pos, "&#");
2014-04-08 07:31:51 +00:00
if (end == nullptr)
end = pos + strlen(pos);
res_data.resize(res_offset + (end - pos) + 1);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], pos, end - pos);
res_offset += end - pos;
}
else
{
res_data.resize(res_offset + 1);
}
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
struct CutURLParameterImpl
{
2013-09-15 05:51:43 +00:00
static void vector(const ColumnString::Chars_t & data,
const ColumnString::Offsets_t & offsets,
std::string pattern,
2013-09-15 05:51:43 +00:00
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.reserve(data.size());
res_offsets.resize(offsets.size());
pattern += '=';
const char * param_str = pattern.c_str();
size_t param_len = pattern.size();
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t cur_offset = offsets[i];
const char * url_begin = reinterpret_cast<const char *>(&data[prev_offset]);
const char * url_end = reinterpret_cast<const char *>(&data[cur_offset]) - 1;
const char * begin_pos = url_begin;
const char * end_pos = begin_pos;
do
{
const char * begin = strpbrk(url_begin, "?#");
2014-04-08 07:31:51 +00:00
if (begin == nullptr)
break;
const char * pos = strstr(begin + 1, param_str);
2014-04-08 07:31:51 +00:00
if (pos == nullptr)
break;
if (pos[-1] != '?' && pos[-1] != '#' && pos[-1] != '&')
{
2014-04-08 07:31:51 +00:00
pos = nullptr;
break;
}
begin_pos = pos;
end_pos = begin_pos + param_len;
/// Пропустим значение.
while (*end_pos && *end_pos != '&' && *end_pos != '#')
++end_pos;
/// Захватим '&' до или после параметра.
if (*end_pos == '&')
++end_pos;
else if (begin_pos[-1] == '&')
--begin_pos;
} while (false);
size_t cut_length = (url_end - url_begin) - (end_pos - begin_pos);
res_data.resize(res_offset + cut_length + 1);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], url_begin, begin_pos - url_begin);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset] + (begin_pos - url_begin), end_pos, url_end - end_pos);
res_offset += cut_length + 1;
res_data[res_offset - 1] = 0;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
/** Token generator for extractURLParameters: yields the parameters found after the first '?' or '#'
  * of a URL, one per call to get().
  * A token is either `name=value` (up to the next '&' or '#', or to the end of the string)
  * or a bare name that is directly followed by '&' or '#'.
  * Relies on the string being zero-terminated.
  */
class ExtractURLParametersImpl
{
private:
    Pos pos;
    Pos end;
    bool first;

public:
    static constexpr auto name = "extractURLParameters";
    static String getName() { return name; }

    static size_t getNumberOfArguments() { return 1; }

    /// Throws unless the only argument is a String.
    static void checkArguments(const DataTypes & arguments)
    {
        if (typeid_cast<const DataTypeString *>(&*arguments[0]))
            return;

        throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    /// Nothing to prepare.
    void init(Block & block, const ColumnNumbers & arguments) {}

    /// Returns the position of the argument that is the column of strings.
    size_t getStringsArgumentPosition()
    {
        return 0;
    }

    /// Called for each next string.
    void set(Pos pos_, Pos end_)
    {
        pos = pos_;
        end = end_;
        first = true;
    }

    /// Get the next token, if any, or return false.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// The previous token reached the end of the string.
        if (nullptr == pos)
            return false;

        if (first)
        {
            first = false;

            /// Parameters start after the first '?' or '#'.
            pos = strpbrk(pos, "?#");
            if (nullptr == pos)
                return false;
            ++pos;
        }

        /// Find the delimiter that ends the name; a '?' restarts the token right after it.
        for (;;)
        {
            token_begin = pos;
            pos = strpbrk(pos, "=&#?");

            if (nullptr == pos)
                return false;

            if (*pos != '?')
                break;

            ++pos;
        }

        if (*pos == '=')
        {
            /// The value runs to the next '&' or '#', or to the end of the string.
            pos = strpbrk(pos + 1, "&#");
            if (nullptr == pos)
                token_end = end;
            else
                token_end = pos++;
        }
        else
        {
            /// A name without a value, ended by '&' or '#'.
            token_end = pos++;
        }

        return true;
    }
};
/** Token generator for extractURLParameterNames: yields the names of the parameters found after
  * the first '?' or '#' of a URL, one per call to get().
  * A name runs up to the next '=', '&' or '#'.
  * Relies on the string being zero-terminated.
  */
class ExtractURLParameterNamesImpl
{
private:
    Pos pos;
    Pos end;
    bool first;

public:
    static constexpr auto name = "extractURLParameterNames";
    static String getName() { return name; }

    static size_t getNumberOfArguments() { return 1; }

    /// Throws unless the only argument is a String.
    static void checkArguments(const DataTypes & arguments)
    {
        if (typeid_cast<const DataTypeString *>(&*arguments[0]))
            return;

        throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    /// Returns the position of the argument that is the column of strings.
    size_t getStringsArgumentPosition()
    {
        return 0;
    }

    /// Nothing to prepare.
    void init(Block & block, const ColumnNumbers & arguments) {}

    /// Called for each next string.
    void set(Pos pos_, Pos end_)
    {
        pos = pos_;
        end = end_;
        first = true;
    }

    /// Get the next token, if any, or return false.
    bool get(Pos & token_begin, Pos & token_end)
    {
        if (nullptr == pos)
            return false;

        /// The first name follows '?' or '#'; each next one follows '&' or '#'.
        const char * const separators = first ? "?#" : "&#";
        first = false;

        pos = strpbrk(pos, separators);
        if (nullptr == pos)
            return false;
        ++pos;

        /// Find the delimiter that ends the name; a '?' restarts the token right after it.
        for (;;)
        {
            token_begin = pos;
            pos = strpbrk(pos, "=&#?");

            if (nullptr == pos)
                return false;

            token_end = pos;

            if (*pos != '?')
                break;

            ++pos;
        }

        return true;
    }
};
/** Token generator for URLHierarchy.
  *
  * For a URL of the form `protocol://something...` it yields a sequence of growing prefixes,
  * all starting at the beginning of the URL: first the protocol with the domain
  * (through the first '/', '?' or '#' after it, inclusive), then each following prefix extended
  * through the next '/', '?' or '#' (inclusive), or to the end of the URL.
  * Example: `http://example.com/a/b?c` gives
  *   `http://example.com/`, `http://example.com/a/`, `http://example.com/a/b?`, `http://example.com/a/b?c`.
  * For any other string the whole string is returned as the only element.
  */
class URLHierarchyImpl
{
private:
    Pos begin;
    Pos pos;
    Pos end;

    /// Characters accepted in the protocol: lowercase ASCII letters and digits, both ends of each range included.
    static bool isProtocolChar(char c)
    {
        return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    }

public:
    static constexpr auto name = "URLHierarchy";
    static String getName() { return name; }

    static size_t getNumberOfArguments() { return 1; }

    /// Throws unless the only argument is a String.
    static void checkArguments(const DataTypes & arguments)
    {
        if (!typeid_cast<const DataTypeString *>(&*arguments[0]))
            throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    /// Nothing to prepare.
    void init(Block & block, const ColumnNumbers & arguments) {}

    /// Returns the position of the argument that is the column of strings.
    size_t getStringsArgumentPosition()
    {
        return 0;
    }

    /// Called for each next string.
    void set(Pos pos_, Pos end_)
    {
        begin = pos = pos_;
        end = end_;
    }

    /// Get the next token, if any, or return false.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// Code from URLParser.

        if (pos == end)
            return false;

        if (pos == begin)
        {
            /// Parse everything that goes before the path.

            /// Assume that the protocol has already been converted to lower case.
            while (pos < end && isProtocolChar(*pos))
                ++pos;

            /** The hierarchy is calculated only for URLs that have a protocol followed by two slashes
              * (http, file - fit; mailto, magnet - do not fit), with at least something after the two slashes.
              * For the rest, simply return the full URL as the only element of the hierarchy.
              */
            if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
            {
                pos = end;
                token_begin = begin;
                token_end = end;
                return true;
            }

            /// For simplicity, the domain is everything after the protocol and the two slashes,
            /// up to the next slash or '?' or '#'.
            while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
                ++pos;

            if (pos != end)
                ++pos;

            token_begin = begin;
            token_end = pos;

            return true;
        }

        /// Go to the next '/' or '?' or '#', skipping all those at the beginning.
        while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#'))
            ++pos;
        if (pos == end)
            return false;
        while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
            ++pos;

        if (pos != end)
            ++pos;

        token_begin = begin;
        token_end = pos;

        return true;
    }
};
/** Token generator for URLPathHierarchy.
  *
  * For a URL of the form `protocol://something...` it yields a sequence of growing pieces that all
  * start at the first '/', '?' or '#' after the domain; each next piece is extended through the next
  * '/', '?' or '#' (inclusive), or to the end of the URL.
  * Example: `http://example.com/a/b?c` gives `/a/`, `/a/b?`, `/a/b?c`.
  * For any other string nothing is returned.
  */
class URLPathHierarchyImpl
{
private:
    Pos begin;
    Pos pos;
    Pos end;
    Pos start;

    /// Characters accepted in the protocol: lowercase ASCII letters and digits, both ends of each range included.
    static bool isProtocolChar(char c)
    {
        return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    }

public:
    static constexpr auto name = "URLPathHierarchy";
    static String getName() { return name; }

    static size_t getNumberOfArguments() { return 1; }

    /// Throws unless the only argument is a String.
    static void checkArguments(const DataTypes & arguments)
    {
        if (!typeid_cast<const DataTypeString *>(&*arguments[0]))
            throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    /// Nothing to prepare.
    void init(Block & block, const ColumnNumbers & arguments) {}

    /// Returns the position of the argument that is the column of strings.
    size_t getStringsArgumentPosition()
    {
        return 0;
    }

    /// Called for each next string.
    void set(Pos pos_, Pos end_)
    {
        begin = pos = pos_;
        start = begin;
        end = end_;
    }

    /// Get the next token, if any, or return false.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// Code from URLParser.

        if (pos == end)
            return false;

        if (pos == begin)
        {
            /// Parse everything that goes before the path.

            /// Assume that the protocol has already been converted to lower case.
            while (pos < end && isProtocolChar(*pos))
                ++pos;

            /** The hierarchy is calculated only for URLs that have a protocol followed by two slashes
              * (http, file - fit; mailto, magnet - do not fit), with at least something after the two slashes.
              * For the rest, simply return an empty array.
              */
            if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
            {
                pos = end;
                return false;
            }

            /// For simplicity, the domain is everything after the protocol and the two slashes,
            /// up to the next slash or '?' or '#'.
            while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
                ++pos;

            /// All tokens start at the delimiter that ends the domain.
            start = pos;

            if (pos != end)
                ++pos;
        }

        /// Go to the next '/' or '?' or '#', skipping all those at the beginning.
        while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#'))
            ++pos;
        if (pos == end)
            return false;
        while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
            ++pos;

        if (pos != end)
            ++pos;

        token_begin = start;
        token_end = pos;

        return true;
    }
};
2012-07-16 03:42:36 +00:00
/** Выделить кусок строки, используя Extractor.
*/
template <typename Extractor>
struct ExtractSubstringImpl
{
2013-09-15 05:51:43 +00:00
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
2012-07-16 03:42:36 +00:00
{
size_t size = offsets.size();
res_offsets.resize(size);
res_data.reserve(size * Extractor::getReserveLengthForElement());
2012-07-16 03:42:36 +00:00
size_t prev_offset = 0;
size_t res_offset = 0;
/// Выделенный кусок.
Pos start;
size_t length;
2012-07-16 03:42:36 +00:00
for (size_t i = 0; i < size; ++i)
{
Extractor::execute(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length);
2012-07-16 03:42:36 +00:00
res_data.resize(res_data.size() + length + 1);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
2012-07-16 03:42:36 +00:00
res_offset += length + 1;
res_data[res_offset - 1] = 0;
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
static void constant(const std::string & data,
std::string & res_data)
{
Pos start;
size_t length;
Extractor::execute(data.data(), data.size(), start, length);
res_data.assign(start, length);
}
2013-09-15 05:51:43 +00:00
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
2012-07-16 03:42:36 +00:00
{
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
}
};
/** Удалить кусок строки, используя Extractor.
*/
template <typename Extractor>
struct CutSubstringImpl
{
2013-09-15 05:51:43 +00:00
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
2012-07-16 03:42:36 +00:00
{
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
size_t prev_offset = 0;
size_t res_offset = 0;
/// Выделенный кусок.
Pos start;
size_t length;
for (size_t i = 0; i < size; ++i)
{
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
Extractor::execute(current, offsets[i] - prev_offset - 1, start, length);
size_t start_index = start - reinterpret_cast<const char *>(&data[0]);
res_data.resize(res_data.size() + offsets[i] - prev_offset - length);
memcpySmallAllowReadWriteOverflow15(
&res_data[res_offset], current, start - current);
memcpySmallAllowReadWriteOverflow15(
&res_data[res_offset + start - current], start + length, offsets[i] - start_index - length);
2012-07-16 03:42:36 +00:00
res_offset += offsets[i] - prev_offset - length;
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
static void constant(const std::string & data,
std::string & res_data)
{
Pos start;
size_t length;
Extractor::execute(data.data(), data.size(), start, length);
res_data.reserve(data.size() - length);
res_data.append(data.data(), start);
res_data.append(start + length, data.data() + data.size());
2012-07-16 03:42:36 +00:00
}
2013-09-15 05:51:43 +00:00
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
2012-07-16 03:42:36 +00:00
{
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
}
};
/// Percent-decoding of URL data.
/// Only declarations are given here; the definitions are not visible in this part of the file.
/// The three entry points have the same signatures as those of ExtractSubstringImpl and CutSubstringImpl,
/// so the struct can be plugged into FunctionStringToString in the same way.
struct DecodeURLComponentImpl
2016-12-10 21:04:58 +00:00
{
/// Variant for a column of strings: source chars/offsets in, result chars/offsets out.
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets);
2016-12-10 21:04:58 +00:00
/// Variant for a single constant string.
static void constant(const std::string & data,
std::string & res_data);
2016-12-10 21:04:58 +00:00
/// Variant for fixed-size strings of length n.
/// NOTE(review): the sibling implementations in this file throw for FixedString - confirm what this one does.
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data);
2016-12-10 21:04:58 +00:00
};
/// Name tags for the functions that extract a part of a URL (plus decodeURLComponent).
/// Each struct only carries the function name; it is passed as a template argument to the function templates below.
struct NameProtocol { static constexpr auto name = "protocol"; };
struct NameDomain { static constexpr auto name = "domain"; };
struct NameDomainWithoutWWW { static constexpr auto name = "domainWithoutWWW"; };
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
struct NameTopLevelDomain { static constexpr auto name = "topLevelDomain"; };
struct NamePath { static constexpr auto name = "path"; };
struct NamePathFull { static constexpr auto name = "pathFull"; };
struct NameQueryString { static constexpr auto name = "queryString"; };
struct NameFragment { static constexpr auto name = "fragment"; };
struct NameQueryStringAndFragment { static constexpr auto name = "queryStringAndFragment"; };
struct NameDecodeURLComponent { static constexpr auto name = "decodeURLComponent"; };
2012-07-16 03:42:36 +00:00
/// Name tags for the "cut" family of functions.
/// Note: despite its name, cutToFirstSignificantSubdomain is built on ExtractSubstringImpl below
/// (it returns a part of the URL), while the other four are built on CutSubstringImpl (they remove a part).
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
struct NameCutWWW { static constexpr auto name = "cutWWW"; };
struct NameCutQueryString { static constexpr auto name = "cutQueryString"; };
struct NameCutFragment { static constexpr auto name = "cutFragment"; };
struct NameCutQueryStringAndFragment { static constexpr auto name = "cutQueryStringAndFragment"; };
2012-07-16 03:42:36 +00:00
/// Name tags for the functions that take a URL and a parameter name.
struct NameExtractURLParameter { static constexpr auto name = "extractURLParameter"; };
struct NameCutURLParameter { static constexpr auto name = "cutURLParameter"; };
/// Functions that return a part of the URL.
using FunctionProtocol = FunctionStringToString<ExtractSubstringImpl<ExtractProtocol>, NameProtocol>;
using FunctionDomain = FunctionStringToString<ExtractSubstringImpl<ExtractDomain<false> >, NameDomain>;
using FunctionDomainWithoutWWW = FunctionStringToString<ExtractSubstringImpl<ExtractDomain<true> >, NameDomainWithoutWWW>;
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, NameFirstSignificantSubdomain>;
using FunctionTopLevelDomain = FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain>, NameTopLevelDomain>;
using FunctionPath = FunctionStringToString<ExtractSubstringImpl<ExtractPath>, NamePath>;
using FunctionPathFull = FunctionStringToString<ExtractSubstringImpl<ExtractPathFull>, NamePathFull>;
using FunctionQueryString = FunctionStringToString<ExtractSubstringImpl<ExtractQueryString<true> >, NameQueryString>;
using FunctionFragment = FunctionStringToString<ExtractSubstringImpl<ExtractFragment<true> >, NameFragment>;
using FunctionQueryStringAndFragment = FunctionStringToString<ExtractSubstringImpl<ExtractQueryStringAndFragment<true> >, NameQueryStringAndFragment>;
using FunctionDecodeURLComponent = FunctionStringToString<DecodeURLComponentImpl, NameDecodeURLComponent>;
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;

/// Functions that return the URL with a part removed.
/// The extractors are instantiated so that the leading '?' / '#' is removed as well.
using FunctionCutWWW = FunctionStringToString<CutSubstringImpl<ExtractWWW>, NameCutWWW>;
using FunctionCutQueryString = FunctionStringToString<CutSubstringImpl<ExtractQueryString<false> >, NameCutQueryString>;
using FunctionCutFragment = FunctionStringToString<CutSubstringImpl<ExtractFragment<false> >, NameCutFragment>;
using FunctionCutQueryStringAndFragment = FunctionStringToString<CutSubstringImpl<ExtractQueryStringAndFragment<false> >, NameCutQueryStringAndFragment>;

/// Functions that take a URL and a parameter name.
using FunctionExtractURLParameter = FunctionsStringSearchToString<ExtractURLParameterImpl, NameExtractURLParameter>;
using FunctionCutURLParameter = FunctionsStringSearchToString<CutURLParameterImpl, NameCutURLParameter>;

/// Functions that return an array of strings. (A duplicated declaration of FunctionExtractURLParameters was removed.)
using FunctionExtractURLParameters = FunctionTokens<ExtractURLParametersImpl>;
using FunctionURLHierarchy = FunctionTokens<URLHierarchyImpl>;
using FunctionURLPathHierarchy = FunctionTokens<URLPathHierarchyImpl>;
using FunctionExtractURLParameterNames = FunctionTokens<ExtractURLParameterNamesImpl>;
2012-07-16 03:42:36 +00:00
}