ClickHouse/src/Functions/URL/domain.h
2021-10-02 10:13:14 +03:00

147 lines
3.4 KiB
C++

#pragma once
#include "protocol.h"
#include <base/find_symbols.h>
#include <cstring>
#include <Common/StringUtils/StringUtils.h>
namespace DB
{
namespace
{
/// Validates the host candidate spanning [start_of_host, pos) and returns it.
///
/// @param pos            position right after the last byte of the candidate.
/// @param dot_pos        position of the last '.' seen while scanning, or null if none was seen.
/// @param start_of_host  position of the first byte of the candidate.
/// @return the candidate as a StringRef, or an empty StringRef when no dot was seen, the candidate is empty,
///         the dot is the byte immediately before `pos`, or the byte after the dot is one of ':', '/', '?', '#'.
inline StringRef checkAndReturnHost(const Pos & pos, const Pos & dot_pos, const Pos & start_of_host)
{
    /// The null check must come first: the checks below do arithmetic on dot_pos.
    if (dot_pos == nullptr)
        return StringRef{};

    /// Empty (or inverted) candidate range.
    if (pos <= start_of_host)
        return StringRef{};

    /// The dot is the last byte before pos, i.e. nothing follows it.
    if (pos - dot_pos == 1)
        return StringRef{};

    switch (*(dot_pos + 1))
    {
        case ':':
        case '/':
        case '?':
        case '#':
            return StringRef{};
        default:
            break;
    }

    return StringRef(start_of_host, pos - start_of_host);
}
}
/// Extracts host from given url.
///
/// The input may start with "//" or with "<scheme>://". The scheme separator is searched for only within
/// the first 16 bytes, and the very first byte of the input is not inspected by that search. If no scheme
/// separator is found, the host is searched for from the beginning of the input. The host ends at the
/// first ':', '/', '?' or '#', or at the end of the input.
///
/// No byte at or beyond `data + size` is dereferenced by this function.
///
/// @param data  pointer to the first byte of the url.
/// @param size  number of bytes in the url.
/// @return empty StringRef if the host is not valid (i.e. it does not have a dot, or there is no symbol
///         after the dot), or if a restricted symbol is met before the end of the host.
inline StringRef getURLHost(const char * data, size_t size)
{
    /// Nothing to parse. Returning early also avoids dereferencing `data` and stepping past it below.
    if (size == 0)
        return StringRef{};

    Pos pos = data;
    Pos end = data + size;

    /// The second byte may only be read when it is actually part of the input.
    if (size >= 2 && *pos == '/' && *(pos + 1) == '/')
    {
        pos += 2;
    }
    else
    {
        /// An explicit template argument keeps this compiling where size_t is not `unsigned long`.
        Pos scheme_end = data + std::min<size_t>(size, 16);
        for (++pos; pos < scheme_end; ++pos)
        {
            if (!isAlphaNumericASCII(*pos))
            {
                switch (*pos)
                {
                    case '.':
                    case '-':
                    case '+':
                        break;
                    case ' ': /// restricted symbols
                    case '\t':
                    case '<':
                    case '>':
                    case '%':
                    case '{':
                    case '}':
                    case '|':
                    case '\\':
                    case '^':
                    case '~':
                    case '[':
                    case ']':
                    case ';':
                    case '=':
                    case '&':
                        return StringRef{};
                    default:
                        /// Any other symbol ends the scheme candidate.
                        goto exloop;
                }
            }
        }
exloop:
        /// The "://" must lie entirely within the scanned prefix; otherwise there is no scheme
        /// and the host is searched for from the very beginning.
        if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/')
            pos += 3;
        else
            pos = data;
    }

    Pos dot_pos = nullptr;
    auto start_of_host = pos;
    for (; pos < end; ++pos)
    {
        switch (*pos)
        {
            case '.':
                dot_pos = pos;
                break;
            case ':': /// end symbols
            case '/':
            case '?':
            case '#':
                return checkAndReturnHost(pos, dot_pos, start_of_host);
            case '@': /// myemail@gmail.com
                /// Only the start of the host moves; the last seen dot is kept as is.
                start_of_host = pos + 1;
                break;
            case ' ': /// restricted symbols anywhere before the end of the host
            case '\t':
            case '<':
            case '>':
            case '%':
            case '{':
            case '}':
            case '|':
            case '\\':
            case '^':
            case '~':
            case '[':
            case ']':
            case ';':
            case '=':
            case '&':
                return StringRef{};
        }
    }

    return checkAndReturnHost(pos, dot_pos, start_of_host);
}
template <bool without_www>
struct ExtractDomain
{
static size_t getReserveLengthForElement() { return 15; }
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
StringRef host = getURLHost(data, size);
if (host.size == 0)
{
res_data = data;
res_size = 0;
}
else
{
if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
host = { host.data + 4, host.size - 4 };
res_data = host.data;
res_size = host.size;
}
}
};
}