mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-12 17:32:32 +00:00
147 lines
3.4 KiB
C++
147 lines
3.4 KiB
C++
#pragma once
|
|
|
|
#include "protocol.h"
|
|
#include <base/find_symbols.h>
|
|
#include <cstring>
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
|
|
namespace DB
|
|
{
|
|
|
|
namespace
|
|
{
|
|
|
|
/// Validates the host candidate [start_of_host, pos) and returns it as a StringRef.
///
/// @param pos            position right after the last character of the candidate
///                       (a terminator symbol or the end of the input).
/// @param dot_pos        position of a '.' found while scanning, or nullptr if none was found.
/// @param start_of_host  position of the first character of the candidate.
/// @return the candidate, or an empty StringRef if it is empty, has no dot inside it,
///         ends with a dot, or has a terminator symbol (':', '/', '?', '#') right after the dot.
inline StringRef checkAndReturnHost(const Pos & pos, const Pos & dot_pos, const Pos & start_of_host)
{
    /// An empty candidate is never a host.
    if (start_of_host >= pos)
        return StringRef{};

    /// The dot has to belong to the host itself. A dot that was seen before the start of the host
    /// (e.g. in the user part of "first.last@localhost") must not make a dot-less host valid.
    if (!dot_pos || dot_pos < start_of_host)
        return StringRef{};

    /// The dot is the last character of the candidate: nothing follows it.
    if (pos - dot_pos == 1)
        return StringRef{};

    /// Safe to read: at this point start_of_host <= dot_pos and dot_pos + 1 < pos.
    const char after_dot = *(dot_pos + 1);
    if (after_dot == ':' || after_dot == '/' || after_dot == '?' || after_dot == '#')
        return StringRef{};

    return StringRef(start_of_host, pos - start_of_host);
}
|
|
|
|
}
|
|
|
|
/// Extracts host from given url.
|
|
///
|
|
/// @return empty StringRef if the host is not valid (i.e. it does not have dot, or there no symbol after dot).
|
|
inline StringRef getURLHost(const char * data, size_t size)
|
|
{
|
|
Pos pos = data;
|
|
Pos end = data + size;
|
|
|
|
if (*pos == '/' && *(pos + 1) == '/')
|
|
{
|
|
pos += 2;
|
|
}
|
|
else
|
|
{
|
|
Pos scheme_end = data + std::min(size, 16UL);
|
|
for (++pos; pos < scheme_end; ++pos)
|
|
{
|
|
if (!isAlphaNumericASCII(*pos))
|
|
{
|
|
switch (*pos)
|
|
{
|
|
case '.':
|
|
case '-':
|
|
case '+':
|
|
break;
|
|
case ' ': /// restricted symbols
|
|
case '\t':
|
|
case '<':
|
|
case '>':
|
|
case '%':
|
|
case '{':
|
|
case '}':
|
|
case '|':
|
|
case '\\':
|
|
case '^':
|
|
case '~':
|
|
case '[':
|
|
case ']':
|
|
case ';':
|
|
case '=':
|
|
case '&':
|
|
return StringRef{};
|
|
default:
|
|
goto exloop;
|
|
}
|
|
}
|
|
}
|
|
exloop: if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/')
|
|
pos += 3;
|
|
else
|
|
pos = data;
|
|
}
|
|
|
|
Pos dot_pos = nullptr;
|
|
auto start_of_host = pos;
|
|
for (; pos < end; ++pos)
|
|
{
|
|
switch (*pos)
|
|
{
|
|
case '.':
|
|
dot_pos = pos;
|
|
break;
|
|
case ':': /// end symbols
|
|
case '/':
|
|
case '?':
|
|
case '#':
|
|
return checkAndReturnHost(pos, dot_pos, start_of_host);
|
|
case '@': /// myemail@gmail.com
|
|
start_of_host = pos + 1;
|
|
break;
|
|
case ' ': /// restricted symbols in whole URL
|
|
case '\t':
|
|
case '<':
|
|
case '>':
|
|
case '%':
|
|
case '{':
|
|
case '}':
|
|
case '|':
|
|
case '\\':
|
|
case '^':
|
|
case '~':
|
|
case '[':
|
|
case ']':
|
|
case ';':
|
|
case '=':
|
|
case '&':
|
|
return StringRef{};
|
|
}
|
|
}
|
|
|
|
return checkAndReturnHost(pos, dot_pos, start_of_host);
|
|
}
|
|
|
|
template <bool without_www>
|
|
struct ExtractDomain
|
|
{
|
|
static size_t getReserveLengthForElement() { return 15; }
|
|
|
|
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
|
{
|
|
StringRef host = getURLHost(data, size);
|
|
|
|
if (host.size == 0)
|
|
{
|
|
res_data = data;
|
|
res_size = 0;
|
|
}
|
|
else
|
|
{
|
|
if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
|
|
host = { host.data + 4, host.size - 4 };
|
|
|
|
res_data = host.data;
|
|
res_size = host.size;
|
|
}
|
|
}
|
|
};
|
|
|
|
}
|