diff --git a/src/Functions/URL/netloc.cpp b/src/Functions/URL/netloc.cpp index d8858c3364a..ea45504fa18 100644 --- a/src/Functions/URL/netloc.cpp +++ b/src/Functions/URL/netloc.cpp @@ -1,10 +1,133 @@ +#include #include #include -#include "netloc.h" +#include + namespace DB { +struct ExtractNetloc +{ + /// We use the same as domain function + static size_t getReserveLengthForElement() { return 15; } + + static inline StringRef getNetworkLocation(const char * data, size_t size) + { + Pos pos = data; + Pos end = data + size; + + /// Skip scheme. + if (pos + 2 < end && pos[0] == '/' && pos[1] == '/') + { + pos += 2; + } + else + { + Pos scheme_end = data + std::min(size, 16UL); + for (++pos; pos < scheme_end; ++pos) + { + if (!isAlphaNumericASCII(*pos)) + { + switch (*pos) + { + case '.': + case '-': + case '+': + break; + case ' ': /// restricted symbols + case '\t': + case '<': + case '>': + case '%': + case '{': + case '}': + case '|': + case '\\': + case '^': + case '~': + case '[': + case ']': + case ';': + case '=': + case '&': + return StringRef{}; + default: + pos = scheme_end; /// exit from the loop + } + } + } + if (pos + 2 < scheme_end && pos[0] == ':' && pos[1] == '/' && pos[2] == '/') + pos += 3; + else + pos = data; + } + + /// Now pos points to the first byte after scheme (if there is). + + bool has_identification = false; + Pos question_mark_pos = end; + Pos slash_pos = end; + auto start_of_host = pos; + for (; pos < end; ++pos) + { + switch (*pos) + { + case '/': + if (has_identification) + return StringRef(start_of_host, pos - start_of_host); + else + slash_pos = pos; + break; + case '?': + if (has_identification) + return StringRef(start_of_host, pos - start_of_host); + else + question_mark_pos = pos; + break; + case '#': + return StringRef(start_of_host, pos - start_of_host); + case '@': /// foo:bar@example.ru + has_identification = true; + break; + case ' ': /// restricted symbols in whole URL + case '\t': + case '<': + case '>': + case '%': + case '{': + case '}': + case '|': + case '\\': + case '^': + case '~': + case '[': + case ']': + case ';': + case '=': + case '&': + return pos > start_of_host + ? StringRef(start_of_host, std::min(std::min(pos - 1, question_mark_pos), slash_pos) - start_of_host) + : StringRef{}; + } + } + + if (has_identification) + return StringRef(start_of_host, pos - start_of_host); + else + return StringRef(start_of_host, std::min(std::min(pos, question_mark_pos), slash_pos) - start_of_host); + } + + static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size) + { + StringRef host = getNetworkLocation(data, size); + + res_data = host.data; + res_size = host.size; + } +}; + + struct NameNetloc { static constexpr auto name = "netloc"; }; using FunctionNetloc = FunctionStringToString, NameNetloc>; diff --git a/src/Functions/URL/netloc.h b/src/Functions/URL/netloc.h deleted file mode 100644 index ac1e57a884a..00000000000 --- a/src/Functions/URL/netloc.h +++ /dev/null @@ -1,134 +0,0 @@ -#pragma once - -#include "FunctionsURL.h" -#include -#include "protocol.h" -#include -#include - - -namespace DB -{ - -struct ExtractNetloc -{ - /// We use the same as domain function - static size_t getReserveLengthForElement() { return 15; } - - static inline StringRef getNetworkLocation(const char * data, size_t size) - { - Pos pos = data; - Pos end = data + size; - - /// Skip scheme. - if (pos + 2 < end && pos[0] == '/' && pos[1] == '/') - { - pos += 2; - } - else - { - Pos scheme_end = data + std::min(size, 16UL); - for (++pos; pos < scheme_end; ++pos) - { - if (!isAlphaNumericASCII(*pos)) - { - switch (*pos) - { - case '.': - case '-': - case '+': - break; - case ' ': /// restricted symbols - case '\t': - case '<': - case '>': - case '%': - case '{': - case '}': - case '|': - case '\\': - case '^': - case '~': - case '[': - case ']': - case ';': - case '=': - case '&': - return StringRef{}; - default: - pos = scheme_end; /// exit from the loop - } - } - } - if (pos + 2 < scheme_end && pos[0] == ':' && pos[1] == '/' && pos[2] == '/') - pos += 3; - else - pos = data; - } - - /// Now pos points to the first byte after scheme (if there is). - - bool has_identification = false; - Pos question_mark_pos = end; - Pos slash_pos = end; - auto start_of_host = pos; - for (; pos < end; ++pos) - { - switch (*pos) - { - case '/': - if (has_identification) - return StringRef(start_of_host, pos - start_of_host); - else - slash_pos = pos; - break; - case '?': - if (has_identification) - return StringRef(start_of_host, pos - start_of_host); - else - question_mark_pos = pos; - break; - case '#': - return StringRef(start_of_host, pos - start_of_host); - case '@': /// foo:bar@example.ru - has_identification = true; - break; - case ' ': /// restricted symbols in whole URL - case '\t': - case '<': - case '>': - case '%': - case '{': - case '}': - case '|': - case '\\': - case '^': - case '~': - case '[': - case ']': - case ';': - case '=': - case '&': - return pos > start_of_host - ? StringRef(start_of_host, std::min(std::min(pos - 1, question_mark_pos), slash_pos) - start_of_host) - : StringRef{}; - } - } - - if (has_identification) - return StringRef(start_of_host, pos - start_of_host); - else - return StringRef(start_of_host, std::min(std::min(pos, question_mark_pos), slash_pos) - start_of_host); - } - - static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size) - { - StringRef host = getNetworkLocation(data, size); - - res_data = host.data; - res_size = host.size; - } -}; - -} -