mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
commit
fea00eb595
12
dbms/include/DB/Common/StringView.h
Normal file
12
dbms/include/DB/Common/StringView.h
Normal file
@ -0,0 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include <experimental/string_view>
|
||||
|
||||
/// Non-owning view over a character sequence; alias for the string_view from <experimental/string_view>.
using StringView = std::experimental::string_view;
|
||||
|
||||
/// Builds a string view over a character array (typically a string literal) at compile time.
/// The last array element is taken to be the terminating zero and is left out of the view.
template <typename TChar, size_t size>
constexpr inline std::experimental::basic_string_view<TChar> makeStringView(const TChar (&str)[size])
{
    using View = std::experimental::basic_string_view<TChar>;
    return View(str, size - 1);
}
|
3
dbms/include/DB/Common/hex.h
Normal file
3
dbms/include/DB/Common/hex.h
Normal file
@ -0,0 +1,3 @@
|
||||
#pragma once
|
||||
|
||||
/// Lookup table indexed by a byte value (cast to unsigned char): holds the numeric value 0..15
/// of a hexadecimal digit character and '\xff' for every other byte. Defined in hex.cpp.
extern const char* const char_to_digit_table;
|
@ -3,6 +3,8 @@
|
||||
#include <DB/DataTypes/DataTypeString.h>
|
||||
#include <DB/Columns/ColumnString.h>
|
||||
#include <DB/Columns/ColumnConst.h>
|
||||
#include <DB/Common/StringUtils.h>
|
||||
#include <DB/Common/StringView.h>
|
||||
#include <DB/Functions/FunctionsString.h>
|
||||
#include <DB/Functions/FunctionsStringSearch.h>
|
||||
#include <DB/Functions/FunctionsStringArray.h>
|
||||
@ -57,26 +59,70 @@ namespace DB
|
||||
|
||||
/// Read-only position inside a character buffer; used by the URL extractors for input and result pointers.
using Pos = const char *;
|
||||
|
||||
|
||||
/// Extracts scheme from given url.
|
||||
inline StringView getURLScheme(const StringView & url)
|
||||
{
|
||||
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
|
||||
const char* p = url.data();
|
||||
const char* end = url.data() + url.size();
|
||||
|
||||
if (isAlphaASCII(*p))
|
||||
{
|
||||
for (++p; p < end; ++p)
|
||||
{
|
||||
if (!(isAlphaNumericASCII(*p) || *p == '+' || *p == '-' || *p == '.'))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return StringView(url.data(), p - url.data());
|
||||
}
|
||||
|
||||
return StringView();
|
||||
}
|
||||
|
||||
|
||||
/// Extracts host from given url.
|
||||
inline StringView getURLHost(const StringView & url)
|
||||
{
|
||||
StringView scheme = getURLScheme(url);
|
||||
const char* p = url.data() + scheme.size();
|
||||
const char* end = url.data() + url.size();
|
||||
|
||||
// Colon must follows after scheme.
|
||||
if (p == end || *p != ':')
|
||||
return StringView();
|
||||
// Authority component must starts with "//".
|
||||
if (end - p < 2 || (p[1] != '/' || p[2] != '/'))
|
||||
return StringView();
|
||||
else
|
||||
p += 3;
|
||||
|
||||
const char* st = p;
|
||||
|
||||
for (; p < end; ++p)
|
||||
{
|
||||
if (*p == '@')
|
||||
{
|
||||
st = p + 1;
|
||||
}
|
||||
else if (*p == ':' || *p == '/' || *p == '?' || *p == '#')
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return (p == st) ? StringView() : StringView(st, p - st);
|
||||
}
|
||||
|
||||
|
||||
struct ExtractProtocol
|
||||
{
|
||||
static size_t getReserveLengthForElement() { return strlen("https") + 1; }
|
||||
static size_t getReserveLengthForElement();
|
||||
|
||||
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
||||
{
|
||||
res_data = data;
|
||||
res_size = 0;
|
||||
|
||||
Pos pos = data;
|
||||
|
||||
while (isAlphaNumericASCII(*pos))
|
||||
++pos;
|
||||
|
||||
if (pos == data || pos + 3 >= data + size)
|
||||
return;
|
||||
|
||||
if (pos[0] == ':')
|
||||
res_size = pos - data;
|
||||
}
|
||||
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size);
|
||||
};
|
||||
|
||||
template <bool without_www>
|
||||
@ -86,33 +132,21 @@ struct ExtractDomain
|
||||
|
||||
/// Points res_data / res_size at the host part of the URL stored in [data, data + size).
/// When `without_www` is set, a leading "www." is dropped provided something remains after it.
/// If the URL has no host, the result is the empty range starting at `data`.
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
    StringView host = getURLHost(StringView(data, size));

    if (host.empty())
    {
        res_data = data;
        res_size = 0;
        return;
    }

    /// Strictly more than four characters are required so that a bare "www." is kept as is.
    if (without_www && host.size() > 4 && !strncmp(host.data(), "www.", 4))
        host = host.substr(4);

    res_data = host.data();
    res_size = host.size();
}
|
||||
};
|
||||
|
||||
@ -210,39 +244,27 @@ struct ExtractTopLevelDomain
|
||||
|
||||
/// Points res_data / res_size at the top-level domain of the URL stored in [data, data + size),
/// i.e. at everything after the last dot of the host.
/// The result is the empty range starting at `data` when the URL has no host,
/// the host contains no dot, or the host looks like an IPv4 address.
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
    StringView host = getURLHost(StringView(data, size));

    res_data = data;
    res_size = 0;

    if (host.empty())
        return;

    /// A single trailing dot ("example.ru.") does not take part in the search.
    if (host.back() == '.')
        host = StringView(host.data(), host.size() - 1);

    Pos host_end = host.data() + host.size();
    Pos last_dot = reinterpret_cast<Pos>(memrchr(host.data(), '.', host.size()));

    if (!last_dot)
        return;

    /// For IPv4 addresses nothing is extracted: the last label starts with a digit.
    /// If the dot is the last character of the (trimmed) host, last_dot[1] is the dot
    /// that was trimmed above, so the read stays inside the original buffer.
    if (last_dot[1] <= '9')
        return;

    res_data = last_dot + 1;
    res_size = host_end - res_data;
}
|
||||
};
|
||||
|
||||
@ -996,6 +1018,20 @@ struct CutSubstringImpl
|
||||
};
|
||||
|
||||
|
||||
/// Percent decode of url data.
|
||||
struct DecodeURLComponentImpl
|
||||
{
|
||||
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
||||
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets);
|
||||
|
||||
static void constant(const std::string & data,
|
||||
std::string & res_data);
|
||||
|
||||
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
|
||||
ColumnString::Chars_t & res_data);
|
||||
};
|
||||
|
||||
|
||||
/// Tag types carrying the SQL-visible names of the URL functions.
struct NameProtocol
{
    static constexpr auto name = "protocol";
};

struct NameDomain
{
    static constexpr auto name = "domain";
};

struct NameDomainWithoutWWW
{
    static constexpr auto name = "domainWithoutWWW";
};
|
||||
@ -1006,6 +1042,7 @@ struct NamePathFull { static constexpr auto name = "pathFull"; };
|
||||
/// Tag types carrying the SQL-visible names of the URL functions.
struct NameQueryString
{
    static constexpr auto name = "queryString";
};

struct NameFragment
{
    static constexpr auto name = "fragment";
};

struct NameQueryStringAndFragment
{
    static constexpr auto name = "queryStringAndFragment";
};

struct NameDecodeURLComponent
{
    static constexpr auto name = "decodeURLComponent";
};

struct NameCutToFirstSignificantSubdomain
{
    static constexpr auto name = "cutToFirstSignificantSubdomain";
};
|
||||
|
||||
@ -1027,6 +1064,7 @@ using FunctionPathFull = FunctionStringToString<ExtractSubstringImpl<ExtractPath
|
||||
/// Functions that extract a part of the URL: an extractor wrapped into ExtractSubstringImpl.
using FunctionQueryString = FunctionStringToString<
    ExtractSubstringImpl<ExtractQueryString<true>>,
    NameQueryString>;

using FunctionFragment = FunctionStringToString<
    ExtractSubstringImpl<ExtractFragment<true>>,
    NameFragment>;

using FunctionQueryStringAndFragment = FunctionStringToString<
    ExtractSubstringImpl<ExtractQueryStringAndFragment<true>>,
    NameQueryStringAndFragment>;

/// Percent-decoding uses its own implementation directly, without ExtractSubstringImpl.
using FunctionDecodeURLComponent = FunctionStringToString<
    DecodeURLComponentImpl,
    NameDecodeURLComponent>;

using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<
    ExtractSubstringImpl<CutToFirstSignificantSubdomain>,
    NameCutToFirstSignificantSubdomain>;
|
||||
|
||||
|
20
dbms/src/Common/hex.cpp
Normal file
20
dbms/src/Common/hex.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
#include <DB/Common/hex.h>
|
||||
|
||||
/// 256-entry table: byte value -> value of the hexadecimal digit it denotes, '\xff' if it is not a hex digit.
/// Each row below covers 16 consecutive byte values.
const char* const char_to_digit_table =
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x00 - 0x0f
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x10 - 0x1f
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x20 - 0x2f
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff" /// 0x30 - 0x3f: '0'-'9'
    "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x40 - 0x4f: 'A'-'F'
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x50 - 0x5f
    "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x60 - 0x6f: 'a'-'f'
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x70 - 0x7f
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x80 - 0x8f
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0x90 - 0x9f
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0xa0 - 0xaf
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0xb0 - 0xbf
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0xc0 - 0xcf
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0xd0 - 0xdf
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0xe0 - 0xef
    "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" /// 0xf0 - 0xff
    ;
|
@ -1,9 +1,119 @@
|
||||
#include <DB/Common/hex.h>
|
||||
#include <DB/Functions/FunctionFactory.h>
|
||||
#include <DB/Functions/FunctionsURL.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
template <typename T>
|
||||
static void decodeUrl(const StringView & url, T & dest, size_t & offset)
|
||||
{
|
||||
const char* p = url.data();
|
||||
const char* st = url.data();
|
||||
const char* end = url.data() + url.size();
|
||||
|
||||
for (; p < end; ++p)
|
||||
{
|
||||
if (*p != '%' || end - p < 3)
|
||||
continue;
|
||||
|
||||
unsigned char h = char_to_digit_table[static_cast<unsigned char>(p[1])];
|
||||
unsigned char l = char_to_digit_table[static_cast<unsigned char>(p[2])];
|
||||
|
||||
if (h != 0xFF && l != 0xFF)
|
||||
{
|
||||
unsigned char digit = (h << 4) + l;
|
||||
|
||||
if (digit < 127) {
|
||||
dest.resize(dest.size() + p - st + 1);
|
||||
memcpy(&dest[offset], st, p - st);
|
||||
offset += p - st;
|
||||
dest[offset] = digit;
|
||||
offset++;
|
||||
|
||||
st = p + 3;
|
||||
}
|
||||
}
|
||||
|
||||
p += 2;
|
||||
}
|
||||
|
||||
if (st == url.data())
|
||||
{
|
||||
dest.resize(dest.size() + url.size() + 1);
|
||||
memcpy(&dest[offset], url.data(), url.size());
|
||||
offset += url.size() + 1;
|
||||
dest[offset - 1] = 0;
|
||||
}
|
||||
else if (st < p)
|
||||
{
|
||||
dest.resize(dest.size() + p - st);
|
||||
memcpy(&dest[offset], st, p - st);
|
||||
offset += p - st;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
size_t ExtractProtocol::getReserveLengthForElement()
|
||||
{
|
||||
return makeStringView("https").size() + 1;
|
||||
}
|
||||
|
||||
|
||||
void ExtractProtocol::execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
||||
{
|
||||
res_data = data;
|
||||
res_size = 0;
|
||||
|
||||
StringView scheme = getURLScheme(StringView(data, size));
|
||||
Pos pos = data + scheme.size();
|
||||
|
||||
if (scheme.empty() || (data + size) - pos < 4)
|
||||
return;
|
||||
|
||||
if (pos[0] == ':')
|
||||
res_size = pos - data;
|
||||
}
|
||||
|
||||
|
||||
void DecodeURLComponentImpl::vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
||||
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
||||
{
|
||||
res_data.reserve(data.size());
|
||||
size_t size = offsets.size();
|
||||
res_offsets.resize(size);
|
||||
|
||||
size_t prev_offset = 0;
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
const StringView url(current, offsets[i] - prev_offset - 1);
|
||||
|
||||
decodeUrl(url, res_data, res_offset);
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
prev_offset = offsets[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void DecodeURLComponentImpl::constant(const std::string & data,
|
||||
std::string & res_data)
|
||||
{
|
||||
size_t offset = 0;
|
||||
decodeUrl(data, res_data, offset);
|
||||
}
|
||||
|
||||
|
||||
/// Fixed-size string columns are not handled by the URL functions: this overload always throws ILLEGAL_COLUMN.
void DecodeURLComponentImpl::vector_fixed(const ColumnString::Chars_t & data, size_t n,
    ColumnString::Chars_t & res_data)
{
    const char * const message = "Column of type FixedString is not supported by URL functions";
    throw Exception(message, ErrorCodes::ILLEGAL_COLUMN);
}
|
||||
|
||||
|
||||
void registerFunctionsURL(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionProtocol>();
|
||||
@ -27,6 +137,7 @@ void registerFunctionsURL(FunctionFactory & factory)
|
||||
factory.registerFunction<FunctionCutFragment>();
|
||||
factory.registerFunction<FunctionCutQueryStringAndFragment>();
|
||||
factory.registerFunction<FunctionCutURLParameter>();
|
||||
factory.registerFunction<FunctionDecodeURLComponent>();
|
||||
}
|
||||
|
||||
}
|
||||
|
15
dbms/tests/queries/0_stateless/00395_url_functions.reference
Normal file
15
dbms/tests/queries/0_stateless/00395_url_functions.reference
Normal file
@ -0,0 +1,15 @@
|
||||
http
|
||||
https
|
||||
svn+ssh
|
||||
|
||||
http
|
||||
www.example.com
|
||||
|
||||
www.example.com
|
||||
127.0.0.1
|
||||
example.com
|
||||
com
|
||||
|
||||
ru
|
||||
ru
|
||||
/?query=hello world+foo+bar
|
18
dbms/tests/queries/0_stateless/00395_url_functions.sql
Normal file
18
dbms/tests/queries/0_stateless/00395_url_functions.sql
Normal file
@ -0,0 +1,18 @@
|
||||
-- protocol(): expected http, https, svn+ssh, an empty string ('!' is not allowed in a scheme), http.
SELECT protocol('http://example.com') AS Scheme;
|
||||
SELECT protocol('https://example.com/') AS Scheme;
|
||||
SELECT protocol('svn+ssh://example.com?q=hello%20world') AS Scheme;
|
||||
SELECT protocol('ftp!://example.com/') AS Scheme;
|
||||
SELECT protocol('http://127.0.0.1:443/') AS Scheme;
|
||||
|
||||
-- domain() / domainWithoutWWW(): user info and port are dropped; a URL without "//" gives an empty string.
-- Expected: www.example.com, an empty string, www.example.com, 127.0.0.1, example.com.
SELECT domain('http://paul@www.example.com:80/') AS Host;
|
||||
SELECT domain('http:/paul/example/com') AS Host;
|
||||
SELECT domain('http://www.example.com?q=4') AS Host;
|
||||
SELECT domain('http://127.0.0.1:443/') AS Host;
|
||||
SELECT domainWithoutWWW('http://paul@www.example.com:80/') AS Host;
|
||||
|
||||
-- topLevelDomain(): empty for an IPv4 host; a trailing dot after the host is ignored.
-- Expected: com, an empty string, ru, ru.
SELECT topLevelDomain('http://paul@www.example.com:80/') AS Domain;
|
||||
SELECT topLevelDomain('http://127.0.0.1:443/') AS Domain;
|
||||
SELECT topLevelDomain('svn+ssh://example.ru?q=hello%20world') AS Domain;
|
||||
SELECT topLevelDomain('svn+ssh://example.ru.?q=hello%20world') AS Domain;
|
||||
|
||||
-- decodeURLComponent(): %20 and %2B are decoded, a literal '+' stays as is.
-- Expected: /?query=hello world+foo+bar
SELECT decodeURLComponent(pathFull('http://127.0.0.1/?query=hello%20world+foo%2Bbar')) AS Path;
|
Loading…
Reference in New Issue
Block a user