Merge pull request #239 from yandex/issue-219

Issue 219
This commit is contained in:
alexey-milovidov 2016-12-15 17:21:35 +04:00 committed by GitHub
commit fea00eb595
7 changed files with 286 additions and 69 deletions

View File

@ -0,0 +1,12 @@
#pragma once
#include <experimental/string_view>
using StringView = std::experimental::string_view;
/// Builds a string view over a character array (typically a string literal) at compile time.
/// The last array element is excluded from the view (`size - 1`), i.e. the terminating
/// zero of a literal is not part of the result.
template <typename TChar, size_t size>
constexpr inline std::experimental::basic_string_view<TChar> makeStringView(const TChar (&str)[size])
{
    using View = std::experimental::basic_string_view<TChar>;
    return View{str, size - 1};
}

View File

@ -0,0 +1,3 @@
#pragma once
/// Lookup table indexed by an unsigned byte value: yields the numeric value (0..15) of a
/// hexadecimal digit character, or '\xff' for a byte that is not a hex digit.
/// Index with `static_cast<unsigned char>(c)`; the element type is plain `char`.
extern const char* const char_to_digit_table;

View File

@ -3,6 +3,8 @@
#include <DB/DataTypes/DataTypeString.h>
#include <DB/Columns/ColumnString.h>
#include <DB/Columns/ColumnConst.h>
#include <DB/Common/StringUtils.h>
#include <DB/Common/StringView.h>
#include <DB/Functions/FunctionsString.h>
#include <DB/Functions/FunctionsStringSearch.h>
#include <DB/Functions/FunctionsStringArray.h>
@ -57,26 +59,70 @@ namespace DB
using Pos = const char *;
/// Extracts the scheme from the given URL.
///
/// Grammar (RFC 3986): scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
///
/// Returns a view over the longest prefix of `url` that matches the grammar; the view
/// points into the storage of `url`. Returns an empty StringView when `url` is empty or
/// does not start with an ASCII letter. The ':' that must follow a scheme is NOT checked
/// here - callers verify it themselves.
inline StringView getURLScheme(const StringView & url)
{
    const char * p = url.data();
    const char * end = p + url.size();

    /// Check the bounds before dereferencing: an empty view may carry a null data pointer.
    if (p == end || !isAlphaASCII(*p))
        return StringView();

    for (++p; p < end; ++p)
    {
        if (!(isAlphaNumericASCII(*p) || *p == '+' || *p == '-' || *p == '.'))
            break;
    }

    return StringView(url.data(), p - url.data());
}
/// Extracts the host from the given URL.
///
/// The URL must look like `<scheme>://[userinfo@]host[:port][/path][?query][#fragment]`.
/// Returns a view into `url` covering the host, or an empty StringView when the
/// "://" delimiter is missing or the host is empty.
///
/// NOTE(review): the scan stops at the first ':' even inside the userinfo part, so
/// "http://user:password@host/" yields "user". Behaviour kept as is - confirm whether
/// passwords in URLs have to be supported.
inline StringView getURLHost(const StringView & url)
{
    StringView scheme = getURLScheme(url);
    const char * p = url.data() + scheme.size();
    const char * end = url.data() + url.size();

    /// A colon must follow the scheme and the authority component must start with "//".
    /// All three bytes are required to be inside the view before p[1] and p[2] are read.
    if (end - p < 3 || p[0] != ':' || p[1] != '/' || p[2] != '/')
        return StringView();

    p += 3;

    /// `st` is the start of the host; it moves past every '@' seen before the host ends.
    const char * st = p;
    for (; p < end; ++p)
    {
        if (*p == '@')
            st = p + 1;
        else if (*p == ':' || *p == '/' || *p == '?' || *p == '#')
            break;
    }

    return (p == st) ? StringView() : StringView(st, p - st);
}
/// Extracts the protocol (URL scheme) as a substring of the input.
/// NOTE(review): this listing shows both inline definitions and plain declarations of the
/// same two members - confirm which of each pair is the live one.
struct ExtractProtocol
{
/// Reserve hint per element: the length of "https" plus one.
static size_t getReserveLengthForElement() { return strlen("https") + 1; }
static size_t getReserveLengthForElement();
/// Reports the result through res_data / res_size. res_data is always `data`;
/// res_size stays 0 unless a non-empty leading run of accepted characters is
/// followed by ':' with more than three bytes remaining from that position.
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos pos = data;
/// Skip the leading alphanumeric run; the loop itself has no bound check against `size`.
while (isAlphaNumericASCII(*pos))
++pos;
if (pos == data || pos + 3 >= data + size)
return;
if (pos[0] == ':')
res_size = pos - data;
}
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size);
};
template <bool without_www>
@ -86,33 +132,21 @@ struct ExtractDomain
/// Locates the domain (host) of the URL in [data, data + size) and reports it through
/// res_data / res_size; an empty result is reported as res_data = data, res_size = 0.
/// With `without_www` set, a leading "www." is dropped from the result.
/// NOTE(review): this listing interleaves two implementations - one based on getURLHost()
/// and one that scans manually after ExtractProtocol::execute() - confirm which lines are live.
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
StringView host = getURLHost(StringView(data, size));
Pos pos = data;
Pos end = pos + size;
if (host.empty())
{
res_data = data;
res_size = 0;
}
else
{
/// Strip "www." only when something remains after it (host longer than 4 bytes).
if (without_www && host.size() > 4 && !strncmp(host.data(), "www.", 4))
host = host.substr(4);
Pos tmp;
size_t protocol_length;
ExtractProtocol::execute(data, size, tmp, protocol_length);
/// Step over the protocol and the three bytes expected to be "://".
pos += protocol_length + 3;
if (pos >= end || pos[-1] != '/' || pos[-2] != '/')
return;
if (without_www && pos + 4 < end && !strncmp(pos, "www.", 4))
pos += 4;
Pos domain_begin = pos;
/// The domain ends at the first path, port, query or fragment delimiter.
while (pos < end && *pos != '/' && *pos != ':' && *pos != '?' && *pos != '#')
++pos;
if (pos == domain_begin)
return;
res_data = domain_begin;
res_size = pos - domain_begin;
res_data = host.data();
res_size = host.size();
}
}
};
@ -210,39 +244,27 @@ struct ExtractTopLevelDomain
/// Locates the top-level domain (the part of the host after its last '.') of the URL in
/// [data, data + size) and reports it through res_data / res_size; when nothing is
/// extracted the result is res_data = data, res_size = 0.
/// NOTE(review): this listing interleaves two implementations - one based on getURLHost()
/// and one that scans manually after ExtractProtocol::execute() - confirm which lines are live.
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
StringView host = getURLHost(StringView(data, size));
res_data = data;
res_size = 0;
Pos pos = data;
Pos end = pos + size;
if (!host.empty())
{
/// A single trailing dot is not part of the top-level domain.
if (host.back() == '.')
host = StringView(host.data(), host.size() - 1);
Pos tmp;
size_t protocol_length;
ExtractProtocol::execute(data, size, tmp, protocol_length);
/// Step over the protocol and the three bytes expected to be "://".
pos += protocol_length + 3;
Pos last_dot = reinterpret_cast<Pos>(memrchr(host.data(), '.', host.size()));
if (pos >= end || pos[-1] != '/' || pos[-2] != '/')
return;
if (!last_dot)
return;
/// For IPv4 addresses, extract nothing.
if (last_dot[1] <= '9')
return;
Pos domain_begin = pos;
/// The domain ends at the first path, port, query or fragment delimiter.
while (pos < end && *pos != '/' && *pos != ':' && *pos != '?' && *pos != '#')
++pos;
if (pos == domain_begin)
return;
Pos last_dot = reinterpret_cast<Pos>(memrchr(domain_begin, '.', pos - domain_begin));
if (!last_dot)
return;
/// For IPv4 addresses, extract nothing.
if (last_dot[1] <= '9')
return;
res_data = last_dot + 1;
res_size = pos - res_data;
res_data = last_dot + 1;
res_size = (host.data() + host.size()) - res_data;
}
}
};
@ -996,6 +1018,20 @@ struct CutSubstringImpl
};
/// Percent-decoding of URL data ("%XX" escapes), used as the implementation
/// of the `decodeURLComponent` function.
struct DecodeURLComponentImpl
{
/// Decodes every row of a string column given as (data, offsets) into (res_data, res_offsets).
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets);
/// Decodes a single constant string into res_data.
static void constant(const std::string & data,
std::string & res_data);
/// Fixed-size string variant; the definition visible in this change always throws.
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data);
};
/// Tag types carrying a function name as a compile-time constant; presumably each one is
/// passed as the name argument of the corresponding function template instantiation.
struct NameProtocol { static constexpr auto name = "protocol"; };
struct NameDomain { static constexpr auto name = "domain"; };
struct NameDomainWithoutWWW { static constexpr auto name = "domainWithoutWWW"; };
@ -1006,6 +1042,7 @@ struct NamePathFull { static constexpr auto name = "pathFull"; };
/// Name tags for the query-string / fragment / decoding functions; each `name` is the
/// identifier passed as the last template argument of the matching function alias.
struct NameQueryString { static constexpr auto name = "queryString"; };
struct NameFragment { static constexpr auto name = "fragment"; };
struct NameQueryStringAndFragment { static constexpr auto name = "queryStringAndFragment"; };
struct NameDecodeURLComponent { static constexpr auto name = "decodeURLComponent"; };
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
@ -1027,6 +1064,7 @@ using FunctionPathFull = FunctionStringToString<ExtractSubstringImpl<ExtractPath
/// String-to-string URL functions: each alias binds an implementation to its name tag.
/// Substring extractors are wrapped in ExtractSubstringImpl; decodeURLComponent produces
/// new data and therefore plugs DecodeURLComponentImpl in directly.
using FunctionQueryString = FunctionStringToString<ExtractSubstringImpl<ExtractQueryString<true> >, NameQueryString> ;
using FunctionFragment = FunctionStringToString<ExtractSubstringImpl<ExtractFragment<true> >, NameFragment> ;
using FunctionQueryStringAndFragment = FunctionStringToString<ExtractSubstringImpl<ExtractQueryStringAndFragment<true> >, NameQueryStringAndFragment>;
using FunctionDecodeURLComponent = FunctionStringToString<DecodeURLComponentImpl, NameDecodeURLComponent>;
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;

20
dbms/src/Common/hex.cpp Normal file
View File

@ -0,0 +1,20 @@
#include <DB/Common/hex.h>
/// Maps a byte value (used as index 0..255) to the value of the hexadecimal digit it
/// represents, or to '\xff' when the byte is not a hex digit. One 16-byte row per line.
const char* const char_to_digit_table = (
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff" // '0'-'9'
"\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" // 'A'-'F' (the other uppercase letters are not digits)
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" // 'a'-'f' (the other lowercase letters are not digits)
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
);

View File

@ -1,9 +1,119 @@
#include <DB/Common/hex.h>
#include <DB/Functions/FunctionFactory.h>
#include <DB/Functions/FunctionsURL.h>
namespace DB
{
/// Tells decodeUrl() whether a zero byte has to be stored after the decoded data.
/// A std::string result must hold exactly the decoded bytes, so no terminator is added.
static inline bool needsZeroTerminator(const std::string &) { return false; }

/// Any other destination is treated as a column character buffer, in which every row is
/// followed by one extra zero byte that is counted in the row's end offset.
template <typename T>
static inline bool needsZeroTerminator(const T &) { return true; }

/// Percent-decodes `url` and appends the result to `dest`.
///
/// @param url     Input bytes; "%XX" escapes with two hex digits are decoded when the
///                resulting byte is below 127, anything else is copied through unchanged.
/// @param dest    Destination container (needs size(), resize() and operator[]).
///                On entry `dest.size()` must be equal to `offset`.
/// @param offset  Write position in `dest`; advanced past everything written, including
///                the terminating zero when one is stored (see needsZeroTerminator).
///
/// The terminating zero is stored on every path for column buffers and never for
/// std::string, regardless of whether any escape was actually decoded.
template <typename T>
static void decodeUrl(const StringView & url, T & dest, size_t & offset)
{
    const char * p = url.data();
    const char * st = url.data();    /// Start of the run that has not been copied yet.
    const char * end = url.data() + url.size();

    for (; p < end; ++p)
    {
        /// An escape needs the '%' and two more bytes inside the input.
        if (*p != '%' || end - p < 3)
            continue;

        unsigned char h = char_to_digit_table[static_cast<unsigned char>(p[1])];
        unsigned char l = char_to_digit_table[static_cast<unsigned char>(p[2])];

        /// Not a valid escape: treat '%' as an ordinary byte and keep scanning from the
        /// next byte, so that an escape starting right after it (as in "%%20") is still seen.
        if (h == 0xFF || l == 0xFF)
            continue;

        unsigned char digit = (h << 4) + l;
        if (digit < 127)
        {
            /// Flush the pending literal run, then the decoded byte.
            const size_t run_size = p - st;
            dest.resize(dest.size() + run_size + 1);
            if (run_size)
                memcpy(&dest[offset], st, run_size);
            offset += run_size;
            dest[offset] = digit;
            ++offset;
            st = p + 3;
        }

        /// Step over the two hex digits (the loop increment steps over the '%').
        p += 2;
    }

    /// Flush the remaining literal tail and, where required, the terminating zero.
    const size_t tail_size = end - st;
    const size_t terminator_size = needsZeroTerminator(dest) ? 1 : 0;

    dest.resize(dest.size() + tail_size + terminator_size);
    if (tail_size)
        memcpy(&dest[offset], st, tail_size);
    offset += tail_size;

    if (terminator_size)
    {
        dest[offset] = 0;
        ++offset;
    }
}
size_t ExtractProtocol::getReserveLengthForElement()
{
return makeStringView("https").size() + 1;
}
/// Reports the protocol (scheme) of the URL in [data, data + size) through res_data / res_size.
/// res_data is always `data`. res_size is the scheme length when the scheme is non-empty,
/// is followed by ':' and at least four bytes remain starting at that ':'; otherwise it is 0.
void ExtractProtocol::execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
    res_data = data;
    res_size = 0;

    const StringView scheme = getURLScheme(StringView(data, size));
    if (scheme.empty())
        return;

    /// The scheme is a prefix of the input, so this cannot underflow.
    const size_t scheme_length = scheme.size();
    const size_t remaining = size - scheme_length;

    if (remaining >= 4 && data[scheme_length] == ':')
        res_size = scheme_length;
}
/// Decodes a string column row by row.
/// `offsets[i]` is the end position of row i in `data`; the last byte of every row is
/// excluded from the text handed to decodeUrl(). After each row the current write
/// position in `res_data` is recorded as that row's end offset in `res_offsets`.
void DecodeURLComponentImpl::vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
    ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
    const size_t rows = offsets.size();

    /// Decoding never makes a string longer, so the source size is enough capacity.
    res_data.reserve(data.size());
    res_offsets.resize(rows);

    size_t src_begin = 0;
    size_t dst_end = 0;

    for (size_t row = 0; row < rows; ++row)
    {
        const size_t src_end = offsets[row];
        const StringView url(reinterpret_cast<const char *>(&data[src_begin]), src_end - src_begin - 1);

        decodeUrl(url, res_data, dst_end);

        res_offsets[row] = dst_end;
        src_begin = src_end;
    }
}
void DecodeURLComponentImpl::constant(const std::string & data,
std::string & res_data)
{
size_t offset = 0;
decodeUrl(data, res_data, offset);
}
void DecodeURLComponentImpl::vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
}
void registerFunctionsURL(FunctionFactory & factory)
{
factory.registerFunction<FunctionProtocol>();
@ -27,6 +137,7 @@ void registerFunctionsURL(FunctionFactory & factory)
factory.registerFunction<FunctionCutFragment>();
factory.registerFunction<FunctionCutQueryStringAndFragment>();
factory.registerFunction<FunctionCutURLParameter>();
factory.registerFunction<FunctionDecodeURLComponent>();
}
}

View File

@ -0,0 +1,15 @@
http
https
svn+ssh
http
www.example.com
www.example.com
127.0.0.1
example.com
com
ru
ru
/?query=hello world+foo+bar

View File

@ -0,0 +1,18 @@
-- protocol(): plain schemes, a scheme containing '+', a scheme with an invalid character ('!'), and an IPv4 host with a port.
SELECT protocol('http://example.com') AS Scheme;
SELECT protocol('https://example.com/') AS Scheme;
SELECT protocol('svn+ssh://example.com?q=hello%20world') AS Scheme;
SELECT protocol('ftp!://example.com/') AS Scheme;
SELECT protocol('http://127.0.0.1:443/') AS Scheme;
-- domain(): userinfo and port around the host, a URL without the "//" authority marker, a query right after the host, an IPv4 host.
SELECT domain('http://paul@www.example.com:80/') AS Host;
SELECT domain('http:/paul/example/com') AS Host;
SELECT domain('http://www.example.com?q=4') AS Host;
SELECT domain('http://127.0.0.1:443/') AS Host;
-- domainWithoutWWW(): same input as the first domain() case.
SELECT domainWithoutWWW('http://paul@www.example.com:80/') AS Host;
-- topLevelDomain(): regular host, IPv4 host, and hosts without / with a trailing dot.
SELECT topLevelDomain('http://paul@www.example.com:80/') AS Domain;
SELECT topLevelDomain('http://127.0.0.1:443/') AS Domain;
SELECT topLevelDomain('svn+ssh://example.ru?q=hello%20world') AS Domain;
SELECT topLevelDomain('svn+ssh://example.ru.?q=hello%20world') AS Domain;
-- decodeURLComponent(): input mixes "%20", a literal '+' and "%2B".
SELECT decodeURLComponent(pathFull('http://127.0.0.1/?query=hello%20world+foo%2Bbar')) AS Path;