host extraction functionality was moved to UrlUtils

This commit is contained in:
artpaul 2016-12-10 00:31:16 +05:00
parent 544143f5c9
commit dfb466d156
6 changed files with 84 additions and 32 deletions

View File

@ -1,11 +1,17 @@
#pragma once
#include <cassert>
#include <stdexcept>
#include <string>
/// A lightweight non-owning read-only view into a subsequence of a string.
class StringView
{
public:
using size_type = size_t;
static constexpr size_type npos = size_type(-1);
public:
inline StringView() noexcept
: str(nullptr)
@ -41,11 +47,27 @@ public:
return str == nullptr;
}
/// Returns the stored length of the view (the value of `len`).
/// NOTE(review): two signature lines appear here because this text is a rendered diff
/// (old `size_t` return type followed by the new `size_type` one) — confirm against the real header.
inline size_t size() const noexcept
inline size_type size() const noexcept
{
return len;
}
public:
/**
 * Returns a view of the substring [pos, pos + count).
 *
 * If the requested range extends past the end of the view, or if
 * count == npos, the result is clamped to [pos, size()).
 *
 * @param pos    index of the first character; must be less than size().
 * @param count  maximum number of characters to include.
 * @throws std::out_of_range if pos >= size().
 */
StringView substr(size_type pos, size_type count = npos) const
{
    if (pos >= len)
        throw std::out_of_range("pos must be less than len");

    // Compare count with the remaining length instead of testing pos + count:
    // the sum wraps around for very large counts (not only npos) and would
    // otherwise let an oversized length slip through the bounds check.
    const size_type remaining = len - pos;
    return StringView(str + pos, count < remaining ? count : remaining);
}
public:
inline operator bool () const noexcept
{
@ -54,7 +76,7 @@ public:
private:
const char* str;
const size_t len;
size_t len;
};

View File

@ -2,9 +2,9 @@
#include <DB/Common/StringView.h>
/** Extracts scheme from given url.
*
* If there is no valid scheme then
* empty StringView will be returned.
*/
/// Extracts scheme from given url.
/// If there is no valid scheme then an empty StringView is returned.
StringView getUrlScheme(const StringView& url);
/// Extracts host from given url.
/// The scheme must be followed by "://"; otherwise, or when the host part is empty,
/// an empty StringView is returned. A "user@" prefix and any port, path, query or
/// fragment are not part of the result.
StringView getUrlHost(const StringView& url);

View File

@ -85,33 +85,21 @@ struct ExtractDomain
/// Reports the domain (host) of the URL stored in [data, data + size) through
/// res_data / res_size; an empty result is reported as res_data = data, res_size = 0.
/// NOTE(review): this span is taken from a rendered diff and appears to interleave the
/// removed hand-written scan with the added getUrlHost()-based code — confirm against the real file.
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
StringView host = getUrlHost(StringView(data, size));
Pos pos = data;
Pos end = pos + size;
// No host could be extracted: report an empty result anchored at the input.
if (host.empty())
{
res_data = data;
res_size = 0;
}
else
{
// Drop a leading "www." only when something follows it (host longer than 4 chars).
if (without_www && host.size() > 4 && !strncmp(host.data(), "www.", 4))
host = host.substr(4);
Pos tmp;
size_t protocol_length;
ExtractProtocol::execute(data, size, tmp, protocol_length);
pos += protocol_length + 3;
if (pos >= end || pos[-1] != '/' || pos[-2] != '/')
return;
if (without_www && pos + 4 < end && !strncmp(pos, "www.", 4))
pos += 4;
Pos domain_begin = pos;
while (pos < end && *pos != '/' && *pos != ':' && *pos != '?' && *pos != '#')
++pos;
if (pos == domain_begin)
return;
res_data = domain_begin;
res_size = pos - domain_begin;
res_data = host.data();
res_size = host.size();
}
}
};

View File

@ -22,3 +22,36 @@ StringView getUrlScheme(const StringView& url)
return StringView();
}
/// Extracts the host from the given url.
///
/// The url must look like `scheme://[userinfo@]host[:port][/path][?query][#fragment]`.
/// Returns an empty StringView when the scheme is not followed by "://" or when
/// the host part is empty. The returned view points into the memory of `url`.
StringView getUrlHost(const StringView& url)
{
    StringView scheme = getUrlScheme(url);
    const char* p = url.data() + scheme.size();
    const char* end = url.data() + url.size();

    // A colon must follow the scheme.
    if (p == end || *p != ':')
        return StringView();
    // The authority component must start with "//". Together with the colon
    // that needs three readable characters, so check the length before p[2].
    if (end - p < 3 || p[1] != '/' || p[2] != '/')
        return StringView();
    p += 3;

    const char* host_begin = p;
    const char* host_end = nullptr; // first ':' seen after host_begin, if any

    // Scan the whole authority (it ends at '/', '?' or '#').
    for (; p < end; ++p)
    {
        if (*p == '@')
        {
            // Everything before '@' was userinfo ("user" or "user:password"),
            // so a colon remembered so far did not introduce a port.
            host_begin = p + 1;
            host_end = nullptr;
        }
        else if (*p == ':')
        {
            if (host_end == nullptr)
                host_end = p;
        }
        else if (*p == '/' || *p == '?' || *p == '#')
        {
            break;
        }
    }

    // Without a port the host runs up to the end of the authority.
    if (host_end == nullptr)
        host_end = p;

    return (host_end == host_begin) ? StringView() : StringView(host_begin, host_end - host_begin);
}

View File

@ -2,3 +2,7 @@ http
https
svn+ssh
www.example.com
www.example.com
example.com

View File

@ -2,3 +2,8 @@ SELECT protocol('http://example.com') AS Scheme;
SELECT protocol('https://example.com/') AS Scheme;
SELECT protocol('svn+ssh://example.com?q=hello%20world') AS Scheme;
SELECT protocol('ftp!://example.com/') AS Scheme;
SELECT domain('http://paul@www.example.com:80/') AS Host;
SELECT domain('http:/paul/example/com') AS Host;
SELECT domain('http://www.example.com?q=4') AS Host;
SELECT domainWithoutWWW('http://paul@www.example.com:80/') AS Host;