Merge pull request #16845 from azat/cutToFirstSignificantSubdomainWithWWW

Add cutToFirstSignificantSubdomainWithWWW()
This commit is contained in:
Kruglov Pavel 2020-11-20 01:44:57 +03:00 committed by GitHub
commit 51bcd286f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 44 additions and 5 deletions

View File

@ -115,7 +115,21 @@ Returns the “first significant subdomain”. This is a non-standard concept sp
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain” (see the explanation above). Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain” (see the explanation above).
For example, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`. For example:
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
- `cutToFirstSignificantSubdomain('www.tr') = 'tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`.
### cutToFirstSignificantSubdomainWithWWW {#cuttofirstsignificantsubdomainwithwww}
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain”, without stripping "www".
For example:
- `cutToFirstSignificantSubdomainWithWWW('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
- `cutToFirstSignificantSubdomainWithWWW('www.tr') = 'www.tr'`.
- `cutToFirstSignificantSubdomainWithWWW('tr') = ''`.
### port(URL\[, default_port = 0\]) {#port} ### port(URL\[, default_port = 0\]) {#port}

View File

@ -6,6 +6,7 @@
namespace DB namespace DB
{ {
template <bool without_www>
struct CutToFirstSignificantSubdomain struct CutToFirstSignificantSubdomain
{ {
static size_t getReserveLengthForElement() { return 15; } static size_t getReserveLengthForElement() { return 15; }
@ -18,7 +19,7 @@ struct CutToFirstSignificantSubdomain
Pos tmp_data; Pos tmp_data;
size_t tmp_length; size_t tmp_length;
Pos domain_end; Pos domain_end;
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end); ExtractFirstSignificantSubdomain<without_www>::execute(data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0) if (tmp_length == 0)
return; return;
@ -29,11 +30,15 @@ struct CutToFirstSignificantSubdomain
}; };
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; }; struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>; using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<true>>, NameCutToFirstSignificantSubdomain>;
struct NameCutToFirstSignificantSubdomainWithWWW { static constexpr auto name = "cutToFirstSignificantSubdomainWithWWW"; };
using FunctionCutToFirstSignificantSubdomainWithWWW = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<false>>, NameCutToFirstSignificantSubdomainWithWWW>;
void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory) void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory)
{ {
factory.registerFunction<FunctionCutToFirstSignificantSubdomain>(); factory.registerFunction<FunctionCutToFirstSignificantSubdomain>();
factory.registerFunction<FunctionCutToFirstSignificantSubdomainWithWWW>();
} }
} }

View File

@ -7,7 +7,7 @@ namespace DB
{ {
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; }; struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, NameFirstSignificantSubdomain>; using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain<true>>, NameFirstSignificantSubdomain>;
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory) void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory)
{ {

View File

@ -7,6 +7,7 @@
namespace DB namespace DB
{ {
template <bool without_www>
struct ExtractFirstSignificantSubdomain struct ExtractFirstSignificantSubdomain
{ {
static size_t getReserveLengthForElement() { return 10; } static size_t getReserveLengthForElement() { return 10; }
@ -18,7 +19,7 @@ struct ExtractFirstSignificantSubdomain
Pos tmp; Pos tmp;
size_t domain_length; size_t domain_length;
ExtractDomain<true>::execute(data, size, tmp, domain_length); ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
if (domain_length == 0) if (domain_length == 0)
return; return;

View File

@ -78,6 +78,15 @@ example.com
example.com example.com
example.com example.com
example.com example.com
com
====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====
www.com
example.com
example.com
example.com
example.com
====CUT WWW==== ====CUT WWW====
http://example.com http://example.com
http://example.com:1234 http://example.com:1234

View File

@ -86,6 +86,16 @@ SELECT cutToFirstSignificantSubdomain('http://paul@www.example.com/a/b/c?a=b#d=f
SELECT cutToFirstSignificantSubdomain('//paul@www.example.com/a/b/c?a=b#d=f'); SELECT cutToFirstSignificantSubdomain('//paul@www.example.com/a/b/c?a=b#d=f');
SELECT cutToFirstSignificantSubdomain('www.example.com'); SELECT cutToFirstSignificantSubdomain('www.example.com');
SELECT cutToFirstSignificantSubdomain('example.com'); SELECT cutToFirstSignificantSubdomain('example.com');
SELECT cutToFirstSignificantSubdomain('www.com');
SELECT cutToFirstSignificantSubdomain('com');
SELECT '====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====';
SELECT cutToFirstSignificantSubdomainWithWWW('http://com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.foo.example.com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com:1');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com/');
SELECT '====CUT WWW===='; SELECT '====CUT WWW====';
SELECT cutWWW('http://www.example.com'); SELECT cutWWW('http://www.example.com');