Add cutToFirstSignificantSubdomainWithWWW()

Sometimes it is odd to get the TLD itself from
cutToFirstSignificantSubdomain() (since you do not get the TLD if
you pass it directly):
- cutToFirstSignificantSubdomain('org')            -> ""
- cutToFirstSignificantSubdomain('www.org')        -> org
- cutToFirstSignificantSubdomain('kernel.org')     -> kernel.org
- cutToFirstSignificantSubdomain('www.kernel.org') -> kernel.org

So add one more function to get www.org in this case:
- cutToFirstSignificantSubdomainWithWWW('org')            -> ""
- cutToFirstSignificantSubdomainWithWWW('www.org')        -> www.org
- cutToFirstSignificantSubdomainWithWWW('kernel.org')     -> kernel.org
- cutToFirstSignificantSubdomainWithWWW('www.kernel.org') -> kernel.org

P.S. I am not sure about the naming, so it would be great if someone has
a suggestion for the name.
This commit is contained in:
Azat Khuzhin 2020-11-11 01:04:59 +03:00
parent 0bc60e2d53
commit 48645eae33
6 changed files with 44 additions and 5 deletions

View File

@ -115,7 +115,21 @@ Returns the “first significant subdomain”. This is a non-standard concept sp
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain” (see the explanation above).
For example, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
For example:
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
- `cutToFirstSignificantSubdomain('www.tr') = 'tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`.
### cutToFirstSignificantSubdomainWithWWW {#cuttofirstsignificantsubdomainwithwww}
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain”, without stripping "www".
For example:
- `cutToFirstSignificantSubdomainWithWWW('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
- `cutToFirstSignificantSubdomainWithWWW('www.tr') = 'www.tr'`.
- `cutToFirstSignificantSubdomainWithWWW('tr') = ''`.
### port(URL\[, default_port = 0\]) {#port}

View File

@ -6,6 +6,7 @@
namespace DB
{
template <bool without_www>
struct CutToFirstSignificantSubdomain
{
static size_t getReserveLengthForElement() { return 15; }
@ -18,7 +19,7 @@ struct CutToFirstSignificantSubdomain
Pos tmp_data;
size_t tmp_length;
Pos domain_end;
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
ExtractFirstSignificantSubdomain<without_www>::execute(data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0)
return;
@ -29,11 +30,15 @@ struct CutToFirstSignificantSubdomain
};
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<true>>, NameCutToFirstSignificantSubdomain>;
struct NameCutToFirstSignificantSubdomainWithWWW { static constexpr auto name = "cutToFirstSignificantSubdomainWithWWW"; };
using FunctionCutToFirstSignificantSubdomainWithWWW = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<false>>, NameCutToFirstSignificantSubdomainWithWWW>;
/// Registers both SQL functions defined in this file with the given factory:
/// cutToFirstSignificantSubdomain and cutToFirstSignificantSubdomainWithWWW.
void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory)
{
factory.registerFunction<FunctionCutToFirstSignificantSubdomain>();
factory.registerFunction<FunctionCutToFirstSignificantSubdomainWithWWW>();
}
}

View File

@ -7,7 +7,7 @@ namespace DB
{
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, NameFirstSignificantSubdomain>;
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain<true>>, NameFirstSignificantSubdomain>;
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory)
{

View File

@ -7,6 +7,7 @@
namespace DB
{
template <bool without_www>
struct ExtractFirstSignificantSubdomain
{
static size_t getReserveLengthForElement() { return 10; }
@ -18,7 +19,7 @@ struct ExtractFirstSignificantSubdomain
Pos tmp;
size_t domain_length;
ExtractDomain<true>::execute(data, size, tmp, domain_length);
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
if (domain_length == 0)
return;

View File

@ -78,6 +78,15 @@ example.com
example.com
example.com
example.com
com
====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====
www.com
example.com
example.com
example.com
example.com
====CUT WWW====
http://example.com
http://example.com:1234

View File

@ -86,6 +86,16 @@ SELECT cutToFirstSignificantSubdomain('http://paul@www.example.com/a/b/c?a=b#d=f
SELECT cutToFirstSignificantSubdomain('//paul@www.example.com/a/b/c?a=b#d=f');
SELECT cutToFirstSignificantSubdomain('www.example.com');
SELECT cutToFirstSignificantSubdomain('example.com');
SELECT cutToFirstSignificantSubdomain('www.com');
SELECT cutToFirstSignificantSubdomain('com');
SELECT '====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====';
SELECT cutToFirstSignificantSubdomainWithWWW('http://com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.foo.example.com');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com:1');
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com/');
SELECT '====CUT WWW====';
SELECT cutWWW('http://www.example.com');