mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Add cutToFirstSignificantSubdomainWithWWW()
Sometimes it is odd to get the TLD itself from cutToFirstSignificantSubdomain() (since you will not get the TLD itself if you pass it directly): - cutToFirstSignificantSubdomain('org') -> "" - cutToFirstSignificantSubdomain('www.org') -> org - cutToFirstSignificantSubdomain('kernel.org') -> kernel.org - cutToFirstSignificantSubdomain('www.kernel.org') -> kernel.org So add one more function to get www.org in this case: - cutToFirstSignificantSubdomainWithWWW('org') -> "" - cutToFirstSignificantSubdomainWithWWW('www.org') -> www.org - cutToFirstSignificantSubdomainWithWWW('kernel.org') -> kernel.org - cutToFirstSignificantSubdomainWithWWW('www.kernel.org') -> kernel.org P.S. not sure about the naming though, so it would be great if someone has a suggestion for the name.
This commit is contained in:
parent
0bc60e2d53
commit
48645eae33
@ -115,7 +115,21 @@ Returns the “first significant subdomain”. This is a non-standard concept sp
|
|||||||
|
|
||||||
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain” (see the explanation above).
|
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain” (see the explanation above).
|
||||||
|
|
||||||
For example, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
|
For example:
|
||||||
|
|
||||||
|
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
|
||||||
|
- `cutToFirstSignificantSubdomain('www.tr') = 'tr'`.
|
||||||
|
- `cutToFirstSignificantSubdomain('tr') = ''`.
|
||||||
|
|
||||||
|
### cutToFirstSignificantSubdomainWithWWW {#cuttofirstsignificantsubdomainwithwww}
|
||||||
|
|
||||||
|
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain”, without stripping "www".
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
- `cutToFirstSignificantSubdomainWithWWW('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
|
||||||
|
- `cutToFirstSignificantSubdomainWithWWW('www.tr') = 'www.tr'`.
|
||||||
|
- `cutToFirstSignificantSubdomainWithWWW('tr') = ''`.
|
||||||
|
|
||||||
### port(URL\[, default_port = 0\]) {#port}
|
### port(URL\[, default_port = 0\]) {#port}
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
|
template <bool without_www>
|
||||||
struct CutToFirstSignificantSubdomain
|
struct CutToFirstSignificantSubdomain
|
||||||
{
|
{
|
||||||
static size_t getReserveLengthForElement() { return 15; }
|
static size_t getReserveLengthForElement() { return 15; }
|
||||||
@ -18,7 +19,7 @@ struct CutToFirstSignificantSubdomain
|
|||||||
Pos tmp_data;
|
Pos tmp_data;
|
||||||
size_t tmp_length;
|
size_t tmp_length;
|
||||||
Pos domain_end;
|
Pos domain_end;
|
||||||
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
|
ExtractFirstSignificantSubdomain<without_www>::execute(data, size, tmp_data, tmp_length, &domain_end);
|
||||||
|
|
||||||
if (tmp_length == 0)
|
if (tmp_length == 0)
|
||||||
return;
|
return;
|
||||||
@ -29,11 +30,15 @@ struct CutToFirstSignificantSubdomain
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
|
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
|
||||||
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;
|
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<true>>, NameCutToFirstSignificantSubdomain>;
|
||||||
|
|
||||||
|
struct NameCutToFirstSignificantSubdomainWithWWW { static constexpr auto name = "cutToFirstSignificantSubdomainWithWWW"; };
|
||||||
|
using FunctionCutToFirstSignificantSubdomainWithWWW = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<false>>, NameCutToFirstSignificantSubdomainWithWWW>;
|
||||||
|
|
||||||
void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory)
|
void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory)
|
||||||
{
|
{
|
||||||
factory.registerFunction<FunctionCutToFirstSignificantSubdomain>();
|
factory.registerFunction<FunctionCutToFirstSignificantSubdomain>();
|
||||||
|
factory.registerFunction<FunctionCutToFirstSignificantSubdomainWithWWW>();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ namespace DB
|
|||||||
{
|
{
|
||||||
|
|
||||||
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
|
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
|
||||||
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, NameFirstSignificantSubdomain>;
|
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain<true>>, NameFirstSignificantSubdomain>;
|
||||||
|
|
||||||
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory)
|
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory)
|
||||||
{
|
{
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
|
template <bool without_www>
|
||||||
struct ExtractFirstSignificantSubdomain
|
struct ExtractFirstSignificantSubdomain
|
||||||
{
|
{
|
||||||
static size_t getReserveLengthForElement() { return 10; }
|
static size_t getReserveLengthForElement() { return 10; }
|
||||||
@ -18,7 +19,7 @@ struct ExtractFirstSignificantSubdomain
|
|||||||
|
|
||||||
Pos tmp;
|
Pos tmp;
|
||||||
size_t domain_length;
|
size_t domain_length;
|
||||||
ExtractDomain<true>::execute(data, size, tmp, domain_length);
|
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
|
||||||
|
|
||||||
if (domain_length == 0)
|
if (domain_length == 0)
|
||||||
return;
|
return;
|
||||||
|
@ -78,6 +78,15 @@ example.com
|
|||||||
example.com
|
example.com
|
||||||
example.com
|
example.com
|
||||||
example.com
|
example.com
|
||||||
|
com
|
||||||
|
|
||||||
|
====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====
|
||||||
|
|
||||||
|
www.com
|
||||||
|
example.com
|
||||||
|
example.com
|
||||||
|
example.com
|
||||||
|
example.com
|
||||||
====CUT WWW====
|
====CUT WWW====
|
||||||
http://example.com
|
http://example.com
|
||||||
http://example.com:1234
|
http://example.com:1234
|
||||||
|
@ -86,6 +86,16 @@ SELECT cutToFirstSignificantSubdomain('http://paul@www.example.com/a/b/c?a=b#d=f
|
|||||||
SELECT cutToFirstSignificantSubdomain('//paul@www.example.com/a/b/c?a=b#d=f');
|
SELECT cutToFirstSignificantSubdomain('//paul@www.example.com/a/b/c?a=b#d=f');
|
||||||
SELECT cutToFirstSignificantSubdomain('www.example.com');
|
SELECT cutToFirstSignificantSubdomain('www.example.com');
|
||||||
SELECT cutToFirstSignificantSubdomain('example.com');
|
SELECT cutToFirstSignificantSubdomain('example.com');
|
||||||
|
SELECT cutToFirstSignificantSubdomain('www.com');
|
||||||
|
SELECT cutToFirstSignificantSubdomain('com');
|
||||||
|
|
||||||
|
SELECT '====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====';
|
||||||
|
SELECT cutToFirstSignificantSubdomainWithWWW('http://com');
|
||||||
|
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.com');
|
||||||
|
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com');
|
||||||
|
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.foo.example.com');
|
||||||
|
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com:1');
|
||||||
|
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com/');
|
||||||
|
|
||||||
SELECT '====CUT WWW====';
|
SELECT '====CUT WWW====';
|
||||||
SELECT cutWWW('http://www.example.com');
|
SELECT cutWWW('http://www.example.com');
|
||||||
|
Loading…
Reference in New Issue
Block a user