mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 07:01:59 +00:00
Add cutToFirstSignificantSubdomainWithWWW()
Sometimes it is odd to get the TLD itself from cutToFirstSignificantSubdomain() (since you will not get the TLD itself if you pass it directly): - cutToFirstSignificantSubdomain('org') -> "" - cutToFirstSignificantSubdomain('www.org') -> org - cutToFirstSignificantSubdomain('kernel.org') -> kernel.org - cutToFirstSignificantSubdomain('www.kernel.org') -> kernel.org So add one more function that returns www.org in this case: - cutToFirstSignificantSubdomainWithWWW('org') -> "" - cutToFirstSignificantSubdomainWithWWW('www.org') -> www.org - cutToFirstSignificantSubdomainWithWWW('kernel.org') -> kernel.org - cutToFirstSignificantSubdomainWithWWW('www.kernel.org') -> kernel.org P.S. I am not sure about the naming though, so it would be great if someone has a suggestion for the name.
This commit is contained in:
parent
0bc60e2d53
commit
48645eae33
@ -115,7 +115,21 @@ Returns the “first significant subdomain”. This is a non-standard concept sp
|
||||
|
||||
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain” (see the explanation above).
|
||||
|
||||
For example, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
|
||||
For example:
|
||||
|
||||
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
|
||||
- `cutToFirstSignificantSubdomain('www.tr') = 'tr'`.
|
||||
- `cutToFirstSignificantSubdomain('tr') = ''`.
|
||||
|
||||
### cutToFirstSignificantSubdomainWithWWW {#cuttofirstsignificantsubdomainwithwww}
|
||||
|
||||
Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain”, without stripping "www".
|
||||
|
||||
For example:
|
||||
|
||||
- `cutToFirstSignificantSubdomainWithWWW('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
|
||||
- `cutToFirstSignificantSubdomainWithWWW('www.tr') = 'www.tr'`.
|
||||
- `cutToFirstSignificantSubdomainWithWWW('tr') = ''`.
|
||||
|
||||
### port(URL\[, default_port = 0\]) {#port}
|
||||
|
||||
|
@ -6,6 +6,7 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
template <bool without_www>
|
||||
struct CutToFirstSignificantSubdomain
|
||||
{
|
||||
static size_t getReserveLengthForElement() { return 15; }
|
||||
@ -18,7 +19,7 @@ struct CutToFirstSignificantSubdomain
|
||||
Pos tmp_data;
|
||||
size_t tmp_length;
|
||||
Pos domain_end;
|
||||
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
|
||||
ExtractFirstSignificantSubdomain<without_www>::execute(data, size, tmp_data, tmp_length, &domain_end);
|
||||
|
||||
if (tmp_length == 0)
|
||||
return;
|
||||
@ -29,11 +30,15 @@ struct CutToFirstSignificantSubdomain
|
||||
};
|
||||
|
||||
struct NameCutToFirstSignificantSubdomain { static constexpr auto name = "cutToFirstSignificantSubdomain"; };
|
||||
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain>;
|
||||
using FunctionCutToFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<true>>, NameCutToFirstSignificantSubdomain>;
|
||||
|
||||
struct NameCutToFirstSignificantSubdomainWithWWW { static constexpr auto name = "cutToFirstSignificantSubdomainWithWWW"; };
|
||||
using FunctionCutToFirstSignificantSubdomainWithWWW = FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain<false>>, NameCutToFirstSignificantSubdomainWithWWW>;
|
||||
|
||||
void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionCutToFirstSignificantSubdomain>();
|
||||
factory.registerFunction<FunctionCutToFirstSignificantSubdomainWithWWW>();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ namespace DB
|
||||
{
|
||||
|
||||
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
|
||||
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, NameFirstSignificantSubdomain>;
|
||||
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain<true>>, NameFirstSignificantSubdomain>;
|
||||
|
||||
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory)
|
||||
{
|
||||
|
@ -7,6 +7,7 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
template <bool without_www>
|
||||
struct ExtractFirstSignificantSubdomain
|
||||
{
|
||||
static size_t getReserveLengthForElement() { return 10; }
|
||||
@ -18,7 +19,7 @@ struct ExtractFirstSignificantSubdomain
|
||||
|
||||
Pos tmp;
|
||||
size_t domain_length;
|
||||
ExtractDomain<true>::execute(data, size, tmp, domain_length);
|
||||
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
|
||||
|
||||
if (domain_length == 0)
|
||||
return;
|
||||
|
@ -78,6 +78,15 @@ example.com
|
||||
example.com
|
||||
example.com
|
||||
example.com
|
||||
com
|
||||
|
||||
====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====
|
||||
|
||||
www.com
|
||||
example.com
|
||||
example.com
|
||||
example.com
|
||||
example.com
|
||||
====CUT WWW====
|
||||
http://example.com
|
||||
http://example.com:1234
|
||||
|
@ -86,6 +86,16 @@ SELECT cutToFirstSignificantSubdomain('http://paul@www.example.com/a/b/c?a=b#d=f
|
||||
SELECT cutToFirstSignificantSubdomain('//paul@www.example.com/a/b/c?a=b#d=f');
|
||||
SELECT cutToFirstSignificantSubdomain('www.example.com');
|
||||
SELECT cutToFirstSignificantSubdomain('example.com');
|
||||
SELECT cutToFirstSignificantSubdomain('www.com');
|
||||
SELECT cutToFirstSignificantSubdomain('com');
|
||||
|
||||
SELECT '====CUT TO FIRST SIGNIFICANT SUBDOMAIN WITH WWW====';
|
||||
SELECT cutToFirstSignificantSubdomainWithWWW('http://com');
|
||||
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.com');
|
||||
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com');
|
||||
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.foo.example.com');
|
||||
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com:1');
|
||||
SELECT cutToFirstSignificantSubdomainWithWWW('http://www.example.com/');
|
||||
|
||||
SELECT '====CUT WWW====';
|
||||
SELECT cutWWW('http://www.example.com');
|
||||
|
Loading…
Reference in New Issue
Block a user