ClickHouse/tests/queries/0_stateless/01601_custom_tld.reference
Azat Khuzhin 1d4a7c7290 Add support of !/* (exclamation/asterisk) in custom TLDs
The public suffix list may contain special characters (the format is
described in [1]):
- asterisk (*)
- exclamation mark (!)

  [1]: https://github.com/publicsuffix/list/wiki/Format

It is easier to describe how they should be interpreted with examples.

Consider the following part of the list:

    *.sch.uk
    *.kawasaki.jp
    !city.kawasaki.jp

And here are the results for `cutToFirstSignificantSubdomainCustom()`:

If you have only asterisk (*):

    foo.something.sheffield.sch.uk -> something.sheffield.sch.uk
    sheffield.sch.uk               -> sheffield.sch.uk

If you have exclamation mark (!) too:

    foo.kawasaki.jp                -> foo.kawasaki.jp
    foo.foo.kawasaki.jp            -> foo.foo.kawasaki.jp
    city.kawasaki.jp               -> city.kawasaki.jp
    some.city.kawasaki.jp          -> city.kawasaki.jp

TLDs have been verified with the following script [2] to match the
Python publicsuffix2 module.

  [2]: https://gist.github.com/azat/c1a7a9f1e3519793134ef4b1df5461a6

v2: fix StringHashTable padding requirements
Fixes: #39468
Follow-up for: #17748
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-07-26 08:34:30 +03:00

92 lines
4.0 KiB
Plaintext

-- { echo }
select '-- no-tld';
-- no-tld
-- even if there is no TLD, 2-nd level by default anyway
-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
select cutToFirstSignificantSubdomain('there-is-no-such-domain');
select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain');
foo.there-is-no-such-domain
select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain');
foo.there-is-no-such-domain
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
foo.there-is-no-such-domain
select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
foo.there-is-no-such-domain
select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
foo
select '-- generic';
-- generic
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel
kernel
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
kernel.biz.ss
select '-- difference';
-- difference
-- biz.ss is not in the default TLD list, hence:
select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
biz.ss
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
kernel.biz.ss
select '-- 3+level';
-- 3+level
select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
xx.blogspot.co.at
select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
blogspot
select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
xx.blogspot.co.at
select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
blogspot
select '-- url';
-- url
select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list');
foobar.com
select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list');
foobar.com
select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list');
foobar.com
select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list');
xx.blogspot.co.at
select '-- www';
-- www
select cutToFirstSignificantSubdomainCustomWithWWW('http://www.foo', 'public_suffix_list');
www.foo
select cutToFirstSignificantSubdomainCustom('http://www.foo', 'public_suffix_list');
foo
select '-- vector';
-- vector
select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at/' || toString(number), 'public_suffix_list') from numbers(1);
xx.blogspot.co.at
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain' || toString(number), 'public_suffix_list') from numbers(1);
select '-- no new line';
-- no new line
select cutToFirstSignificantSubdomainCustom('foo.bar', 'no_new_line_list');
foo.bar
select cutToFirstSignificantSubdomainCustom('a.foo.bar', 'no_new_line_list');
a.foo.bar
select cutToFirstSignificantSubdomainCustom('a.foo.baz', 'no_new_line_list');
foo.baz
select '-- asterisk';
-- asterisk
select cutToFirstSignificantSubdomainCustom('foo.something.sheffield.sch.uk', 'public_suffix_list');
something.sheffield.sch.uk
select cutToFirstSignificantSubdomainCustom('something.sheffield.sch.uk', 'public_suffix_list');
something.sheffield.sch.uk
select cutToFirstSignificantSubdomainCustom('sheffield.sch.uk', 'public_suffix_list');
sheffield.sch.uk
select '-- exclamation mark';
-- exclamation mark
select cutToFirstSignificantSubdomainCustom('foo.kawasaki.jp', 'public_suffix_list');
foo.kawasaki.jp
select cutToFirstSignificantSubdomainCustom('foo.foo.kawasaki.jp', 'public_suffix_list');
foo.foo.kawasaki.jp
select cutToFirstSignificantSubdomainCustom('city.kawasaki.jp', 'public_suffix_list');
city.kawasaki.jp
select cutToFirstSignificantSubdomainCustom('some.city.kawasaki.jp', 'public_suffix_list');
city.kawasaki.jp