mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
Fix cutToFirstSignificantSubdomainCustom()/firstSignificantSubdomainCustom() for 3+level domains
Custom TLD lists (added in #17748) may contain domains of the 3rd level; the builtin TLD list does not have such records, so it is not affected. Note that this will significantly increase the number of hashtable lookups. Fixes: #17748
This commit is contained in:
parent
30cd1c6145
commit
b68517f69e
@ -91,6 +91,69 @@ struct ExtractFirstSignificantSubdomain
|
||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// The difference with execute() is due to custom TLD list can have records of any level,
|
||||
/// not only 2-nd level (like non-custom variant), so it requires more lookups.
|
||||
template <class Lookup>
|
||||
static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
|
||||
{
|
||||
res_data = data;
|
||||
res_size = 0;
|
||||
|
||||
Pos tmp;
|
||||
size_t domain_length;
|
||||
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
|
||||
|
||||
if (domain_length == 0)
|
||||
return;
|
||||
|
||||
if (out_domain_end)
|
||||
*out_domain_end = tmp + domain_length;
|
||||
|
||||
/// cut useless dot
|
||||
if (tmp[domain_length - 1] == '.')
|
||||
--domain_length;
|
||||
|
||||
res_data = tmp;
|
||||
res_size = domain_length;
|
||||
|
||||
auto begin = tmp;
|
||||
auto end = begin + domain_length;
|
||||
const char * last_2_periods[2]{};
|
||||
const char * prev = begin - 1;
|
||||
|
||||
auto pos = find_first_symbols<'.'>(begin, end);
|
||||
while (pos < end)
|
||||
{
|
||||
if (lookup(pos + 1, end - pos - 1))
|
||||
{
|
||||
res_data += prev + 1 - begin;
|
||||
res_size = end - 1 - prev;
|
||||
return;
|
||||
}
|
||||
|
||||
last_2_periods[1] = last_2_periods[0];
|
||||
last_2_periods[0] = pos;
|
||||
prev = pos;
|
||||
pos = find_first_symbols<'.'>(pos + 1, end);
|
||||
}
|
||||
|
||||
/// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing
|
||||
if (!last_2_periods[0])
|
||||
return;
|
||||
|
||||
/// if there is domain of the second level -> always return itself
|
||||
if (!last_2_periods[1])
|
||||
{
|
||||
res_size = last_2_periods[0] - begin;
|
||||
return;
|
||||
}
|
||||
|
||||
/// if there is domain of the 3+ level, and zero records in TLD list ->
|
||||
/// fallback to domain of the second level
|
||||
res_data += last_2_periods[1] + 1 - begin;
|
||||
res_size = last_2_periods[0] - last_2_periods[1] - 1;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom
|
||||
Pos tmp_data;
|
||||
size_t tmp_length;
|
||||
Pos domain_end;
|
||||
ExtractFirstSignificantSubdomain<without_www>::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
|
||||
ExtractFirstSignificantSubdomain<without_www>::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
|
||||
|
||||
if (tmp_length == 0)
|
||||
return;
|
||||
|
@ -1,11 +1,24 @@
|
||||
no-tld
|
||||
-- no-tld
|
||||
|
||||
foo.there-is-no-such-domain
|
||||
foo.there-is-no-such-domain
|
||||
|
||||
foo.there-is-no-such-domain
|
||||
foo.there-is-no-such-domain
|
||||
foo
|
||||
generic
|
||||
-- generic
|
||||
kernel
|
||||
kernel.biz.ss
|
||||
difference
|
||||
-- difference
|
||||
biz.ss
|
||||
kernel.biz.ss
|
||||
-- 3+level
|
||||
xx.blogspot.co.at
|
||||
blogspot
|
||||
xx.blogspot.co.at
|
||||
blogspot
|
||||
-- url
|
||||
foobar.com
|
||||
foobar.com
|
||||
foobar.com
|
||||
xx.blogspot.co.at
|
||||
|
@ -1,16 +1,31 @@
|
||||
select 'no-tld';
|
||||
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
|
||||
select '-- no-tld';
|
||||
-- even if there is no matching TLD, the 2-nd level domain is returned by default anyway
|
||||
-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
|
||||
select cutToFirstSignificantSubdomain('there-is-no-such-domain');
|
||||
select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain');
|
||||
select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain');
|
||||
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
|
||||
select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
|
||||
select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
|
||||
select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
|
||||
|
||||
select 'generic';
|
||||
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
||||
select '-- generic';
|
||||
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel
|
||||
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
||||
|
||||
select 'difference';
|
||||
select '-- difference';
|
||||
-- biz.ss is not in the default TLD list, hence:
|
||||
select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
|
||||
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
||||
|
||||
select '-- 3+level';
|
||||
select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
|
||||
select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
|
||||
select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
|
||||
select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
|
||||
|
||||
select '-- url';
|
||||
select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list');
|
||||
select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list');
|
||||
select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list');
|
||||
select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list');
|
||||
|
Loading…
Reference in New Issue
Block a user