mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
Fix cutToFirstSignificantSubdomainCustom()/firstSignificantSubdomainCustom() for 3+level domains
Custom TLD lists (added in #17748) may contain domains of the 3rd level or deeper; the built-in TLD list has no such records, so the non-custom functions are not affected. Note that this significantly increases the number of hash table lookups. Fixes: #17748
This commit is contained in:
parent
30cd1c6145
commit
b68517f69e
@ -91,6 +91,69 @@ struct ExtractFirstSignificantSubdomain
|
|||||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The difference with execute() is due to custom TLD list can have records of any level,
|
||||||
|
/// not only 2-nd level (like non-custom variant), so it requires more lookups.
|
||||||
|
template <class Lookup>
|
||||||
|
static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
|
||||||
|
{
|
||||||
|
res_data = data;
|
||||||
|
res_size = 0;
|
||||||
|
|
||||||
|
Pos tmp;
|
||||||
|
size_t domain_length;
|
||||||
|
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
|
||||||
|
|
||||||
|
if (domain_length == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (out_domain_end)
|
||||||
|
*out_domain_end = tmp + domain_length;
|
||||||
|
|
||||||
|
/// cut useless dot
|
||||||
|
if (tmp[domain_length - 1] == '.')
|
||||||
|
--domain_length;
|
||||||
|
|
||||||
|
res_data = tmp;
|
||||||
|
res_size = domain_length;
|
||||||
|
|
||||||
|
auto begin = tmp;
|
||||||
|
auto end = begin + domain_length;
|
||||||
|
const char * last_2_periods[2]{};
|
||||||
|
const char * prev = begin - 1;
|
||||||
|
|
||||||
|
auto pos = find_first_symbols<'.'>(begin, end);
|
||||||
|
while (pos < end)
|
||||||
|
{
|
||||||
|
if (lookup(pos + 1, end - pos - 1))
|
||||||
|
{
|
||||||
|
res_data += prev + 1 - begin;
|
||||||
|
res_size = end - 1 - prev;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
last_2_periods[1] = last_2_periods[0];
|
||||||
|
last_2_periods[0] = pos;
|
||||||
|
prev = pos;
|
||||||
|
pos = find_first_symbols<'.'>(pos + 1, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing
|
||||||
|
if (!last_2_periods[0])
|
||||||
|
return;
|
||||||
|
|
||||||
|
/// if there is domain of the second level -> always return itself
|
||||||
|
if (!last_2_periods[1])
|
||||||
|
{
|
||||||
|
res_size = last_2_periods[0] - begin;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// if there is domain of the 3+ level, and zero records in TLD list ->
|
||||||
|
/// fallback to domain of the second level
|
||||||
|
res_data += last_2_periods[1] + 1 - begin;
|
||||||
|
res_size = last_2_periods[0] - last_2_periods[1] - 1;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom
|
|||||||
Pos tmp_data;
|
Pos tmp_data;
|
||||||
size_t tmp_length;
|
size_t tmp_length;
|
||||||
Pos domain_end;
|
Pos domain_end;
|
||||||
ExtractFirstSignificantSubdomain<without_www>::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
|
ExtractFirstSignificantSubdomain<without_www>::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
|
||||||
|
|
||||||
if (tmp_length == 0)
|
if (tmp_length == 0)
|
||||||
return;
|
return;
|
||||||
|
@ -1,11 +1,24 @@
|
|||||||
no-tld
|
-- no-tld
|
||||||
|
|
||||||
|
foo.there-is-no-such-domain
|
||||||
|
foo.there-is-no-such-domain
|
||||||
|
|
||||||
foo.there-is-no-such-domain
|
foo.there-is-no-such-domain
|
||||||
foo.there-is-no-such-domain
|
foo.there-is-no-such-domain
|
||||||
foo
|
foo
|
||||||
generic
|
-- generic
|
||||||
kernel
|
kernel
|
||||||
kernel.biz.ss
|
kernel.biz.ss
|
||||||
difference
|
-- difference
|
||||||
biz.ss
|
biz.ss
|
||||||
kernel.biz.ss
|
kernel.biz.ss
|
||||||
|
-- 3+level
|
||||||
|
xx.blogspot.co.at
|
||||||
|
blogspot
|
||||||
|
xx.blogspot.co.at
|
||||||
|
blogspot
|
||||||
|
-- url
|
||||||
|
foobar.com
|
||||||
|
foobar.com
|
||||||
|
foobar.com
|
||||||
|
xx.blogspot.co.at
|
||||||
|
@ -1,16 +1,31 @@
|
|||||||
select 'no-tld';
|
select '-- no-tld';
|
||||||
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
|
|
||||||
-- even if there is no TLD, 2-nd level by default anyway
|
-- even if there is no TLD, 2-nd level by default anyway
|
||||||
-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
|
-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
|
||||||
|
select cutToFirstSignificantSubdomain('there-is-no-such-domain');
|
||||||
|
select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain');
|
||||||
|
select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain');
|
||||||
|
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
|
||||||
select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
|
select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
|
||||||
select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
|
select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
|
||||||
select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
|
select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
|
||||||
|
|
||||||
select 'generic';
|
select '-- generic';
|
||||||
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel
|
||||||
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
||||||
|
|
||||||
select 'difference';
|
select '-- difference';
|
||||||
-- biz.ss is not in the default TLD list, hence:
|
-- biz.ss is not in the default TLD list, hence:
|
||||||
select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
|
select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
|
||||||
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
|
||||||
|
|
||||||
|
select '-- 3+level';
|
||||||
|
select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
|
||||||
|
select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
|
||||||
|
select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
|
||||||
|
select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
|
||||||
|
|
||||||
|
select '-- url';
|
||||||
|
select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list');
|
||||||
|
select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list');
|
||||||
|
select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list');
|
||||||
|
select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list');
|
||||||
|
Loading…
Reference in New Issue
Block a user