Fix cutToFirstSignificantSubdomainCustom()/firstSignificantSubdomainCustom() for 3+level domains

Custom TLD lists (added in #17748) may contain domains of the 3rd level
(or deeper), whereas the builtin TLD list does not have such records,
so the builtin variant is not affected.

Note that this will significantly increase the number of hashtable lookups.

Fixes: #17748
This commit is contained in:
Azat Khuzhin 2021-03-21 10:55:28 +03:00
parent 30cd1c6145
commit b68517f69e
4 changed files with 101 additions and 10 deletions

View File

@ -91,6 +91,69 @@ struct ExtractFirstSignificantSubdomain
res_size = last_3_periods[0] - last_3_periods[1] - 1; res_size = last_3_periods[0] - last_3_periods[1] - 1;
} }
} }
/// The difference with execute() is due to custom TLD list can have records of any level,
/// not only 2-nd level (like non-custom variant), so it requires more lookups.
template <class Lookup>
static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{
res_data = data;
res_size = 0;
Pos tmp;
size_t domain_length;
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
if (domain_length == 0)
return;
if (out_domain_end)
*out_domain_end = tmp + domain_length;
/// cut useless dot
if (tmp[domain_length - 1] == '.')
--domain_length;
res_data = tmp;
res_size = domain_length;
auto begin = tmp;
auto end = begin + domain_length;
const char * last_2_periods[2]{};
const char * prev = begin - 1;
auto pos = find_first_symbols<'.'>(begin, end);
while (pos < end)
{
if (lookup(pos + 1, end - pos - 1))
{
res_data += prev + 1 - begin;
res_size = end - 1 - prev;
return;
}
last_2_periods[1] = last_2_periods[0];
last_2_periods[0] = pos;
prev = pos;
pos = find_first_symbols<'.'>(pos + 1, end);
}
/// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing
if (!last_2_periods[0])
return;
/// if there is domain of the second level -> always return itself
if (!last_2_periods[1])
{
res_size = last_2_periods[0] - begin;
return;
}
/// if there is domain of the 3+ level, and zero records in TLD list ->
/// fallback to domain of the second level
res_data += last_2_periods[1] + 1 - begin;
res_size = last_2_periods[0] - last_2_periods[1] - 1;
}
}; };
} }

View File

@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom
Pos tmp_data; Pos tmp_data;
size_t tmp_length; size_t tmp_length;
Pos domain_end; Pos domain_end;
ExtractFirstSignificantSubdomain<without_www>::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end); ExtractFirstSignificantSubdomain<without_www>::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0) if (tmp_length == 0)
return; return;

View File

@ -1,11 +1,24 @@
no-tld -- no-tld
foo.there-is-no-such-domain
foo.there-is-no-such-domain
foo.there-is-no-such-domain foo.there-is-no-such-domain
foo.there-is-no-such-domain foo.there-is-no-such-domain
foo foo
generic -- generic
kernel kernel
kernel.biz.ss kernel.biz.ss
difference -- difference
biz.ss biz.ss
kernel.biz.ss kernel.biz.ss
-- 3+level
xx.blogspot.co.at
blogspot
xx.blogspot.co.at
blogspot
-- url
foobar.com
foobar.com
foobar.com
xx.blogspot.co.at

View File

@ -1,16 +1,31 @@
select 'no-tld'; select '-- no-tld';
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
-- even if there is no TLD, 2-nd level by default anyway -- even if there is no TLD, 2-nd level by default anyway
-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real) -- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
select cutToFirstSignificantSubdomain('there-is-no-such-domain');
select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain');
select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain');
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list'); select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
select 'generic'; select '-- generic';
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
select 'difference'; select '-- difference';
-- biz.ss is not in the default TLD list, hence: -- biz.ss is not in the default TLD list, hence:
select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
-- Hosts whose matching record in the custom list spans more than two levels.
-- NOTE(review): presumably relies on 'blogspot.co.at' being present in public_suffix_list -- confirm against the list used by the test config.
select '-- 3+level';
select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
-- Same functions fed with full URLs (scheme and path) instead of bare hosts.
select '-- url';
select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list'); -- foobar.com
select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list'); -- foobar.com
select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list'); -- foobar.com
select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at