Relax symbols that are allowed in userinfo in netloc()

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
This commit is contained in:
Azat Khuzhin 2023-02-24 20:20:49 +01:00
parent 83071164cc
commit 066389e6ff
3 changed files with 29 additions and 7 deletions

View File

@ -7,6 +7,7 @@
namespace DB
{
/// NOTE: Implementation is not RFC3986 compatible
struct ExtractNetloc
{
/// We use the same as domain function
@ -67,6 +68,7 @@ struct ExtractNetloc
/// Now pos points to the first byte after the scheme (if there is one).
bool has_identification = false;
Pos hostname_end = end;
Pos question_mark_pos = end;
Pos slash_pos = end;
Pos start_of_host = pos;
@ -90,23 +92,37 @@ struct ExtractNetloc
return std::string_view(start_of_host, pos - start_of_host);
case '@': /// foo:bar@example.ru
has_identification = true;
hostname_end = end;
break;
case ';':
case '=':
case '&':
case '~':
case '%':
/// Symbols above are sub-delims (';', '=', '&'), unreserved ('~') or
/// the start of a pct-encoded octet ('%') in RFC3986, and should be
/// allowed for userinfo (named identification here).
///
/// NOTE: those symbols are allowed for reg-name (host) too,
/// but right now host parsing follows RFC1034 more closely
/// (in other words, only domains that are allowed to be
/// registered).
if (!has_identification)
{
hostname_end = pos;
break;
}
[[fallthrough]];
case ' ': /// restricted symbols in whole URL
case '\t':
case '<':
case '>':
case '%':
case '{':
case '}':
case '|':
case '\\':
case '^':
case '~':
case '[':
case ']':
case ';':
case '=':
case '&':
return pos > start_of_host
? std::string_view(start_of_host, std::min(std::min(pos, question_mark_pos), slash_pos) - start_of_host)
: std::string_view();
@ -116,7 +132,7 @@ struct ExtractNetloc
if (has_identification)
return std::string_view(start_of_host, pos - start_of_host);
else
return std::string_view(start_of_host, std::min(std::min(pos, question_mark_pos), slash_pos) - start_of_host);
return std::string_view(start_of_host, std::min(std::min(std::min(pos, question_mark_pos), slash_pos), hostname_end) - start_of_host);
}
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)

View File

@ -53,7 +53,10 @@ paul:zozo@example.ru
www.example.com
www.example.com
example.com
foo:foo
foo:foo%@foo.com
foo:foo%41bar@foo.com
foo:foo%41%42bar@foo.com
foo:foo%41bar@foo
====DOMAIN====
com

View File

@ -42,6 +42,9 @@ SELECT netloc('//www.example.com') AS Netloc;
SELECT netloc('www.example.com') as Netloc;
SELECT netloc('example.com') as Netloc;
SELECT netloc('http://foo:foo%@foo.com') as Netloc;
SELECT netloc('http://foo:foo%41bar@foo.com') as Netloc;
SELECT netloc('http://foo:foo%41%42bar@foo.com') as Netloc;
SELECT netloc('http://foo:foo%41bar@foo%41.com') as Netloc;
SELECT '====DOMAIN====';
SELECT topLevelDomain('http://paul@www.example.com:80/') AS Domain;