mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-27 18:12:02 +00:00
Relax symbols that are allowed in userinfo in netloc()
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
This commit is contained in:
parent
83071164cc
commit
066389e6ff
@ -7,6 +7,7 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// NOTE: Implementation is not RFC3986 compatible
|
||||
struct ExtractNetloc
|
||||
{
|
||||
/// We use the same as domain function
|
||||
@ -67,6 +68,7 @@ struct ExtractNetloc
|
||||
/// Now pos points to the first byte after the scheme (if there is one).
|
||||
|
||||
bool has_identification = false;
|
||||
Pos hostname_end = end;
|
||||
Pos question_mark_pos = end;
|
||||
Pos slash_pos = end;
|
||||
Pos start_of_host = pos;
|
||||
@ -90,23 +92,37 @@ struct ExtractNetloc
|
||||
return std::string_view(start_of_host, pos - start_of_host);
|
||||
case '@': /// foo:bar@example.ru
|
||||
has_identification = true;
|
||||
hostname_end = end;
|
||||
break;
|
||||
case ';':
|
||||
case '=':
|
||||
case '&':
|
||||
case '~':
|
||||
case '%':
|
||||
/// Symbols above (sub-delims, unreserved '~' and pct-encoded '%' in RFC3986) should be
|
||||
/// allowed for userinfo (named identification here).
|
||||
///
|
||||
/// NOTE: those symbols are allowed for reg-name (host)
|
||||
/// too, but right now host parsing looks more like in
|
||||
/// RFC1034 (in other words domains that are allowed to be
|
||||
/// registered).
|
||||
if (!has_identification)
|
||||
{
|
||||
hostname_end = pos;
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case ' ': /// restricted symbols in whole URL
|
||||
case '\t':
|
||||
case '<':
|
||||
case '>':
|
||||
case '%':
|
||||
case '{':
|
||||
case '}':
|
||||
case '|':
|
||||
case '\\':
|
||||
case '^':
|
||||
case '~':
|
||||
case '[':
|
||||
case ']':
|
||||
case ';':
|
||||
case '=':
|
||||
case '&':
|
||||
return pos > start_of_host
|
||||
? std::string_view(start_of_host, std::min(std::min(pos, question_mark_pos), slash_pos) - start_of_host)
|
||||
: std::string_view();
|
||||
@ -116,7 +132,7 @@ struct ExtractNetloc
|
||||
if (has_identification)
|
||||
return std::string_view(start_of_host, pos - start_of_host);
|
||||
else
|
||||
return std::string_view(start_of_host, std::min(std::min(pos, question_mark_pos), slash_pos) - start_of_host);
|
||||
return std::string_view(start_of_host, std::min(std::min(std::min(pos, question_mark_pos), slash_pos), hostname_end) - start_of_host);
|
||||
}
|
||||
|
||||
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
|
||||
|
@ -53,7 +53,10 @@ paul:zozo@example.ru
|
||||
www.example.com
|
||||
www.example.com
|
||||
example.com
|
||||
foo:foo
|
||||
foo:foo%@foo.com
|
||||
foo:foo%41bar@foo.com
|
||||
foo:foo%41%42bar@foo.com
|
||||
foo:foo%41bar@foo
|
||||
====DOMAIN====
|
||||
com
|
||||
|
||||
|
@ -42,6 +42,9 @@ SELECT netloc('//www.example.com') AS Netloc;
|
||||
SELECT netloc('www.example.com') as Netloc;
|
||||
SELECT netloc('example.com') as Netloc;
|
||||
SELECT netloc('http://foo:foo%@foo.com') as Netloc;
|
||||
SELECT netloc('http://foo:foo%41bar@foo.com') as Netloc;
|
||||
SELECT netloc('http://foo:foo%41%42bar@foo.com') as Netloc;
|
||||
SELECT netloc('http://foo:foo%41bar@foo%41.com') as Netloc;
|
||||
|
||||
SELECT '====DOMAIN====';
|
||||
SELECT topLevelDomain('http://paul@www.example.com:80/') AS Domain;
|
||||
|
Loading…
Reference in New Issue
Block a user