mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 17:20:50 +00:00
dbms: implement firstSignificantSubdomain, cutToFirstSignificantSubdomain. [#METR-13151]
This commit is contained in:
parent
5d21d75e85
commit
0d35ea0bf4
@ -113,6 +113,94 @@ struct ExtractDomain
|
||||
}
|
||||
};
|
||||
|
||||
struct ExtractFirstSignificantSubdomain
|
||||
{
|
||||
static size_t getReserveLengthForElement() { return 10; }
|
||||
|
||||
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
|
||||
{
|
||||
res_data = data;
|
||||
res_size = 0;
|
||||
|
||||
Pos tmp;
|
||||
size_t domain_length;
|
||||
ExtractDomain<true>::execute(data, size, tmp, domain_length);
|
||||
|
||||
if (domain_length == 0)
|
||||
return;
|
||||
|
||||
if (out_domain_end)
|
||||
*out_domain_end = tmp + domain_length;
|
||||
|
||||
/// cut useless dot
|
||||
if (tmp[domain_length - 1] == '.')
|
||||
--domain_length;
|
||||
|
||||
res_data = tmp;
|
||||
res_size = domain_length;
|
||||
|
||||
auto begin = tmp;
|
||||
auto end = begin + domain_length;
|
||||
const char * last_3_periods[3]{};
|
||||
auto pos = static_cast<const char *>(memchr(begin, '.', domain_length));
|
||||
|
||||
while (pos)
|
||||
{
|
||||
last_3_periods[2] = last_3_periods[1];
|
||||
last_3_periods[1] = last_3_periods[0];
|
||||
last_3_periods[0] = pos;
|
||||
pos = static_cast<const char *>(memchr(pos + 1, '.', end - pos - 1));
|
||||
}
|
||||
|
||||
if (!last_3_periods[0])
|
||||
return;
|
||||
|
||||
if (!last_3_periods[1])
|
||||
{
|
||||
res_size = last_3_periods[0] - begin;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!last_3_periods[2])
|
||||
last_3_periods[2] = begin - 1;
|
||||
|
||||
if (!strncmp(last_3_periods[1] + 1, "com", 3) ||
|
||||
!strncmp(last_3_periods[1] + 1, "net", 3) ||
|
||||
!strncmp(last_3_periods[1] + 1, "org", 3) ||
|
||||
!strncmp(last_3_periods[1] + 1, "co", 2))
|
||||
{
|
||||
res_data += last_3_periods[2] + 1 - begin;
|
||||
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
||||
return;
|
||||
}
|
||||
|
||||
res_data += last_3_periods[1] + 1 - begin;
|
||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||
}
|
||||
};
|
||||
|
||||
struct CutToFirstSignificantSubdomain
|
||||
{
|
||||
static size_t getReserveLengthForElement() { return 15; }
|
||||
|
||||
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size)
|
||||
{
|
||||
res_data = data;
|
||||
res_size = 0;
|
||||
|
||||
Pos tmp_data;
|
||||
size_t tmp_length;
|
||||
Pos domain_end;
|
||||
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
|
||||
|
||||
if (tmp_length == 0)
|
||||
return;
|
||||
|
||||
res_data = tmp_data;
|
||||
res_size = domain_end - tmp_data;
|
||||
}
|
||||
};
|
||||
|
||||
struct ExtractTopLevelDomain
|
||||
{
|
||||
static size_t getReserveLengthForElement() { return 5; }
|
||||
@ -839,12 +927,15 @@ struct CutSubstringImpl
|
||||
struct NameProtocol { static const char * get() { return "protocol"; } };
|
||||
struct NameDomain { static const char * get() { return "domain"; } };
|
||||
struct NameDomainWithoutWWW { static const char * get() { return "domainWithoutWWW"; } };
|
||||
struct NameFirstSignificantSubdomain { static const char * get() { return "firstSignificantSubdomain"; } };
|
||||
struct NameTopLevelDomain { static const char * get() { return "topLevelDomain"; } };
|
||||
struct NamePath { static const char * get() { return "path"; } };
|
||||
struct NameQueryString { static const char * get() { return "queryString"; } };
|
||||
struct NameFragment { static const char * get() { return "fragment"; } };
|
||||
struct NameQueryStringAndFragment { static const char * get() { return "queryStringAndFragment"; } };
|
||||
|
||||
struct NameCutToFirstSignificantSubdomain { static const char * get() { return "cutToFirstSignificantSubdomain"; } };
|
||||
|
||||
struct NameCutWWW { static const char * get() { return "cutWWW"; } };
|
||||
struct NameCutQueryString { static const char * get() { return "cutQueryString"; } };
|
||||
struct NameCutFragment { static const char * get() { return "cutFragment"; } };
|
||||
@ -856,12 +947,15 @@ struct NameCutURLParameter { static const char * get() { return "cutURLParam
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractProtocol>, NameProtocol> FunctionProtocol;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractDomain<false> >, NameDomain> FunctionDomain;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractDomain<true> >, NameDomainWithoutWWW> FunctionDomainWithoutWWW;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, NameFirstSignificantSubdomain> FunctionFirstSignificantSubdomain;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain>, NameTopLevelDomain> FunctionTopLevelDomain;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractPath>, NamePath> FunctionPath;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractQueryString<true> >, NameQueryString> FunctionQueryString;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractFragment<true> >, NameFragment> FunctionFragment;
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<ExtractQueryStringAndFragment<true> >, NameQueryStringAndFragment> FunctionQueryStringAndFragment;
|
||||
|
||||
typedef FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, NameCutToFirstSignificantSubdomain> FunctionCutToFirstSignificantSubdomain;
|
||||
|
||||
typedef FunctionStringToString<CutSubstringImpl<ExtractWWW>, NameCutWWW> FunctionCutWWW;
|
||||
typedef FunctionStringToString<CutSubstringImpl<ExtractQueryString<false> >, NameCutQueryString> FunctionCutQueryString;
|
||||
typedef FunctionStringToString<CutSubstringImpl<ExtractFragment<false> >, NameCutFragment> FunctionCutFragment;
|
||||
|
@ -11,6 +11,7 @@ void registerFunctionsURL(FunctionFactory & factory)
|
||||
factory.registerFunction("protocol", F { return new FunctionProtocol; });
|
||||
factory.registerFunction("domain", F { return new FunctionDomain; });
|
||||
factory.registerFunction("domainWithoutWWW", F { return new FunctionDomainWithoutWWW; });
|
||||
factory.registerFunction("firstSignificantSubdomain", F { return new FunctionFirstSignificantSubdomain; });
|
||||
factory.registerFunction("topLevelDomain", F { return new FunctionTopLevelDomain; });
|
||||
factory.registerFunction("path", F { return new FunctionPath; });
|
||||
factory.registerFunction("queryString", F { return new FunctionQueryString; });
|
||||
@ -21,6 +22,7 @@ void registerFunctionsURL(FunctionFactory & factory)
|
||||
factory.registerFunction("extractURLParameterNames", F { return new FunctionExtractURLParameterNames; });
|
||||
factory.registerFunction("URLHierarchy", F { return new FunctionURLHierarchy; });
|
||||
factory.registerFunction("URLPathHierarchy", F { return new FunctionURLPathHierarchy; });
|
||||
factory.registerFunction("cutToFirstSignificantSubdomain", F { return new FunctionCutToFirstSignificantSubdomain; });
|
||||
factory.registerFunction("cutWWW", F { return new FunctionCutWWW; });
|
||||
factory.registerFunction("cutQueryString", F { return new FunctionCutQueryString; });
|
||||
factory.registerFunction("cutFragment", F { return new FunctionCutFragment; });
|
||||
|
Loading…
Reference in New Issue
Block a user