diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md
index 0da74ce1b0e..2250c0ead20 100644
--- a/docs/en/sql-reference/functions/url-functions.md
+++ b/docs/en/sql-reference/functions/url-functions.md
@@ -131,6 +131,38 @@ For example:
- `cutToFirstSignificantSubdomain('www.tr') = 'www.tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`.
+### cutToFirstSignificantSubdomainCustom {#cuttofirstsignificantsubdomaincustom}
+
+Same as `cutToFirstSignificantSubdomain` but accepts a custom TLD list name. It is useful if:
+
+- you need a fresh TLD list,
+- or you have a custom one.
+
+Configuration example:
+
+```xml
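+<top_level_domains_lists>
+    <!-- The element name is the name of the TLD list, the value is the path to the file. -->
+    <public_suffix_list>/path/to/public_suffix_list.dat</public_suffix_list>
+</top_level_domains_lists>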
+```
+
+Example:
+
+- `cutToFirstSignificantSubdomainCustom('https://news.yandex.com.tr/', 'public_suffix_list') = 'yandex.com.tr'`.
+
+### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
+
+Same as `cutToFirstSignificantSubdomainWithWWW` but accepts a custom TLD list name.
+
+### firstSignificantSubdomainCustom {#firstsignificantsubdomaincustom}
+
+Same as `firstSignificantSubdomain` but accepts a custom TLD list name.
+
+### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
+
+Same as `cutToFirstSignificantSubdomainWithWWW` but accepts a custom TLD list name.
+
### port(URL\[, default_port = 0\]) {#port}
Returns the port or `default_port` if there is no port in the URL (or in case of validation error).
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index ad849f3c6e5..4480f34bb60 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -34,6 +34,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -483,6 +484,9 @@ int Server::main(const std::vector & /*args*/)
Poco::File(dictionaries_lib_path).createDirectories();
}
+ /// top_level_domains_lists
+ TLDListsHolder::getInstance().parseConfig(config());
+
{
Poco::File(path + "data/").createDirectories();
Poco::File(path + "metadata/").createDirectories();
diff --git a/programs/server/config.xml b/programs/server/config.xml
index 851a7654d53..a46c6ae6eec 100644
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@@ -724,6 +724,17 @@
+
+
+
+
+
diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp
index 384c29ed675..1e381808d16 100644
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@@ -528,6 +528,7 @@
M(559, INVALID_GRPC_QUERY_INFO) \
M(560, ZSTD_ENCODER_FAILED) \
M(561, ZSTD_DECODER_FAILED) \
+ M(562, TLD_LIST_NOT_FOUND) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \
diff --git a/src/Common/TLDListsHolder.cpp b/src/Common/TLDListsHolder.cpp
new file mode 100644
index 00000000000..cbad8beaa7d
--- /dev/null
+++ b/src/Common/TLDListsHolder.cpp
@@ -0,0 +1,94 @@
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int TLD_LIST_NOT_FOUND;
+}
+
+///
+/// TLDList
+///
+/// Adds `host` to the list. Only the StringRefHash value of `host` is stored, not the string itself.
+/// Returns the `second` member of what the container's insert() returns.
+bool TLDList::insert(const StringRef & host)
+{
+    const auto host_hash = StringRefHash{}(host);
+    const auto insertion = tld_container.insert(host_hash);
+    return insertion.second;
+}
+/// Checks whether the StringRefHash value of `host` is present in the container.
+/// The comparison is done on hashes only, so hosts with equal hashes are indistinguishable.
+bool TLDList::has(const StringRef & host) const
+{
+    const auto host_hash = StringRefHash{}(host);
+    return tld_container.has(host_hash);
+}
+
+///
+/// TLDListsHolder
+///
+TLDListsHolder & TLDListsHolder::getInstance() /// Returns the single shared holder object.
+{
+ static TLDListsHolder instance; /// Function-local static: every call returns a reference to this same object.
+ return instance;
+}
+TLDListsHolder::TLDListsHolder() = default; /// Members are default-initialized; lists are added later by parseAndAddTldList().
+
+/// Loads every list declared under the "top_level_domains_lists" configuration section.
+/// Each child key is used as the list name and its string value as the path of the file to parse.
+void TLDListsHolder::parseConfig(const Poco::Util::AbstractConfiguration & config)
+{
+    Poco::Util::AbstractConfiguration::Keys list_names;
+    config.keys("top_level_domains_lists", list_names);
+
+    Poco::Logger * log = &Poco::Logger::get("TLDListsHolder");
+
+    for (const auto & list_name : list_names)
+    {
+        const auto list_path = config.getString("top_level_domains_lists." + list_name);
+        LOG_TRACE(log, "{} loading from {}", list_name, list_path);
+
+        const size_t loaded_hosts = parseAndAddTldList(list_name, list_path);
+        LOG_INFO(log, "{} was added ({} hosts)", list_name, loaded_hosts);
+    }
+}
+
+/// Parses the file at `path` line by line and registers the resulting list under `name`.
+///
+/// Every line is trimmed with trim(); empty lines and lines that start with "//" are skipped,
+/// all other lines are inserted into the list.
+///
+/// Returns the number of entries in the parsed list.
+size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path)
+{
+    TLDList tld_list;
+
+    /// Handles one complete line (without the trailing '\n').
+    auto add_line = [&tld_list](std::string_view line)
+    {
+        /// Trim first, so that indented comments and whitespace-only lines are recognized too.
+        trim(line);
+        if (line.empty())
+            return;
+        /// ">= 2" (not "> 2"): a bare "//" line is a comment as well.
+        if (line.size() >= 2 && line[0] == '/' && line[1] == '/')
+            return;
+        tld_list.insert(StringRef{line.data(), line.size()});
+    };
+
+    ReadBufferFromFile in(path);
+    /// Beginning of a line whose '\n' has not been seen yet (the line crosses a buffer boundary).
+    std::string pending;
+    while (!in.eof())
+    {
+        char * buffer_end = in.buffer().end();
+        char * newline = find_first_symbols<'\n'>(in.position(), buffer_end);
+        const size_t chunk_size = newline - in.position();
+
+        if (newline >= buffer_end)
+        {
+            /// No '\n' in the rest of the buffer: remember the partial line and consume the buffer,
+            /// instead of stopping here and losing the rest of the file.
+            /// NOTE(review): relies on eof() fetching the next portion of data once the buffer is consumed.
+            pending.append(in.position(), chunk_size);
+            in.position() = buffer_end;
+            continue;
+        }
+
+        if (pending.empty())
+        {
+            /// Common case: the whole line is inside the current buffer, no copy is needed.
+            add_line(std::string_view(in.position(), chunk_size));
+        }
+        else
+        {
+            pending.append(in.position(), chunk_size);
+            add_line(pending);
+            pending.clear();
+        }
+        in.position() = newline + 1;
+    }
+
+    /// The last line of the file may lack a trailing '\n'.
+    if (!pending.empty())
+        add_line(pending);
+
+    /// Read the size before tld_list is moved from.
+    const size_t tld_list_size = tld_list.size();
+    std::lock_guard lock(tld_lists_map_mutex);
+    /// emplace() leaves an already registered list with the same name untouched.
+    tld_lists_map.emplace(name, std::move(tld_list));
+    return tld_list_size;
+}
+
+/// Looks up the list registered under `name` while holding tld_lists_map_mutex.
+/// Throws an Exception with code TLD_LIST_NOT_FOUND if there is no such list.
+const TLDList & TLDListsHolder::getTldList(const std::string & name)
+{
+    std::lock_guard guard(tld_lists_map_mutex);
+
+    const auto found = tld_lists_map.find(name);
+    if (found != tld_lists_map.end())
+        return found->second;
+
+    throw Exception(ErrorCodes::TLD_LIST_NOT_FOUND, "TLD list {} does not exist", name);
+}
+
+}
diff --git a/src/Common/TLDListsHolder.h b/src/Common/TLDListsHolder.h
new file mode 100644
index 00000000000..9ce394267ec
--- /dev/null
+++ b/src/Common/TLDListsHolder.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+/// Custom TLD List: a set of entries that is filled with insert() and queried with has().
+/// Unlike tldLookup (which uses gperf) this one uses plain HashSet.
+class TLDList
+{
+public:
+ /// Uses StringRefHash: insert() and has() pass the hash of the host to the container.
+ using Container = HashSet;
+
+ /// Returns true if the tld_container did not contain such an element yet.
+ bool insert(const StringRef & host);
+ bool has(const StringRef & host) const; /// Checks whether tld_container has an entry for `host`.
+ size_t size() const { return tld_container.size(); } /// Number of elements in tld_container.
+
+private:
+ Container tld_container;
+};
+
+class TLDListsHolder /// Registry of named TLDList objects, obtained through getInstance().
+{
+public:
+ using Map = std::unordered_map; /// Type of tld_lists_map, which is looked up by list name.
+
+ static TLDListsHolder & getInstance(); /// Access point to the holder (the constructor is not public).
+
+ /// Parse "top_level_domains_lists" section,
+ /// and add each found list (the key is the list name, the value is the path to the file).
+ void parseConfig(const Poco::Util::AbstractConfiguration & config);
+
+ /// Parse file and add it as a Set to the list of TLDs
+ /// - "//" -- comment,
+ /// - empty lines will be ignored.
+ ///
+ /// Example: https://publicsuffix.org/list/public_suffix_list.dat
+ ///
+ /// Return size of the list.
+ size_t parseAndAddTldList(const std::string & name, const std::string & path);
+ /// Throws TLD_LIST_NOT_FOUND if list does not exist
+ const TLDList & getTldList(const std::string & name);
+
+protected:
+ TLDListsHolder();
+
+ std::mutex tld_lists_map_mutex; /// Locked by parseAndAddTldList() and getTldList() around accesses to tld_lists_map.
+ Map tld_lists_map;
+};
+
+}
diff --git a/src/Common/ya.make b/src/Common/ya.make
index 71c0edaea95..558ae25228a 100644
--- a/src/Common/ya.make
+++ b/src/Common/ya.make
@@ -68,6 +68,7 @@ SRCS(
StringUtils/StringUtils.cpp
StudentTTest.cpp
SymbolIndex.cpp
+ TLDListsHolder.cpp
TaskStatsInfoGetter.cpp
TerminalSize.cpp
ThreadFuzzer.cpp
diff --git a/src/Functions/URL/firstSignificantSubdomain.h b/src/Functions/URL/ExtractFirstSignificantSubdomain.h
similarity index 77%
rename from src/Functions/URL/firstSignificantSubdomain.h
rename to src/Functions/URL/ExtractFirstSignificantSubdomain.h
index 522e7905f69..c13b5f50156 100644
--- a/src/Functions/URL/firstSignificantSubdomain.h
+++ b/src/Functions/URL/ExtractFirstSignificantSubdomain.h
@@ -7,12 +7,27 @@
namespace DB
{
+/// Lookup functor that delegates to the built-in tldLookup table.
+struct FirstSignificantSubdomainDefaultLookup
+{
+    /// Converts the result of tldLookup::isValid() for the `len` bytes at `src` to bool.
+    bool operator()(const char * src, size_t len) const
+    {
+        return static_cast<bool>(tldLookup::isValid(src, len));
+    }
+};
+
template
struct ExtractFirstSignificantSubdomain
{
static size_t getReserveLengthForElement() { return 10; }
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
+ {
+ FirstSignificantSubdomainDefaultLookup loookup;
+ return execute(loookup, data, size, res_data, res_size, out_domain_end);
+ }
+
+ template
+ static void execute(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{
res_data = data;
res_size = 0;
@@ -65,7 +80,7 @@ struct ExtractFirstSignificantSubdomain
end_of_level_domain = end;
}
- if (tldLookup::isValid(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1) != nullptr)
+ if (lookup(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1))
{
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h
new file mode 100644
index 00000000000..244b32459c1
--- /dev/null
+++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_COLUMN;
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+/// Lookup functor that checks hosts against a custom TLD list taken from TLDListsHolder.
+struct FirstSignificantSubdomainCustomtLookup
+{
+    /// Refers to the list returned by TLDListsHolder::getTldList(); the list itself is not copied.
+    const TLDList & tld_list;
+
+    /// Resolves the list by name; getTldList() throws TLD_LIST_NOT_FOUND if there is no such list.
+    /// `explicit`: a list name must not be silently converted into a lookup object.
+    explicit FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name)
+        : tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name))
+    {
+    }
+
+    /// Returns true if the list has an entry for the `len` bytes starting at `pos`.
+    bool operator()(const char * pos, size_t len) const
+    {
+        return tld_list.has(StringRef{pos, len});
+    }
+};
+
+template
+class FunctionCutToFirstSignificantSubdomainCustomImpl : public IFunction /// Function of two arguments: a String column and a constant string with the name of a custom TLD list.
+{
+public:
+ static constexpr auto name = Name::name;
+ static FunctionPtr create(const Context &) { return std::make_shared(); }
+
+ String getName() const override { return name; }
+ size_t getNumberOfArguments() const override { return 2; } /// The string to process and the TLD list name.
+
+ DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override /// Validates both arguments; throws ILLEGAL_TYPE_OF_ARGUMENT / ILLEGAL_COLUMN otherwise.
+ {
+ if (!isString(arguments[0].type))
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+ "Illegal type {} of first argument of function {}. Must be String.",
+ arguments[0].type->getName(), getName());
+ if (!isString(arguments[1].type))
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+ "Illegal type {} of second argument (TLD_list_name) of function {}. Must be String/FixedString.",
+ arguments[1].type->getName(), getName());
+ const auto * column = arguments[1].column.get(); /// The list name must be available as a constant column at this point.
+ if (!column || !checkAndGetColumnConstStringOrFixedString(column))
+ throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+ "The second argument of function {} should be a constant string with the name of the custom TLD",
+ getName());
+
+ return arguments[0].type; /// The result has the same type as the first argument.
+ }
+
+ ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
+ {
+ const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
+ FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue()); /// Resolves the list once per call; the constructor calls TLDListsHolder::getTldList().
+
+ /// FIXME: convertToFullColumnIfConst() is suboptimal
+ auto column = arguments[0].column->convertToFullColumnIfConst();
+ if (const ColumnString * col = checkAndGetColumn(*column))
+ {
+ auto col_res = ColumnString::create();
+ vector(tld_lookup, col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
+ return col_res;
+ }
+ else
+ throw Exception(
+ "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
+ ErrorCodes::ILLEGAL_COLUMN);
+ }
+
+ static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup, /// Runs Extractor::execute() on every element of (data, offsets) and appends the matched part to (res_data, res_offsets).
+ const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
+ ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
+ {
+ size_t size = offsets.size();
+ res_offsets.resize(size);
+ res_data.reserve(size * Extractor::getReserveLengthForElement()); /// Initial capacity only; res_data is resized per element in the loop.
+
+ size_t prev_offset = 0;
+ size_t res_offset = 0;
+
+ /// Matched part.
+ Pos start;
+ size_t length;
+
+ for (size_t i = 0; i < size; ++i)
+ {
+ Extractor::execute(tld_lookup, reinterpret_cast(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length); /// "- 1": presumably excludes the terminating zero byte of the element - TODO confirm.
+
+ res_data.resize(res_data.size() + length + 1); /// One extra byte for the zero written below.
+ memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
+ res_offset += length + 1;
+ res_data[res_offset - 1] = 0;
+
+ res_offsets[i] = res_offset;
+ prev_offset = offsets[i];
+ }
+ }
+};
+
+}
diff --git a/src/Functions/URL/cutToFirstSignificantSubdomain.cpp b/src/Functions/URL/cutToFirstSignificantSubdomain.cpp
index 43d614a7036..82eb366dae6 100644
--- a/src/Functions/URL/cutToFirstSignificantSubdomain.cpp
+++ b/src/Functions/URL/cutToFirstSignificantSubdomain.cpp
@@ -1,6 +1,6 @@
#include
#include
-#include "firstSignificantSubdomain.h"
+#include "ExtractFirstSignificantSubdomain.h"
namespace DB
diff --git a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp
new file mode 100644
index 00000000000..11fd27e317b
--- /dev/null
+++ b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp
@@ -0,0 +1,43 @@
+#include
+#include "ExtractFirstSignificantSubdomain.h"
+#include "FirstSignificantSubdomainCustomImpl.h"
+
+namespace DB
+{
+
+template
+struct CutToFirstSignificantSubdomainCustom /// Extractor built on top of ExtractFirstSignificantSubdomain that uses a custom TLD lookup.
+{
+ static size_t getReserveLengthForElement() { return 15; } /// Multiplied by the number of rows to reserve space for the result (see the caller's reserve()).
+
+ static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
+ {
+ res_data = data; /// Empty result by default: points to the input with zero size.
+ res_size = 0;
+
+ Pos tmp_data;
+ size_t tmp_length;
+ Pos domain_end;
+ ExtractFirstSignificantSubdomain::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
+
+ if (tmp_length == 0) /// Nothing was extracted: keep the empty result set above.
+ return;
+
+ res_data = tmp_data; /// The result spans from the start of the extracted part up to domain_end.
+ res_size = domain_end - tmp_data;
+ }
+};
+
+struct NameCutToFirstSignificantSubdomainCustom { static constexpr auto name = "cutToFirstSignificantSubdomainCustom"; }; /// Carries the function name.
+using FunctionCutToFirstSignificantSubdomainCustom = FunctionCutToFirstSignificantSubdomainCustomImpl, NameCutToFirstSignificantSubdomainCustom>;
+
+struct NameCutToFirstSignificantSubdomainCustomWithWWW { static constexpr auto name = "cutToFirstSignificantSubdomainCustomWithWWW"; }; /// Carries the function name of the "WithWWW" variant.
+using FunctionCutToFirstSignificantSubdomainCustomWithWWW = FunctionCutToFirstSignificantSubdomainCustomImpl, NameCutToFirstSignificantSubdomainCustomWithWWW>;
+
+void registerFunctionCutToFirstSignificantSubdomainCustom(FunctionFactory & factory) /// Adds the functions of this file to `factory`.
+{
+ factory.registerFunction();
+ factory.registerFunction();
+}
+
+}
diff --git a/src/Functions/URL/firstSignificantSubdomain.cpp b/src/Functions/URL/firstSignificantSubdomain.cpp
index 7db18824375..87659940938 100644
--- a/src/Functions/URL/firstSignificantSubdomain.cpp
+++ b/src/Functions/URL/firstSignificantSubdomain.cpp
@@ -1,12 +1,13 @@
#include
#include
-#include "firstSignificantSubdomain.h"
+#include "ExtractFirstSignificantSubdomain.h"
namespace DB
{
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
+
using FunctionFirstSignificantSubdomain = FunctionStringToString>, NameFirstSignificantSubdomain>;
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory)
diff --git a/src/Functions/URL/firstSignificantSubdomainCustom.cpp b/src/Functions/URL/firstSignificantSubdomainCustom.cpp
new file mode 100644
index 00000000000..675b4a346de
--- /dev/null
+++ b/src/Functions/URL/firstSignificantSubdomainCustom.cpp
@@ -0,0 +1,18 @@
+#include
+#include "ExtractFirstSignificantSubdomain.h"
+#include "FirstSignificantSubdomainCustomImpl.h"
+
+
+namespace DB
+{
+
+struct NameFirstSignificantSubdomainCustom { static constexpr auto name = "firstSignificantSubdomainCustom"; }; /// Carries the function name.
+
+using FunctionFirstSignificantSubdomainCustom = FunctionCutToFirstSignificantSubdomainCustomImpl, NameFirstSignificantSubdomainCustom>;
+
+void registerFunctionFirstSignificantSubdomainCustom(FunctionFactory & factory) /// Adds the function of this file to `factory`.
+{
+ factory.registerFunction();
+}
+
+}
diff --git a/src/Functions/URL/registerFunctionsURL.cpp b/src/Functions/URL/registerFunctionsURL.cpp
index f3906c2723e..91118074b7a 100644
--- a/src/Functions/URL/registerFunctionsURL.cpp
+++ b/src/Functions/URL/registerFunctionsURL.cpp
@@ -7,6 +7,7 @@ void registerFunctionProtocol(FunctionFactory & factory);
void registerFunctionDomain(FunctionFactory & factory);
void registerFunctionDomainWithoutWWW(FunctionFactory & factory);
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory);
+void registerFunctionFirstSignificantSubdomainCustom(FunctionFactory & factory);
void registerFunctionTopLevelDomain(FunctionFactory & factory);
void registerFunctionPort(FunctionFactory & factory);
void registerFunctionPath(FunctionFactory & factory);
@@ -20,6 +21,7 @@ void registerFunctionExtractURLParameterNames(FunctionFactory & factory);
void registerFunctionURLHierarchy(FunctionFactory & factory);
void registerFunctionURLPathHierarchy(FunctionFactory & factory);
void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory);
+void registerFunctionCutToFirstSignificantSubdomainCustom(FunctionFactory & factory);
void registerFunctionCutWWW(FunctionFactory & factory);
void registerFunctionCutQueryString(FunctionFactory & factory);
void registerFunctionCutFragment(FunctionFactory & factory);
@@ -34,6 +36,7 @@ void registerFunctionsURL(FunctionFactory & factory)
registerFunctionDomain(factory);
registerFunctionDomainWithoutWWW(factory);
registerFunctionFirstSignificantSubdomain(factory);
+ registerFunctionFirstSignificantSubdomainCustom(factory);
registerFunctionTopLevelDomain(factory);
registerFunctionPort(factory);
registerFunctionPath(factory);
@@ -47,6 +50,7 @@ void registerFunctionsURL(FunctionFactory & factory)
registerFunctionURLHierarchy(factory);
registerFunctionURLPathHierarchy(factory);
registerFunctionCutToFirstSignificantSubdomain(factory);
+ registerFunctionCutToFirstSignificantSubdomainCustom(factory);
registerFunctionCutWWW(factory);
registerFunctionCutQueryString(factory);
registerFunctionCutFragment(factory);
diff --git a/src/Functions/URL/tldLookup.h b/src/Functions/URL/tldLookup.h
index 25857be3dd2..38c118b6bb1 100644
--- a/src/Functions/URL/tldLookup.h
+++ b/src/Functions/URL/tldLookup.h
@@ -1,5 +1,7 @@
#pragma once
+#include
+
// Definition of the class generated by gperf, present on gperf/tldLookup.gperf
class TopLevelDomainLookupHash
{
diff --git a/src/Functions/ya.make b/src/Functions/ya.make
index 436a6a89996..f768ef0c374 100644
--- a/src/Functions/ya.make
+++ b/src/Functions/ya.make
@@ -80,6 +80,7 @@ SRCS(
URL/cutQueryString.cpp
URL/cutQueryStringAndFragment.cpp
URL/cutToFirstSignificantSubdomain.cpp
+ URL/cutToFirstSignificantSubdomainCustom.cpp
URL/cutURLParameter.cpp
URL/cutWWW.cpp
URL/decodeURLComponent.cpp
@@ -89,6 +90,7 @@ SRCS(
URL/extractURLParameterNames.cpp
URL/extractURLParameters.cpp
URL/firstSignificantSubdomain.cpp
+ URL/firstSignificantSubdomainCustom.cpp
URL/fragment.cpp
URL/netloc.cpp
URL/path.cpp