diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index 0da74ce1b0e..2250c0ead20 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -131,6 +131,38 @@ For example: - `cutToFirstSignificantSubdomain('www.tr') = 'www.tr'`. - `cutToFirstSignificantSubdomain('tr') = ''`. +### cutToFirstSignificantSubdomainCustom {#cuttofirstsignificantsubdomaincustom} + +Same as `cutToFirstSignificantSubdomain` but accepts a custom TLD list name. Useful if: + +- you need a fresh TLD list, +- or you have a custom one. + +Configuration example: + +```xml +<!-- Custom TLD lists: each child element maps a list name to a file path. --> +<top_level_domains_lists> + <public_suffix_list>/path/to/public_suffix_list.dat</public_suffix_list> +</top_level_domains_lists> +``` + +Example: + +- `cutToFirstSignificantSubdomainCustom('https://news.yandex.com.tr/', 'public_suffix_list') = 'yandex.com.tr'`. + +### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww} + +Same as `cutToFirstSignificantSubdomainWithWWW` but accepts a custom TLD list name. + +### firstSignificantSubdomainCustom {#firstsignificantsubdomaincustom} + +Same as `firstSignificantSubdomain` but accepts a custom TLD list name. + +### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww} + +Same as `cutToFirstSignificantSubdomainWithWWW` but accepts a custom TLD list name. + ### port(URL\[, default_port = 0\]) {#port} Returns the port or `default_port` if there is no port in the URL (or in case of validation error). 
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index ad849f3c6e5..4480f34bb60 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -483,6 +484,9 @@ int Server::main(const std::vector & /*args*/) Poco::File(dictionaries_lib_path).createDirectories(); } + /// top_level_domains_lists + TLDListsHolder::getInstance().parseConfig(config()); + { Poco::File(path + "data/").createDirectories(); Poco::File(path + "metadata/").createDirectories(); diff --git a/programs/server/config.xml b/programs/server/config.xml index 851a7654d53..a46c6ae6eec 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -724,6 +724,17 @@ + + + + + diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 384c29ed675..1e381808d16 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -528,6 +528,7 @@ M(559, INVALID_GRPC_QUERY_INFO) \ M(560, ZSTD_ENCODER_FAILED) \ M(561, ZSTD_DECODER_FAILED) \ + M(562, TLD_LIST_NOT_FOUND) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/TLDListsHolder.cpp b/src/Common/TLDListsHolder.cpp new file mode 100644 index 00000000000..cbad8beaa7d --- /dev/null +++ b/src/Common/TLDListsHolder.cpp @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TLD_LIST_NOT_FOUND; +} + +/// +/// TLDList +/// +bool TLDList::insert(const StringRef & host) +{ + StringRefHash hash; + return tld_container.insert(hash(host)).second; +} +bool TLDList::has(const StringRef & host) const +{ + StringRefHash hash; + return tld_container.has(hash(host)); +} + +/// +/// TLDListsHolder +/// +TLDListsHolder & TLDListsHolder::getInstance() +{ + static TLDListsHolder instance; + return instance; +} +TLDListsHolder::TLDListsHolder() = default; + +void TLDListsHolder::parseConfig(const Poco::Util::AbstractConfiguration 
& config) +{ + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys("top_level_domains_lists", config_keys); + + Poco::Logger * log = &Poco::Logger::get("TLDListsHolder"); + + for (const auto & key : config_keys) + { + auto path = config.getString("top_level_domains_lists." + key); + LOG_TRACE(log, "{} loading from {}", key, path); + size_t hosts = parseAndAddTldList(key, path); + LOG_INFO(log, "{} was added ({} hosts)", key, hosts); + } +} + +size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path) +{ + TLDList tld_list; + + ReadBufferFromFile in(path); + while (!in.eof()) + { + char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end()); + if (newline >= in.buffer().end()) + break; + + std::string_view line(in.position(), newline - in.position()); + in.position() = newline + 1; + + /// Skip comments: any line starting with "//", including a bare "//" + if (line.size() >= 2 && line[0] == '/' && line[1] == '/') + continue; + trim(line); + /// Skip empty line + if (line.empty()) + continue; + tld_list.insert(StringRef{line.data(), line.size()}); + } + + size_t tld_list_size = tld_list.size(); + std::lock_guard lock(tld_lists_map_mutex); + tld_lists_map.emplace(name, std::move(tld_list)); + return tld_list_size; +} + +const TLDList & TLDListsHolder::getTldList(const std::string & name) +{ + std::lock_guard lock(tld_lists_map_mutex); + auto it = tld_lists_map.find(name); + if (it == tld_lists_map.end()) + throw Exception(ErrorCodes::TLD_LIST_NOT_FOUND, "TLD list {} does not exist", name); + return it->second; +} + +} diff --git a/src/Common/TLDListsHolder.h b/src/Common/TLDListsHolder.h new file mode 100644 index 00000000000..9ce394267ec --- /dev/null +++ b/src/Common/TLDListsHolder.h @@ -0,0 +1,60 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Custom TLD List +/// Unlike tldLookup (which uses gperf) this one uses plain HashSet. 
+class TLDList +{ +public: + /// Holds StringRefHash values of the hosts, not the host strings themselves. + using Container = HashSet; + + /// Returns true if the host was not in tld_container yet, i.e. it was newly inserted. + bool insert(const StringRef & host); + bool has(const StringRef & host) const; + size_t size() const { return tld_container.size(); } + +private: + Container tld_container; +}; + +class TLDListsHolder +{ +public: + using Map = std::unordered_map; + + static TLDListsHolder & getInstance(); + + /// Parse the "top_level_domains_lists" config section + /// and add each list found there (key is the list name, value is the file path). + void parseConfig(const Poco::Util::AbstractConfiguration & config); + + /// Parse file and add it as a Set to the list of TLDs + /// - "//" -- comment, + /// - empty lines will be ignored. + /// + /// Example: https://publicsuffix.org/list/public_suffix_list.dat + /// + /// Return size of the list. + size_t parseAndAddTldList(const std::string & name, const std::string & path); + /// Throws TLD_LIST_NOT_FOUND if a list with this name does not exist. + const TLDList & getTldList(const std::string & name); + +protected: + TLDListsHolder(); + + std::mutex tld_lists_map_mutex; + Map tld_lists_map; +}; + +} diff --git a/src/Common/ya.make b/src/Common/ya.make index 71c0edaea95..558ae25228a 100644 --- a/src/Common/ya.make +++ b/src/Common/ya.make @@ -68,6 +68,7 @@ SRCS( StringUtils/StringUtils.cpp StudentTTest.cpp SymbolIndex.cpp + TLDListsHolder.cpp TaskStatsInfoGetter.cpp TerminalSize.cpp ThreadFuzzer.cpp diff --git a/src/Functions/URL/firstSignificantSubdomain.h b/src/Functions/URL/ExtractFirstSignificantSubdomain.h similarity index 77% rename from src/Functions/URL/firstSignificantSubdomain.h rename to src/Functions/URL/ExtractFirstSignificantSubdomain.h index 522e7905f69..c13b5f50156 100644 --- a/src/Functions/URL/firstSignificantSubdomain.h +++ b/src/Functions/URL/ExtractFirstSignificantSubdomain.h @@ -7,12 +7,27 @@ namespace DB { +struct FirstSignificantSubdomainDefaultLookup +{ + bool operator()(const char *src, size_t len) const + { + return 
tldLookup::isValid(src, len); + } +}; + template struct ExtractFirstSignificantSubdomain { static size_t getReserveLengthForElement() { return 10; } static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) + { + FirstSignificantSubdomainDefaultLookup loookup; + return execute(loookup, data, size, res_data, res_size, out_domain_end); + } + + template + static void execute(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) { res_data = data; res_size = 0; @@ -65,7 +80,7 @@ struct ExtractFirstSignificantSubdomain end_of_level_domain = end; } - if (tldLookup::isValid(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1) != nullptr) + if (lookup(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1)) { res_data += last_3_periods[2] + 1 - begin; res_size = last_3_periods[1] - last_3_periods[2] - 1; diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h new file mode 100644 index 00000000000..244b32459c1 --- /dev/null +++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +struct FirstSignificantSubdomainCustomtLookup +{ + const TLDList & tld_list; + FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name) + : tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name)) + { + } + + bool operator()(const char *pos, size_t len) const + { + return tld_list.has(StringRef{pos, len}); + } +}; + +template +class FunctionCutToFirstSignificantSubdomainCustomImpl : public IFunction +{ +public: + static constexpr auto name = Name::name; + static FunctionPtr create(const Context &) { return 
std::make_shared(); } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (!isString(arguments[0].type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of first argument of function {}. Must be String.", + arguments[0].type->getName(), getName()); + if (!isString(arguments[1].type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of second argument (TLD_list_name) of function {}. Must be String/FixedString.", + arguments[1].type->getName(), getName()); + const auto * column = arguments[1].column.get(); + if (!column || !checkAndGetColumnConstStringOrFixedString(column)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "The second argument of function {} should be a constant string with the name of the custom TLD", + getName()); + + return arguments[0].type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override + { + const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); + FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue()); + + /// FIXME: convertToFullColumnIfConst() is suboptimal + auto column = arguments[0].column->convertToFullColumnIfConst(); + if (const ColumnString * col = checkAndGetColumn(*column)) + { + auto col_res = ColumnString::create(); + vector(tld_lookup, col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + + static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup, + const ColumnString::Chars & data, const ColumnString::Offsets & 
offsets, + ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) + { + size_t size = offsets.size(); + res_offsets.resize(size); + res_data.reserve(size * Extractor::getReserveLengthForElement()); + + size_t prev_offset = 0; + size_t res_offset = 0; + + /// Matched part. + Pos start; + size_t length; + + for (size_t i = 0; i < size; ++i) + { + Extractor::execute(tld_lookup, reinterpret_cast(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length); + + res_data.resize(res_data.size() + length + 1); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length); + res_offset += length + 1; + res_data[res_offset - 1] = 0; + + res_offsets[i] = res_offset; + prev_offset = offsets[i]; + } + } +}; + +} diff --git a/src/Functions/URL/cutToFirstSignificantSubdomain.cpp b/src/Functions/URL/cutToFirstSignificantSubdomain.cpp index 43d614a7036..82eb366dae6 100644 --- a/src/Functions/URL/cutToFirstSignificantSubdomain.cpp +++ b/src/Functions/URL/cutToFirstSignificantSubdomain.cpp @@ -1,6 +1,6 @@ #include #include -#include "firstSignificantSubdomain.h" +#include "ExtractFirstSignificantSubdomain.h" namespace DB diff --git a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp new file mode 100644 index 00000000000..11fd27e317b --- /dev/null +++ b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp @@ -0,0 +1,43 @@ +#include +#include "ExtractFirstSignificantSubdomain.h" +#include "FirstSignificantSubdomainCustomImpl.h" + +namespace DB +{ + +template +struct CutToFirstSignificantSubdomainCustom +{ + static size_t getReserveLengthForElement() { return 15; } + + static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size) + { + res_data = data; + res_size = 0; + + Pos tmp_data; + size_t tmp_length; + Pos domain_end; + ExtractFirstSignificantSubdomain::execute(tld_lookup, data, size, 
tmp_data, tmp_length, &domain_end); + + if (tmp_length == 0) + return; + + res_data = tmp_data; + res_size = domain_end - tmp_data; + } +}; + +struct NameCutToFirstSignificantSubdomainCustom { static constexpr auto name = "cutToFirstSignificantSubdomainCustom"; }; +using FunctionCutToFirstSignificantSubdomainCustom = FunctionCutToFirstSignificantSubdomainCustomImpl, NameCutToFirstSignificantSubdomainCustom>; + +struct NameCutToFirstSignificantSubdomainCustomWithWWW { static constexpr auto name = "cutToFirstSignificantSubdomainCustomWithWWW"; }; +using FunctionCutToFirstSignificantSubdomainCustomWithWWW = FunctionCutToFirstSignificantSubdomainCustomImpl, NameCutToFirstSignificantSubdomainCustomWithWWW>; + +void registerFunctionCutToFirstSignificantSubdomainCustom(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); +} + +} diff --git a/src/Functions/URL/firstSignificantSubdomain.cpp b/src/Functions/URL/firstSignificantSubdomain.cpp index 7db18824375..87659940938 100644 --- a/src/Functions/URL/firstSignificantSubdomain.cpp +++ b/src/Functions/URL/firstSignificantSubdomain.cpp @@ -1,12 +1,13 @@ #include #include -#include "firstSignificantSubdomain.h" +#include "ExtractFirstSignificantSubdomain.h" namespace DB { struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; }; + using FunctionFirstSignificantSubdomain = FunctionStringToString>, NameFirstSignificantSubdomain>; void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory) diff --git a/src/Functions/URL/firstSignificantSubdomainCustom.cpp b/src/Functions/URL/firstSignificantSubdomainCustom.cpp new file mode 100644 index 00000000000..675b4a346de --- /dev/null +++ b/src/Functions/URL/firstSignificantSubdomainCustom.cpp @@ -0,0 +1,18 @@ +#include +#include "ExtractFirstSignificantSubdomain.h" +#include "FirstSignificantSubdomainCustomImpl.h" + + +namespace DB +{ + +struct NameFirstSignificantSubdomainCustom { static constexpr 
auto name = "firstSignificantSubdomainCustom"; }; + +using FunctionFirstSignificantSubdomainCustom = FunctionCutToFirstSignificantSubdomainCustomImpl, NameFirstSignificantSubdomainCustom>; + +void registerFunctionFirstSignificantSubdomainCustom(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/URL/registerFunctionsURL.cpp b/src/Functions/URL/registerFunctionsURL.cpp index f3906c2723e..91118074b7a 100644 --- a/src/Functions/URL/registerFunctionsURL.cpp +++ b/src/Functions/URL/registerFunctionsURL.cpp @@ -7,6 +7,7 @@ void registerFunctionProtocol(FunctionFactory & factory); void registerFunctionDomain(FunctionFactory & factory); void registerFunctionDomainWithoutWWW(FunctionFactory & factory); void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory); +void registerFunctionFirstSignificantSubdomainCustom(FunctionFactory & factory); void registerFunctionTopLevelDomain(FunctionFactory & factory); void registerFunctionPort(FunctionFactory & factory); void registerFunctionPath(FunctionFactory & factory); @@ -20,6 +21,7 @@ void registerFunctionExtractURLParameterNames(FunctionFactory & factory); void registerFunctionURLHierarchy(FunctionFactory & factory); void registerFunctionURLPathHierarchy(FunctionFactory & factory); void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory); +void registerFunctionCutToFirstSignificantSubdomainCustom(FunctionFactory & factory); void registerFunctionCutWWW(FunctionFactory & factory); void registerFunctionCutQueryString(FunctionFactory & factory); void registerFunctionCutFragment(FunctionFactory & factory); @@ -34,6 +36,7 @@ void registerFunctionsURL(FunctionFactory & factory) registerFunctionDomain(factory); registerFunctionDomainWithoutWWW(factory); registerFunctionFirstSignificantSubdomain(factory); + registerFunctionFirstSignificantSubdomainCustom(factory); registerFunctionTopLevelDomain(factory); registerFunctionPort(factory); 
registerFunctionPath(factory); @@ -47,6 +50,7 @@ void registerFunctionsURL(FunctionFactory & factory) registerFunctionURLHierarchy(factory); registerFunctionURLPathHierarchy(factory); registerFunctionCutToFirstSignificantSubdomain(factory); + registerFunctionCutToFirstSignificantSubdomainCustom(factory); registerFunctionCutWWW(factory); registerFunctionCutQueryString(factory); registerFunctionCutFragment(factory); diff --git a/src/Functions/URL/tldLookup.h b/src/Functions/URL/tldLookup.h index 25857be3dd2..38c118b6bb1 100644 --- a/src/Functions/URL/tldLookup.h +++ b/src/Functions/URL/tldLookup.h @@ -1,5 +1,7 @@ #pragma once +#include + // Definition of the class generated by gperf, present on gperf/tldLookup.gperf class TopLevelDomainLookupHash { diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 436a6a89996..f768ef0c374 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -80,6 +80,7 @@ SRCS( URL/cutQueryString.cpp URL/cutQueryStringAndFragment.cpp URL/cutToFirstSignificantSubdomain.cpp + URL/cutToFirstSignificantSubdomainCustom.cpp URL/cutURLParameter.cpp URL/cutWWW.cpp URL/decodeURLComponent.cpp @@ -89,6 +90,7 @@ SRCS( URL/extractURLParameterNames.cpp URL/extractURLParameters.cpp URL/firstSignificantSubdomain.cpp + URL/firstSignificantSubdomainCustom.cpp URL/fragment.cpp URL/netloc.cpp URL/path.cpp