Merge pull request #17748 from azat/custom-TLD

Add ability to use custom TLD list
This commit is contained in:
Alexander Kuzmenkov 2020-12-11 12:42:19 +03:00 committed by GitHub
commit fb1221148d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 14190 additions and 25 deletions

View File

@ -64,7 +64,14 @@ function stop_server
function start_server function start_server
{ {
set -m # Spawn server in its own process groups set -m # Spawn server in its own process groups
clickhouse-server --config-file="$FASTTEST_DATA/config.xml" -- --path "$FASTTEST_DATA" --user_files_path "$FASTTEST_DATA/user_files" &>> "$FASTTEST_OUTPUT/server.log" & local opts=(
--config-file="$FASTTEST_DATA/config.xml"
--
--path "$FASTTEST_DATA"
--user_files_path "$FASTTEST_DATA/user_files"
--top_level_domains_path "$FASTTEST_DATA/top_level_domains"
)
clickhouse-server "${opts[@]}" &>> "$FASTTEST_OUTPUT/server.log" &
server_pid=$! server_pid=$!
set +m set +m

View File

@ -53,4 +53,3 @@ COPY * /
CMD ["bash", "-c", "node=$((RANDOM % $(numactl --hardware | sed -n 's/^.*available:\\(.*\\)nodes.*$/\\1/p'))); echo Will bind to NUMA node $node; numactl --cpunodebind=$node --membind=$node /entrypoint.sh"] CMD ["bash", "-c", "node=$((RANDOM % $(numactl --hardware | sed -n 's/^.*available:\\(.*\\)nodes.*$/\\1/p'))); echo Will bind to NUMA node $node; numactl --cpunodebind=$node --membind=$node /entrypoint.sh"]
# docker run --network=host --volume <workspace>:/workspace --volume=<output>:/output -e PR_TO_TEST=<> -e SHA_TO_TEST=<> yandex/clickhouse-performance-comparison # docker run --network=host --volume <workspace>:/workspace --volume=<output>:/output -e PR_TO_TEST=<> -e SHA_TO_TEST=<> yandex/clickhouse-performance-comparison

View File

@ -55,6 +55,7 @@ function configure
# server *config* directives overrides # server *config* directives overrides
--path db0 --path db0
--user_files_path db0/user_files --user_files_path db0/user_files
--top_level_domains_path /top_level_domains
--tcp_port $LEFT_SERVER_PORT --tcp_port $LEFT_SERVER_PORT
) )
left/clickhouse-server "${setup_left_server_opts[@]}" &> setup-server-log.log & left/clickhouse-server "${setup_left_server_opts[@]}" &> setup-server-log.log &
@ -102,6 +103,7 @@ function restart
# server *config* directives overrides # server *config* directives overrides
--path left/db --path left/db
--user_files_path left/db/user_files --user_files_path left/db/user_files
--top_level_domains_path /top_level_domains
--tcp_port $LEFT_SERVER_PORT --tcp_port $LEFT_SERVER_PORT
) )
left/clickhouse-server "${left_server_opts[@]}" &>> left-server-log.log & left/clickhouse-server "${left_server_opts[@]}" &>> left-server-log.log &
@ -116,6 +118,7 @@ function restart
# server *config* directives overrides # server *config* directives overrides
--path right/db --path right/db
--user_files_path right/db/user_files --user_files_path right/db/user_files
--top_level_domains_path /top_level_domains
--tcp_port $RIGHT_SERVER_PORT --tcp_port $RIGHT_SERVER_PORT
) )
right/clickhouse-server "${right_server_opts[@]}" &>> right-server-log.log & right/clickhouse-server "${right_server_opts[@]}" &>> right-server-log.log &

View File

@ -0,0 +1,5 @@
<yandex>
<top_level_domains_lists>
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
</top_level_domains_lists>
</yandex>

File diff suppressed because it is too large Load Diff

View File

@ -131,6 +131,40 @@ For example:
- `cutToFirstSignificantSubdomain('www.tr') = 'www.tr'`. - `cutToFirstSignificantSubdomain('www.tr') = 'www.tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`. - `cutToFirstSignificantSubdomain('tr') = ''`.
### cutToFirstSignificantSubdomainCustom {#cuttofirstsignificantsubdomaincustom}
Same as `cutToFirstSignificantSubdomain` but accept custom TLD list name, useful if:
- you need fresh TLD list,
- or you have custom.
Configuration example:
```xml
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<top_level_domains_lists>
<!-- https://publicsuffix.org/list/public_suffix_list.dat -->
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
<!-- NOTE: path is under top_level_domains_path -->
</top_level_domains_lists>
```
Example:
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/', 'public_suffix_list') = 'yandex.com.tr'`.
### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
Same as `cutToFirstSignificantSubdomainWithWWW` but accept custom TLD list name.
### firstSignificantSubdomainCustom {#firstsignificantsubdomaincustom}
Same as `firstSignificantSubdomain` but accept custom TLD list name.
### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
Same as `cutToFirstSignificantSubdomainWithWWW` but accept custom TLD list name.
### port(URL\[, default_port = 0\]) {#port} ### port(URL\[, default_port = 0\]) {#port}
Returns the port or `default_port` if there is no port in the URL (or in case of validation error). Returns the port or `default_port` if there is no port in the URL (or in case of validation error).

View File

@ -34,6 +34,7 @@
#include <Common/ThreadStatus.h> #include <Common/ThreadStatus.h>
#include <Common/getMappedArea.h> #include <Common/getMappedArea.h>
#include <Common/remapExecutable.h> #include <Common/remapExecutable.h>
#include <Common/TLDListsHolder.h>
#include <IO/HTTPCommon.h> #include <IO/HTTPCommon.h>
#include <IO/UseSSL.h> #include <IO/UseSSL.h>
#include <Interpreters/AsynchronousMetrics.h> #include <Interpreters/AsynchronousMetrics.h>
@ -542,6 +543,12 @@ int Server::main(const std::vector<std::string> & /*args*/)
Poco::File(dictionaries_lib_path).createDirectories(); Poco::File(dictionaries_lib_path).createDirectories();
} }
/// top_level_domains_lists
{
const std::string & top_level_domains_path = config().getString("top_level_domains_path", path + "top_level_domains/") + "/";
TLDListsHolder::getInstance().parseConfig(top_level_domains_path, config());
}
{ {
Poco::File(path + "data/").createDirectories(); Poco::File(path + "data/").createDirectories();
Poco::File(path + "metadata/").createDirectories(); Poco::File(path + "metadata/").createDirectories();

View File

@ -4,4 +4,5 @@
<user_files_path replace="replace">./user_files/</user_files_path> <user_files_path replace="replace">./user_files/</user_files_path>
<format_schema_path replace="replace">./format_schemas/</format_schema_path> <format_schema_path replace="replace">./format_schemas/</format_schema_path>
<access_control_path replace="replace">./access/</access_control_path> <access_control_path replace="replace">./access/</access_control_path>
<top_level_domains_path replace="replace">./top_level_domains/</top_level_domains_path>
</yandex> </yandex>

View File

@ -724,6 +724,19 @@
<!-- <path_to_regions_names_files>/opt/geo/</path_to_regions_names_files> --> <!-- <path_to_regions_names_files>/opt/geo/</path_to_regions_names_files> -->
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<!-- Custom TLD lists.
Format: <name>/path/to/file</name>
Changes will not be applied w/o server restart.
Path to the list is under top_level_domains_path (see above).
-->
<top_level_domains_lists>
<!--
<public_suffix_list>/path/to/public_suffix_list.dat</public_suffix_list>
-->
</top_level_domains_lists>
<!-- Configuration of external dictionaries. See: <!-- Configuration of external dictionaries. See:
https://clickhouse.tech/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts https://clickhouse.tech/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts
--> -->

View File

@ -528,6 +528,7 @@
M(559, INVALID_GRPC_QUERY_INFO) \ M(559, INVALID_GRPC_QUERY_INFO) \
M(560, ZSTD_ENCODER_FAILED) \ M(560, ZSTD_ENCODER_FAILED) \
M(561, ZSTD_DECODER_FAILED) \ M(561, ZSTD_DECODER_FAILED) \
M(562, TLD_LIST_NOT_FOUND) \
\ \
M(999, KEEPER_EXCEPTION) \ M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \ M(1000, POCO_EXCEPTION) \

View File

@ -0,0 +1,101 @@
#pragma once
#include <Common/HashTable/HashSet.h>
#include <Common/HashTable/HashTableAllocator.h>
#include <Common/HashTable/StringHashTable.h>
template <typename Key>
struct StringHashSetCell : public HashTableCell<Key, StringHashTableHash, HashTableNoState>
{
using Base = HashTableCell<Key, StringHashTableHash, HashTableNoState>;
using Base::Base;
VoidMapped void_map;
VoidMapped & getMapped() { return void_map; }
const VoidMapped & getMapped() const { return void_map; }
static constexpr bool need_zero_value_storage = false;
};
template <>
struct StringHashSetCell<StringKey16> : public HashTableCell<StringKey16, StringHashTableHash, HashTableNoState>
{
using Base = HashTableCell<StringKey16, StringHashTableHash, HashTableNoState>;
using Base::Base;
VoidMapped void_map;
VoidMapped & getMapped() { return void_map; }
const VoidMapped & getMapped() const { return void_map; }
static constexpr bool need_zero_value_storage = false;
bool isZero(const HashTableNoState & state) const { return isZero(this->key, state); }
// Zero means unoccupied cells in hash table. Use key with last word = 0 as
// zero keys, because such keys are unrepresentable (no way to encode length).
static bool isZero(const StringKey16 & key_, const HashTableNoState &)
{ return key_.high == 0; }
void setZero() { this->key.high = 0; }
};
template <>
struct StringHashSetCell<StringKey24> : public HashTableCell<StringKey24, StringHashTableHash, HashTableNoState>
{
using Base = HashTableCell<StringKey24, StringHashTableHash, HashTableNoState>;
using Base::Base;
VoidMapped void_map;
VoidMapped & getMapped() { return void_map; }
const VoidMapped & getMapped() const { return void_map; }
static constexpr bool need_zero_value_storage = false;
bool isZero(const HashTableNoState & state) const { return isZero(this->key, state); }
// Zero means unoccupied cells in hash table. Use key with last word = 0 as
// zero keys, because such keys are unrepresentable (no way to encode length).
static bool isZero(const StringKey24 & key_, const HashTableNoState &)
{ return key_.c == 0; }
void setZero() { this->key.c = 0; }
};
template <>
struct StringHashSetCell<StringRef> : public HashSetCellWithSavedHash<StringRef, StringHashTableHash, HashTableNoState>
{
using Base = HashSetCellWithSavedHash<StringRef, StringHashTableHash, HashTableNoState>;
using Base::Base;
VoidMapped void_map;
VoidMapped & getMapped() { return void_map; }
const VoidMapped & getMapped() const { return void_map; }
static constexpr bool need_zero_value_storage = false;
};
template <typename Allocator>
struct StringHashSetSubMaps
{
using T0 = StringHashTableEmpty<StringHashSetCell<StringRef>>;
using T1 = HashSetTable<StringKey8, StringHashSetCell<StringKey8>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
using T2 = HashSetTable<StringKey16, StringHashSetCell<StringKey16>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
using T3 = HashSetTable<StringKey24, StringHashSetCell<StringKey24>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
using Ts = HashSetTable<StringRef, StringHashSetCell<StringRef>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
};
template <typename Allocator = HashTableAllocator>
class StringHashSet : public StringHashTable<StringHashSetSubMaps<Allocator>>
{
public:
using Key = StringRef;
using Base = StringHashTable<StringHashSetSubMaps<Allocator>>;
using Self = StringHashSet;
using LookupResult = typename Base::LookupResult;
using Base::Base;
template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder && key_holder, bool & inserted)
{
LookupResult it;
Base::emplace(key_holder, it, inserted);
}
};

View File

@ -212,7 +212,7 @@ public:
using LookupResult = StringHashTableLookupResult<typename cell_type::mapped_type>; using LookupResult = StringHashTableLookupResult<typename cell_type::mapped_type>;
using ConstLookupResult = StringHashTableLookupResult<const typename cell_type::mapped_type>; using ConstLookupResult = StringHashTableLookupResult<const typename cell_type::mapped_type>;
StringHashTable() {} StringHashTable() = default;
StringHashTable(size_t reserve_for_num_elements) StringHashTable(size_t reserve_for_num_elements)
: m1{reserve_for_num_elements / 4} : m1{reserve_for_num_elements / 4}
@ -222,8 +222,15 @@ public:
{ {
} }
StringHashTable(StringHashTable && rhs) { *this = std::move(rhs); } StringHashTable(StringHashTable && rhs)
~StringHashTable() {} : m1(std::move(rhs.m1))
, m2(std::move(rhs.m2))
, m3(std::move(rhs.m3))
, ms(std::move(rhs.ms))
{
}
~StringHashTable() = default;
public: public:
// Dispatch is written in a way that maximizes the performance: // Dispatch is written in a way that maximizes the performance:

View File

@ -0,0 +1,106 @@
#include <Common/TLDListsHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <common/logger_useful.h>
#include <IO/ReadBufferFromFile.h>
#include <string_view>
#include <unordered_set>
namespace DB
{
namespace ErrorCodes
{
extern const int TLD_LIST_NOT_FOUND;
}
///
/// TLDList
///
TLDList::TLDList(size_t size)
: tld_container(size)
, pool(std::make_unique<Arena>(10 << 20))
{}
bool TLDList::insert(const StringRef & host)
{
bool inserted;
tld_container.emplace(DB::ArenaKeyHolder{host, *pool}, inserted);
return inserted;
}
bool TLDList::has(const StringRef & host) const
{
return tld_container.has(host);
}
///
/// TLDListsHolder
///
TLDListsHolder & TLDListsHolder::getInstance()
{
static TLDListsHolder instance;
return instance;
}
TLDListsHolder::TLDListsHolder() = default;
void TLDListsHolder::parseConfig(const std::string & top_level_domains_path, const Poco::Util::AbstractConfiguration & config)
{
Poco::Util::AbstractConfiguration::Keys config_keys;
config.keys("top_level_domains_lists", config_keys);
Poco::Logger * log = &Poco::Logger::get("TLDListsHolder");
for (const auto & key : config_keys)
{
const std::string & path = top_level_domains_path + config.getString("top_level_domains_lists." + key);
LOG_TRACE(log, "{} loading from {}", key, path);
size_t hosts = parseAndAddTldList(key, path);
LOG_INFO(log, "{} was added ({} hosts)", key, hosts);
}
}
size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path)
{
std::unordered_set<std::string> tld_list_tmp;
ReadBufferFromFile in(path);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
if (newline >= in.buffer().end())
break;
std::string_view line(in.position(), newline - in.position());
in.position() = newline + 1;
/// Skip comments
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
continue;
trim(line);
/// Skip empty line
if (line.empty())
continue;
tld_list_tmp.emplace(line);
}
TLDList tld_list(tld_list_tmp.size());
for (const auto & host : tld_list_tmp)
{
StringRef host_ref{host.data(), host.size()};
tld_list.insert(host_ref);
}
size_t tld_list_size = tld_list.size();
std::lock_guard<std::mutex> lock(tld_lists_map_mutex);
tld_lists_map.insert(std::make_pair(name, std::move(tld_list)));
return tld_list_size;
}
const TLDList & TLDListsHolder::getTldList(const std::string & name)
{
std::lock_guard<std::mutex> lock(tld_lists_map_mutex);
auto it = tld_lists_map.find(name);
if (it == tld_lists_map.end())
throw Exception(ErrorCodes::TLD_LIST_NOT_FOUND, "TLD list {} does not exist", name);
return it->second;
}
}

View File

@ -0,0 +1,65 @@
#pragma once
#include <common/defines.h>
#include <common/StringRef.h>
#include <Common/HashTable/StringHashSet.h>
#include <Common/Arena.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <mutex>
#include <string>
#include <unordered_map>
namespace DB
{
/// Custom TLD List
///
/// Unlike tldLookup (which uses gperf) this one uses plain StringHashSet.
class TLDList
{
public:
using Container = StringHashSet<>;
TLDList(size_t size);
/// Return true if the tld_container does not contains such element.
bool insert(const StringRef & host);
/// Check is there such TLD
bool has(const StringRef & host) const;
size_t size() const { return tld_container.size(); }
private:
Container tld_container;
std::unique_ptr<Arena> pool;
};
class TLDListsHolder
{
public:
using Map = std::unordered_map<std::string, TLDList>;
static TLDListsHolder & getInstance();
/// Parse "top_level_domains_lists" section,
/// And add each found dictionary.
void parseConfig(const std::string & top_level_domains_path, const Poco::Util::AbstractConfiguration & config);
/// Parse file and add it as a Set to the list of TLDs
/// - "//" -- comment,
/// - empty lines will be ignored.
///
/// Example: https://publicsuffix.org/list/public_suffix_list.dat
///
/// Return size of the list.
size_t parseAndAddTldList(const std::string & name, const std::string & path);
/// Throws TLD_LIST_NOT_FOUND if list does not exist
const TLDList & getTldList(const std::string & name);
protected:
TLDListsHolder();
std::mutex tld_lists_map_mutex;
Map tld_lists_map;
};
}

View File

@ -68,6 +68,7 @@ SRCS(
StringUtils/StringUtils.cpp StringUtils/StringUtils.cpp
StudentTTest.cpp StudentTTest.cpp
SymbolIndex.cpp SymbolIndex.cpp
TLDListsHolder.cpp
TaskStatsInfoGetter.cpp TaskStatsInfoGetter.cpp
TerminalSize.cpp TerminalSize.cpp
ThreadFuzzer.cpp ThreadFuzzer.cpp

View File

@ -7,12 +7,27 @@
namespace DB namespace DB
{ {
struct FirstSignificantSubdomainDefaultLookup
{
bool operator()(const char *src, size_t len) const
{
return tldLookup::isValid(src, len);
}
};
template <bool without_www> template <bool without_www>
struct ExtractFirstSignificantSubdomain struct ExtractFirstSignificantSubdomain
{ {
static size_t getReserveLengthForElement() { return 10; } static size_t getReserveLengthForElement() { return 10; }
static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{
FirstSignificantSubdomainDefaultLookup loookup;
return execute(loookup, data, size, res_data, res_size, out_domain_end);
}
template <class Lookup>
static void execute(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{ {
res_data = data; res_data = data;
res_size = 0; res_size = 0;
@ -65,7 +80,7 @@ struct ExtractFirstSignificantSubdomain
end_of_level_domain = end; end_of_level_domain = end;
} }
if (tldLookup::isValid(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1) != nullptr) if (lookup(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1))
{ {
res_data += last_3_periods[2] + 1 - begin; res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1; res_size = last_3_periods[1] - last_3_periods[2] - 1;

View File

@ -0,0 +1,112 @@
#pragma once
#include <Functions/FunctionFactory.h>
#include <Functions/URL/FunctionsURL.h>
#include <Functions/FunctionHelpers.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include <Common/TLDListsHolder.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
struct FirstSignificantSubdomainCustomtLookup
{
const TLDList & tld_list;
FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name)
: tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name))
{
}
bool operator()(const char *pos, size_t len) const
{
return tld_list.has(StringRef{pos, len});
}
};
template <typename Extractor, typename Name>
class FunctionCutToFirstSignificantSubdomainCustomImpl : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionCutToFirstSignificantSubdomainCustomImpl>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!isString(arguments[0].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of first argument of function {}. Must be String.",
arguments[0].type->getName(), getName());
if (!isString(arguments[1].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of second argument (TLD_list_name) of function {}. Must be String/FixedString.",
arguments[1].type->getName(), getName());
const auto * column = arguments[1].column.get();
if (!column || !checkAndGetColumnConstStringOrFixedString(column))
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"The second argument of function {} should be a constant string with the name of the custom TLD",
getName());
return arguments[0].type;
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue<String>());
/// FIXME: convertToFullColumnIfConst() is suboptimal
auto column = arguments[0].column->convertToFullColumnIfConst();
if (const ColumnString * col = checkAndGetColumn<ColumnString>(*column))
{
auto col_res = ColumnString::create();
vector(tld_lookup, col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
return col_res;
}
else
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup,
const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
{
size_t size = offsets.size();
res_offsets.resize(size);
res_data.reserve(size * Extractor::getReserveLengthForElement());
size_t prev_offset = 0;
size_t res_offset = 0;
/// Matched part.
Pos start;
size_t length;
for (size_t i = 0; i < size; ++i)
{
Extractor::execute(tld_lookup, reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length);
res_data.resize(res_data.size() + length + 1);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
res_offset += length + 1;
res_data[res_offset - 1] = 0;
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
};
}

View File

@ -1,6 +1,6 @@
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h> #include <Functions/FunctionStringToString.h>
#include "firstSignificantSubdomain.h" #include "ExtractFirstSignificantSubdomain.h"
namespace DB namespace DB

View File

@ -0,0 +1,43 @@
#include <Functions/FunctionFactory.h>
#include "ExtractFirstSignificantSubdomain.h"
#include "FirstSignificantSubdomainCustomImpl.h"
namespace DB
{
template <bool without_www>
struct CutToFirstSignificantSubdomainCustom
{
static size_t getReserveLengthForElement() { return 15; }
static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
Pos tmp_data;
size_t tmp_length;
Pos domain_end;
ExtractFirstSignificantSubdomain<without_www>::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0)
return;
res_data = tmp_data;
res_size = domain_end - tmp_data;
}
};
struct NameCutToFirstSignificantSubdomainCustom { static constexpr auto name = "cutToFirstSignificantSubdomainCustom"; };
using FunctionCutToFirstSignificantSubdomainCustom = FunctionCutToFirstSignificantSubdomainCustomImpl<CutToFirstSignificantSubdomainCustom<true>, NameCutToFirstSignificantSubdomainCustom>;
struct NameCutToFirstSignificantSubdomainCustomWithWWW { static constexpr auto name = "cutToFirstSignificantSubdomainCustomWithWWW"; };
using FunctionCutToFirstSignificantSubdomainCustomWithWWW = FunctionCutToFirstSignificantSubdomainCustomImpl<CutToFirstSignificantSubdomainCustom<false>, NameCutToFirstSignificantSubdomainCustomWithWWW>;
void registerFunctionCutToFirstSignificantSubdomainCustom(FunctionFactory & factory)
{
factory.registerFunction<FunctionCutToFirstSignificantSubdomainCustom>();
factory.registerFunction<FunctionCutToFirstSignificantSubdomainCustomWithWWW>();
}
}

View File

@ -1,12 +1,13 @@
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h> #include <Functions/FunctionStringToString.h>
#include "firstSignificantSubdomain.h" #include "ExtractFirstSignificantSubdomain.h"
namespace DB namespace DB
{ {
struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; }; struct NameFirstSignificantSubdomain { static constexpr auto name = "firstSignificantSubdomain"; };
using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain<true>>, NameFirstSignificantSubdomain>; using FunctionFirstSignificantSubdomain = FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain<true>>, NameFirstSignificantSubdomain>;
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory) void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory)

View File

@ -0,0 +1,18 @@
#include <Functions/FunctionFactory.h>
#include "ExtractFirstSignificantSubdomain.h"
#include "FirstSignificantSubdomainCustomImpl.h"
namespace DB
{
struct NameFirstSignificantSubdomainCustom { static constexpr auto name = "firstSignificantSubdomainCustom"; };
using FunctionFirstSignificantSubdomainCustom = FunctionCutToFirstSignificantSubdomainCustomImpl<ExtractFirstSignificantSubdomain<true>, NameFirstSignificantSubdomainCustom>;
void registerFunctionFirstSignificantSubdomainCustom(FunctionFactory & factory)
{
factory.registerFunction<FunctionFirstSignificantSubdomainCustom>();
}
}

View File

@ -7,6 +7,7 @@ void registerFunctionProtocol(FunctionFactory & factory);
void registerFunctionDomain(FunctionFactory & factory); void registerFunctionDomain(FunctionFactory & factory);
void registerFunctionDomainWithoutWWW(FunctionFactory & factory); void registerFunctionDomainWithoutWWW(FunctionFactory & factory);
void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory); void registerFunctionFirstSignificantSubdomain(FunctionFactory & factory);
void registerFunctionFirstSignificantSubdomainCustom(FunctionFactory & factory);
void registerFunctionTopLevelDomain(FunctionFactory & factory); void registerFunctionTopLevelDomain(FunctionFactory & factory);
void registerFunctionPort(FunctionFactory & factory); void registerFunctionPort(FunctionFactory & factory);
void registerFunctionPath(FunctionFactory & factory); void registerFunctionPath(FunctionFactory & factory);
@ -20,6 +21,7 @@ void registerFunctionExtractURLParameterNames(FunctionFactory & factory);
void registerFunctionURLHierarchy(FunctionFactory & factory); void registerFunctionURLHierarchy(FunctionFactory & factory);
void registerFunctionURLPathHierarchy(FunctionFactory & factory); void registerFunctionURLPathHierarchy(FunctionFactory & factory);
void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory); void registerFunctionCutToFirstSignificantSubdomain(FunctionFactory & factory);
void registerFunctionCutToFirstSignificantSubdomainCustom(FunctionFactory & factory);
void registerFunctionCutWWW(FunctionFactory & factory); void registerFunctionCutWWW(FunctionFactory & factory);
void registerFunctionCutQueryString(FunctionFactory & factory); void registerFunctionCutQueryString(FunctionFactory & factory);
void registerFunctionCutFragment(FunctionFactory & factory); void registerFunctionCutFragment(FunctionFactory & factory);
@ -34,6 +36,7 @@ void registerFunctionsURL(FunctionFactory & factory)
registerFunctionDomain(factory); registerFunctionDomain(factory);
registerFunctionDomainWithoutWWW(factory); registerFunctionDomainWithoutWWW(factory);
registerFunctionFirstSignificantSubdomain(factory); registerFunctionFirstSignificantSubdomain(factory);
registerFunctionFirstSignificantSubdomainCustom(factory);
registerFunctionTopLevelDomain(factory); registerFunctionTopLevelDomain(factory);
registerFunctionPort(factory); registerFunctionPort(factory);
registerFunctionPath(factory); registerFunctionPath(factory);
@ -47,6 +50,7 @@ void registerFunctionsURL(FunctionFactory & factory)
registerFunctionURLHierarchy(factory); registerFunctionURLHierarchy(factory);
registerFunctionURLPathHierarchy(factory); registerFunctionURLPathHierarchy(factory);
registerFunctionCutToFirstSignificantSubdomain(factory); registerFunctionCutToFirstSignificantSubdomain(factory);
registerFunctionCutToFirstSignificantSubdomainCustom(factory);
registerFunctionCutWWW(factory); registerFunctionCutWWW(factory);
registerFunctionCutQueryString(factory); registerFunctionCutQueryString(factory);
registerFunctionCutFragment(factory); registerFunctionCutFragment(factory);

View File

@ -1,5 +1,7 @@
#pragma once #pragma once
#include <cstdlib>
// Definition of the class generated by gperf, present on gperf/tldLookup.gperf // Definition of the class generated by gperf, present on gperf/tldLookup.gperf
class TopLevelDomainLookupHash class TopLevelDomainLookupHash
{ {

View File

@ -80,6 +80,7 @@ SRCS(
URL/cutQueryString.cpp URL/cutQueryString.cpp
URL/cutQueryStringAndFragment.cpp URL/cutQueryStringAndFragment.cpp
URL/cutToFirstSignificantSubdomain.cpp URL/cutToFirstSignificantSubdomain.cpp
URL/cutToFirstSignificantSubdomainCustom.cpp
URL/cutURLParameter.cpp URL/cutURLParameter.cpp
URL/cutWWW.cpp URL/cutWWW.cpp
URL/decodeURLComponent.cpp URL/decodeURLComponent.cpp
@ -89,6 +90,7 @@ SRCS(
URL/extractURLParameterNames.cpp URL/extractURLParameterNames.cpp
URL/extractURLParameters.cpp URL/extractURLParameters.cpp
URL/firstSignificantSubdomain.cpp URL/firstSignificantSubdomain.cpp
URL/firstSignificantSubdomainCustom.cpp
URL/fragment.cpp URL/fragment.cpp
URL/netloc.cpp URL/netloc.cpp
URL/path.cpp URL/path.cpp

View File

@ -29,6 +29,9 @@ target_link_libraries (string_hash_map PRIVATE dbms)
add_executable (string_hash_map_aggregation string_hash_map.cpp) add_executable (string_hash_map_aggregation string_hash_map.cpp)
target_link_libraries (string_hash_map_aggregation PRIVATE dbms) target_link_libraries (string_hash_map_aggregation PRIVATE dbms)
add_executable (string_hash_set string_hash_set.cpp)
target_link_libraries (string_hash_set PRIVATE dbms)
add_executable (two_level_hash_map two_level_hash_map.cpp) add_executable (two_level_hash_map two_level_hash_map.cpp)
target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR})
target_link_libraries (two_level_hash_map PRIVATE dbms) target_link_libraries (two_level_hash_map PRIVATE dbms)

View File

@ -0,0 +1,83 @@
#include <iomanip>
#include <iostream>
#include <vector>
#include <Compression/CompressedReadBuffer.h>
#include <common/types.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <Interpreters/AggregationCommon.h>
#include <Common/HashTable/HashSet.h>
#include <Common/HashTable/HashTableKeyHolder.h>
#include <Common/HashTable/StringHashSet.h>
#include <Common/Stopwatch.h>
#include <common/StringRef.h>
/// NOTE: see string_hash_map.cpp for usage example
template <typename Set>
void NO_INLINE bench(const std::vector<StringRef> & data, DB::Arena & pool, const char * name)
{
std::cerr << "method " << name << std::endl;
for (auto t = 0ul; t < 7; ++t)
{
Stopwatch watch;
Set set;
typename Set::LookupResult it;
bool inserted;
for (const auto & value : data)
{
if constexpr (std::is_same_v<StringHashSet<>, Set>)
set.emplace(DB::ArenaKeyHolder{value, pool}, inserted);
else
set.emplace(DB::ArenaKeyHolder{value, pool}, it, inserted);
}
watch.stop();
std::cerr << "arena-memory " << pool.size() + set.getBufferSizeInBytes() << std::endl;
std::cerr << "single-run " << std::setprecision(3)
<< watch.elapsedSeconds() << std::endl;
}
}
int main(int argc, char ** argv)
{
if (argc < 3)
{
std::cerr << "Usage: program n m\n";
return 1;
}
size_t n = std::stol(argv[1]);
size_t m = std::stol(argv[2]);
DB::Arena pool(128 * 1024 * 1024);
std::vector<StringRef> data(n);
std::cerr << "sizeof(Key) = " << sizeof(StringRef) << std::endl;
{
Stopwatch watch;
DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
DB::CompressedReadBuffer in2(in1);
std::string tmp;
for (size_t i = 0; i < n && !in2.eof(); ++i)
{
DB::readStringBinary(tmp, in2);
data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size());
}
watch.stop();
std::cerr << std::fixed << std::setprecision(2) << "Vector. Size: " << n << ", elapsed: " << watch.elapsedSeconds() << " ("
<< n / watch.elapsedSeconds() << " elem/sec.)" << std::endl;
}
if (!m || m == 1)
bench<StringHashSet<>>(data, pool, "StringHashSet");
if (!m || m == 2)
bench<HashSetWithSavedHash<StringRef>>(data, pool, "HashSetWithSavedHash");
if (!m || m == 3)
bench<HashSet<StringRef>>(data, pool, "HashSet");
return 0;
}

View File

@ -15,6 +15,16 @@ install (
COMPONENT clickhouse COMPONENT clickhouse
PATTERN "CMakeLists.txt" EXCLUDE PATTERN "CMakeLists.txt" EXCLUDE
PATTERN ".gitignore" EXCLUDE PATTERN ".gitignore" EXCLUDE
PATTERN "top_level_domains" EXCLUDE
)
# Dereference symlink
get_filename_component(TOP_LEVEL_DOMAINS_ABS_DIR config/top_level_domains REALPATH)
install (
DIRECTORY "${TOP_LEVEL_DOMAINS_ABS_DIR}"
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/clickhouse-test/config
USE_SOURCE_PERMISSIONS
COMPONENT clickhouse
) )
install (FILES server-test.xml DESTINATION ${CLICKHOUSE_ETC_DIR}/clickhouse-server COMPONENT clickhouse) install (FILES server-test.xml DESTINATION ${CLICKHOUSE_ETC_DIR}/clickhouse-server COMPONENT clickhouse)

View File

@ -0,0 +1,5 @@
<yandex>
<top_level_domains_lists>
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
</top_level_domains_lists>
</yandex>

View File

@ -0,0 +1,3 @@
<yandex>
<top_level_domains_path replace="replace">/etc/clickhouse-server/top_level_domains/</top_level_domains_path>
</yandex>

View File

@ -31,6 +31,8 @@ ln -sf $SRC_PATH/config.d/test_cluster_with_incorrect_pw.xml $DEST_SERVER_PATH/c
ln -sf $SRC_PATH/config.d/test_keeper_port.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/test_keeper_port.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/logging_no_rotate.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/logging_no_rotate.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/tcp_with_proxy.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/tcp_with_proxy.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/top_level_domains_lists.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/top_level_domains_path.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/users.d/log_queries.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/log_queries.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/readonly.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/readonly.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/access_management.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/access_management.xml $DEST_SERVER_PATH/users.d/
@ -42,6 +44,8 @@ ln -sf $SRC_PATH/strings_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/decimals_dictionary.xml $DEST_SERVER_PATH/ ln -sf $SRC_PATH/decimals_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/executable_dictionary.xml $DEST_SERVER_PATH/ ln -sf $SRC_PATH/executable_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/top_level_domains $DEST_SERVER_PATH/
ln -sf $SRC_PATH/server.key $DEST_SERVER_PATH/ ln -sf $SRC_PATH/server.key $DEST_SERVER_PATH/
ln -sf $SRC_PATH/server.crt $DEST_SERVER_PATH/ ln -sf $SRC_PATH/server.crt $DEST_SERVER_PATH/
ln -sf $SRC_PATH/dhparam.pem $DEST_SERVER_PATH/ ln -sf $SRC_PATH/dhparam.pem $DEST_SERVER_PATH/

View File

@ -0,0 +1 @@
../../docker/test/performance-comparison/config/top_level_domains

View File

@ -1,14 +0,0 @@
<test>
<preconditions>
<table_exists>test.hits</table_exists>
</preconditions>
<settings>
<max_threads>1</max_threads>
</settings>
<query>SELECT count() FROM test.hits WHERE NOT ignore(firstSignificantSubdomain(URL))</query>
</test>

View File

@ -1,11 +1,10 @@
<test> <test>
<preconditions> <preconditions>
<table_exists>hits_100m_single</table_exists> <table_exists>hits_100m_single</table_exists>
<table_exists>test.hits</table_exists>
</preconditions> </preconditions>
<substitutions> <substitutions>
<substitution> <substitution>
<name>func</name> <name>func</name>
@ -32,6 +31,12 @@
</values> </values>
</substitution> </substitution>
</substitutions> </substitutions>
<query>SELECT count() FROM hits_100m_single WHERE NOT ignore({func}(URL))</query> <query>SELECT count() FROM hits_100m_single WHERE NOT ignore({func}(URL))</query>
<!-- firstSignificantSubdomain/firstSignificantSubdomainCustom -->
<query>SELECT count() FROM test.hits WHERE NOT ignore(firstSignificantSubdomain(URL)) SETTINGS max_threads=1</query>
<query>SELECT count() FROM test.hits WHERE NOT ignore(firstSignificantSubdomainCustom(URL, 'public_suffix_list')) SETTINGS max_threads=1</query>
<!-- cutToFirstSignificantSubdomain/cutToFirstSignificantSubdomainCustom -->
<query>SELECT count() FROM test.hits WHERE NOT ignore(cutToFirstSignificantSubdomain(URL)) SETTINGS max_threads=1</query>
<query>SELECT count() FROM test.hits WHERE NOT ignore(cutToFirstSignificantSubdomainCustom(URL, 'public_suffix_list')) SETTINGS max_threads=1</query>
</test> </test>

View File

@ -0,0 +1,11 @@
no-tld
foo.there-is-no-such-domain
foo.there-is-no-such-domain
foo
generic
kernel
kernel.biz.ss
difference
biz.ss
kernel.biz.ss

View File

@ -0,0 +1,16 @@
select 'no-tld';
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
-- even if there is no TLD, 2-nd level by default anyway
-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
select 'generic';
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
select 'difference';
-- biz.ss is not in the default TLD list, hence:
select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss