2020-12-03 21:11:38 +00:00
|
|
|
#include <Common/TLDListsHolder.h>
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2022-04-27 15:05:45 +00:00
|
|
|
#include <Common/logger_useful.h>
|
2020-12-03 21:11:38 +00:00
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2021-07-29 07:19:00 +00:00
|
|
|
#include <IO/ReadHelpers.h>
|
2020-12-03 21:11:38 +00:00
|
|
|
#include <string_view>
|
2020-12-03 21:11:38 +00:00
|
|
|
#include <unordered_set>
|
2020-12-03 21:11:38 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes used in this file; they are only declared here (extern).
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int TLD_LIST_NOT_FOUND;
}
|
|
|
|
|
2022-07-22 07:36:50 +00:00
|
|
|
/// Padding (in bytes) that StringHashTable requires for its keys. The TLDList constructor
/// passes it as both arguments of memory_pool->alignedAlloc() to reserve left padding.
constexpr size_t StringHashTablePadRequirement = 8;
|
|
|
|
|
2020-12-03 21:11:38 +00:00
|
|
|
/// TLDList
|
2020-12-03 21:11:38 +00:00
|
|
|
/// Creates a list whose container is constructed with `size` and whose keys
/// are stored in a freshly created Arena (memory_pool).
TLDList::TLDList(size_t size)
    : tld_container(size)
    , memory_pool(std::make_unique<Arena>())
{
    /// Keys given to StringHashTable must be padded to 8 bytes.
    /// On the right side Arena (memory_pool) already satisfies this,
    /// because it keeps 15 bytes of padding after its data.
    ///
    /// StringHashTable may, however, also reference the byte at offset -1
    /// of a key, so padding on the left is needed as well. Allocate one
    /// aligned chunk up front so that the first key has bytes before it.
    memory_pool->alignedAlloc(StringHashTablePadRequirement, StringHashTablePadRequirement);
}
|
|
|
|
void TLDList::insert(const String & host, TLDType type)
|
2020-12-03 21:11:38 +00:00
|
|
|
{
|
2022-07-22 07:36:50 +00:00
|
|
|
StringRef owned_host{memory_pool->insert(host.data(), host.size()), host.size()};
|
|
|
|
tld_container[owned_host] = type;
|
2020-12-03 21:11:38 +00:00
|
|
|
}
|
2022-07-22 07:36:50 +00:00
|
|
|
/// Returns the type stored for `host`, or TLDType::TLD_NONE when `host` is not in the container.
TLDType TLDList::lookup(StringRef host) const
{
    const auto found = tld_container.find(host);
    if (found == nullptr)
        return TLDType::TLD_NONE;

    return found->getMapped();
}
|
|
|
|
|
|
|
|
/// TLDListsHolder
|
|
|
|
/// Returns the shared holder.
/// The object is a function-local static, so the same instance is returned
/// by reference on every call.
TLDListsHolder & TLDListsHolder::getInstance()
{
    static TLDListsHolder holder;
    return holder;
}
|
|
|
|
/// Defaulted constructor; getInstance() uses it to create its static instance.
TLDListsHolder::TLDListsHolder() = default;
|
|
|
|
|
2020-12-08 20:54:03 +00:00
|
|
|
/// Loads every list mentioned under the `top_level_domains_lists` section of `config`.
///
/// For each key in that section the configured value is appended to
/// `top_level_domains_path` (plain concatenation, no separator is added) and the
/// resulting file is parsed and registered under the key's name.
void TLDListsHolder::parseConfig(const std::string & top_level_domains_path, const Poco::Util::AbstractConfiguration & config)
{
    Poco::Util::AbstractConfiguration::Keys list_names;
    config.keys("top_level_domains_lists", list_names);

    Poco::Logger * log = &Poco::Logger::get("TLDListsHolder");

    for (const auto & list_name : list_names)
    {
        const std::string file_name = config.getString("top_level_domains_lists." + list_name);
        const std::string list_path = top_level_domains_path + file_name;

        LOG_TRACE(log, "{} loading from {}", list_name, list_path);
        const size_t hosts = parseAndAddTldList(list_name, list_path);
        LOG_INFO(log, "{} was added ({} hosts)", list_name, hosts);
    }
}
|
|
|
|
|
|
|
|
size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path)
|
|
|
|
{
|
2022-07-22 07:36:50 +00:00
|
|
|
std::unordered_map<std::string, TLDType> tld_list_tmp;
|
2020-12-03 21:11:38 +00:00
|
|
|
|
|
|
|
ReadBufferFromFile in(path);
|
2022-07-22 07:36:50 +00:00
|
|
|
String buffer;
|
2020-12-03 21:11:38 +00:00
|
|
|
while (!in.eof())
|
|
|
|
{
|
2022-07-22 07:36:50 +00:00
|
|
|
readEscapedStringUntilEOL(buffer, in);
|
2021-08-26 21:43:21 +00:00
|
|
|
if (!in.eof())
|
|
|
|
++in.position();
|
2022-07-22 07:36:50 +00:00
|
|
|
std::string_view line(buffer);
|
2020-12-03 21:11:38 +00:00
|
|
|
/// Skip comments
|
2022-07-22 07:36:50 +00:00
|
|
|
if (line.starts_with("//"))
|
2020-12-03 21:11:38 +00:00
|
|
|
continue;
|
2022-07-22 07:36:50 +00:00
|
|
|
line = line.substr(0, line.rend() - std::find_if_not(line.rbegin(), line.rend(), ::isspace));
|
2020-12-03 21:11:38 +00:00
|
|
|
/// Skip empty line
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
2022-07-22 07:36:50 +00:00
|
|
|
/// Validate special symbols.
|
|
|
|
if (line.starts_with("*."))
|
|
|
|
{
|
|
|
|
line = line.substr(2);
|
|
|
|
tld_list_tmp.emplace(line, TLDType::TLD_ANY);
|
|
|
|
}
|
|
|
|
else if (line[0] == '!')
|
|
|
|
{
|
|
|
|
line = line.substr(1);
|
|
|
|
tld_list_tmp.emplace(line, TLDType::TLD_EXCLUDE);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
tld_list_tmp.emplace(line, TLDType::TLD_REGULAR);
|
2020-12-03 21:11:38 +00:00
|
|
|
}
|
2021-07-29 07:19:00 +00:00
|
|
|
if (!in.eof())
|
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not all list had been read", name);
|
2020-12-03 21:11:38 +00:00
|
|
|
|
|
|
|
TLDList tld_list(tld_list_tmp.size());
|
2022-07-22 07:36:50 +00:00
|
|
|
for (const auto & [host, type] : tld_list_tmp)
|
2020-12-03 21:11:38 +00:00
|
|
|
{
|
2022-07-22 07:36:50 +00:00
|
|
|
tld_list.insert(host, type);
|
2020-12-03 21:11:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
size_t tld_list_size = tld_list.size();
|
|
|
|
std::lock_guard<std::mutex> lock(tld_lists_map_mutex);
|
2020-12-03 21:11:38 +00:00
|
|
|
tld_lists_map.insert(std::make_pair(name, std::move(tld_list)));
|
2020-12-03 21:11:38 +00:00
|
|
|
return tld_list_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the list registered under `name`.
/// The lookup is done while holding tld_lists_map_mutex.
/// Throws TLD_LIST_NOT_FOUND if no list with that name is present in tld_lists_map.
const TLDList & TLDListsHolder::getTldList(const std::string & name)
{
    std::lock_guard<std::mutex> guard(tld_lists_map_mutex);

    if (const auto entry = tld_lists_map.find(name); entry != tld_lists_map.end())
        return entry->second;

    throw Exception(ErrorCodes::TLD_LIST_NOT_FOUND, "TLD list {} does not exist", name);
}
|
|
|
|
|
|
|
|
}
|