2020-12-03 21:11:38 +00:00
|
|
|
#pragma once
|
|
|
|
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/defines.h>
|
|
|
|
#include <base/StringRef.h>
|
2022-07-22 07:36:50 +00:00
|
|
|
#include <Common/HashTable/StringHashMap.h>
|
2020-12-03 21:11:38 +00:00
|
|
|
#include <Common/Arena.h>
|
2020-12-03 21:11:38 +00:00
|
|
|
#include <Poco/Util/AbstractConfiguration.h>
|
|
|
|
#include <mutex>
|
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2022-07-22 07:36:50 +00:00
|
|
|
/// Kind of entry stored for a host in a TLD list
/// (used as the mapped value of TLDList::Container).
///
/// Enumerators carry no explicit values, so they are numbered from 0 in declaration order.
enum TLDType
{
    /// Does not exist marker
    TLD_NONE,
    /// For regular lines
    TLD_REGULAR,
    /// For asterisk (*)
    TLD_ANY,
    /// For exclamation mark (!)
    TLD_EXCLUDE,
};
|
|
|
|
|
2020-12-03 21:11:38 +00:00
|
|
|
/// Custom TLD List
|
2020-12-03 21:11:38 +00:00
|
|
|
///
|
2022-07-22 07:36:50 +00:00
|
|
|
/// Unlike tldLookup (which uses gperf) this one uses plain StringHashMap.
|
2020-12-03 21:11:38 +00:00
|
|
|
class TLDList
|
|
|
|
{
|
|
|
|
public:
|
2022-07-22 07:36:50 +00:00
|
|
|
using Container = StringHashMap<TLDType>;
|
2020-12-03 21:11:38 +00:00
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
explicit TLDList(size_t size);
|
2020-12-03 21:11:38 +00:00
|
|
|
|
2022-07-22 07:36:50 +00:00
|
|
|
void insert(const String & host, TLDType type);
|
|
|
|
TLDType lookup(StringRef host) const;
|
2020-12-03 21:11:38 +00:00
|
|
|
size_t size() const { return tld_container.size(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
Container tld_container;
|
2022-07-22 07:36:50 +00:00
|
|
|
std::unique_ptr<Arena> memory_pool;
|
2020-12-03 21:11:38 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
class TLDListsHolder
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
using Map = std::unordered_map<std::string, TLDList>;
|
|
|
|
|
|
|
|
static TLDListsHolder & getInstance();
|
|
|
|
|
|
|
|
/// Parse "top_level_domains_lists" section,
|
|
|
|
/// And add each found dictionary.
|
2020-12-08 20:54:03 +00:00
|
|
|
void parseConfig(const std::string & top_level_domains_path, const Poco::Util::AbstractConfiguration & config);
|
2020-12-03 21:11:38 +00:00
|
|
|
|
|
|
|
/// Parse file and add it as a Set to the list of TLDs
|
|
|
|
/// - "//" -- comment,
|
|
|
|
/// - empty lines will be ignored.
|
|
|
|
///
|
2022-07-22 07:36:50 +00:00
|
|
|
/// Treats the following special symbols:
|
|
|
|
/// - "*"
|
|
|
|
/// - "!"
|
|
|
|
///
|
|
|
|
/// Format : https://github.com/publicsuffix/list/wiki/Format
|
2020-12-03 21:11:38 +00:00
|
|
|
/// Example: https://publicsuffix.org/list/public_suffix_list.dat
|
|
|
|
///
|
|
|
|
/// Return size of the list.
|
|
|
|
size_t parseAndAddTldList(const std::string & name, const std::string & path);
|
|
|
|
/// Throws TLD_LIST_NOT_FOUND if list does not exist
|
|
|
|
const TLDList & getTldList(const std::string & name);
|
|
|
|
|
|
|
|
protected:
|
|
|
|
TLDListsHolder();
|
|
|
|
|
|
|
|
std::mutex tld_lists_map_mutex;
|
2022-06-14 22:35:55 +00:00
|
|
|
Map tld_lists_map TSA_GUARDED_BY(tld_lists_map_mutex);
|
2020-12-03 21:11:38 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|