ClickHouse/src/Common/SensitiveDataMasker.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

217 lines
6.1 KiB
C++
Raw Normal View History

2019-06-20 07:17:21 +00:00
#include "SensitiveDataMasker.h"
#include <set>
#include <string>
#include <atomic>
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <Poco/Util/AbstractConfiguration.h>
2022-04-27 15:05:45 +00:00
#include <Common/logger_useful.h>
2019-06-20 07:17:21 +00:00
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/ProfileEvents.h>
2019-06-20 07:17:21 +00:00
#ifndef NDEBUG
# include <iostream>
#endif
2019-09-03 20:27:46 +00:00
/// Profile event counter used by this file. It is only declared here (extern);
/// wipeSensitiveData() increments it by the number of replacements it performed.
namespace ProfileEvents
{
extern const Event QueryMaskingRulesMatch;
}
2019-06-20 07:17:21 +00:00
namespace DB
{
/// Error codes thrown from this file. They are only declared here (extern);
/// the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int CANNOT_COMPILE_REGEXP;
    extern const int LOGICAL_ERROR;
    extern const int NO_ELEMENTS_IN_CONFIG;
    extern const int INVALID_CONFIG_PARAMETER;
}
/// One named masking rule: an RE2 pattern together with the text that is
/// substituted for every match of that pattern.
class SensitiveDataMasker::MaskingRule
{
private:
    const std::string name;
    const std::string replacement_string;
    const std::string regexp_string;

    /// Compiled from regexp_string, so it must be declared after it.
    const RE2 regexp;
    /// Non-owning view of replacement_string, so it must be declared after it.
    const re2::StringPiece replacement;

#ifndef NDEBUG
    /// Debug-only running total of replacements; mutable so the const apply() can update it.
    mutable std::atomic<std::uint64_t> matches_count = 0;
#endif

public:
    //* TODO: option with hyperscan? https://software.intel.com/en-us/articles/why-and-how-to-replace-pcre-with-hyperscan
    // re2::set should also work quite fast, but it doesn't return the match position, only which regexp was matched

    /// Stores copies of the three strings and compiles the pattern.
    /// Throws CANNOT_COMPILE_REGEXP (with RE2's error text) if the pattern does not compile.
    MaskingRule(const std::string & name_, const std::string & regexp_string_, const std::string & replacement_string_)
        : name(name_)
        , replacement_string(replacement_string_)
        , regexp_string(regexp_string_)
        , regexp(regexp_string, RE2::Quiet)
        , replacement(replacement_string)
    {
        if (!regexp.ok())
            throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP,
                "SensitiveDataMasker: cannot compile re2: {}, error: {}. "
                "Look at https://github.com/google/re2/wiki/Syntax for reference.",
                regexp_string_, regexp.error());
    }

    /// Rewrites `data` in place via RE2::GlobalReplace and returns the value
    /// GlobalReplace reported (the number of replacements made).
    uint64_t apply(std::string & data) const
    {
        auto m = RE2::GlobalReplace(&data, regexp, replacement);
#ifndef NDEBUG
        matches_count += m;
#endif
        return m;
    }

    const std::string & getName() const { return name; }
    const std::string & getReplacementString() const { return replacement_string; }

#ifndef NDEBUG
    /// Debug-only: total number of replacements made by this rule so far.
    uint64_t getMatchesCount() const { return matches_count; }
#endif
};
2020-03-09 02:10:20 +00:00
/// Destructor defaulted in this translation unit.
/// NOTE(review): presumably out-of-line because MaskingRule is only a complete type here — confirm against the header.
SensitiveDataMasker::~SensitiveDataMasker() = default;
/// The global instance; null until setInstance() installs a masker.
std::unique_ptr<SensitiveDataMasker> SensitiveDataMasker::sensitive_data_masker = nullptr;
void SensitiveDataMasker::setInstance(std::unique_ptr<SensitiveDataMasker> sensitive_data_masker_)
{
if (!sensitive_data_masker_)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: the 'sensitive_data_masker' is not set");
if (sensitive_data_masker_->rulesCount() > 0)
{
sensitive_data_masker = std::move(sensitive_data_masker_);
}
}
/// Returns a non-owning pointer to the global instance, or nullptr when none is installed.
SensitiveDataMasker * SensitiveDataMasker::getInstance()
{
    SensitiveDataMasker * const current = sensitive_data_masker.get();
    return current;
}
2019-09-03 20:27:46 +00:00
2019-06-20 07:17:21 +00:00
/// Builds the masker from the configuration subtree rooted at `config_prefix`.
///
/// Every child key whose name starts with "rule" is read as one masking rule:
///   name    - optional, defaults to the rule's full configuration path;
///   regexp  - required, must be non-empty;
///   replace - optional, defaults to "******".
/// Any other child key is reported with a warning and otherwise ignored.
///
/// Throws INVALID_CONFIG_PARAMETER when two rules share a name and
/// NO_ELEMENTS_IN_CONFIG when a rule's regexp is missing or empty. A DB::Exception
/// raised while adding a rule is rethrown with the rule name appended to its message.
SensitiveDataMasker::SensitiveDataMasker(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
{
    Poco::Util::AbstractConfiguration::Keys keys;
    config.keys(config_prefix, keys);
    Poco::Logger * logger = &Poco::Logger::get("SensitiveDataMaskerConfigRead");

    /// Names seen so far, used to reject duplicates.
    std::set<std::string> used_names;

    for (const auto & rule : keys)
    {
        if (startsWith(rule, "rule"))
        {
            auto rule_config_prefix = config_prefix + "." + rule;

            auto rule_name = config.getString(rule_config_prefix + ".name", rule_config_prefix);

            if (!used_names.insert(rule_name).second)
            {
                throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
                    "query_masking_rules configuration contains more than one rule named '{}'.", rule_name);
            }

            auto regexp = config.getString(rule_config_prefix + ".regexp", "");

            if (regexp.empty())
            {
                throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG,
                    "query_masking_rules configuration, rule '{}' has no <regexp> node or <regexp> "
                    "is empty.", rule_name);
            }

            auto replace = config.getString(rule_config_prefix + ".replace", "******");

            try
            {
                addMaskingRule(rule_name, regexp, replace);
            }
            catch (DB::Exception & e)
            {
                /// Caught by non-const reference because addMessage() modifies the exception.
                e.addMessage("while adding query masking rule '" + rule_name + "'.");
                throw;
            }
        }
        else
        {
            LOG_WARNING(logger, "Unused param {}.{}", config_prefix, rule);
        }
    }

    auto rules_count = rulesCount();
    if (rules_count > 0)
        LOG_INFO(logger, "{} query masking rules loaded.", rules_count);
}
/// Constructs a MaskingRule from the given name, pattern and replacement text and
/// appends it to all_masking_rules. Exceptions thrown by the MaskingRule constructor
/// propagate to the caller; nothing is appended in that case.
void SensitiveDataMasker::addMaskingRule(
    const std::string & name, const std::string & regexp_string, const std::string & replacement_string)
{
    auto new_rule = std::make_unique<MaskingRule>(name, regexp_string, replacement_string);
    all_masking_rules.push_back(std::move(new_rule));
}
2019-09-03 20:27:46 +00:00
size_t SensitiveDataMasker::wipeSensitiveData(std::string & data) const
2019-06-20 07:17:21 +00:00
{
2019-09-03 20:27:46 +00:00
size_t matches = 0;
2020-04-22 00:29:38 +00:00
for (const auto & rule : all_masking_rules)
2019-06-20 07:17:21 +00:00
matches += rule->apply(data);
if (matches)
ProfileEvents::increment(ProfileEvents::QueryMaskingRulesMatch, matches);
2019-06-20 07:17:21 +00:00
return matches;
}
#ifndef NDEBUG
void SensitiveDataMasker::printStats()
{
for (auto & rule : all_masking_rules)
{
std::cout << rule->getName() << " (replacement to " << rule->getReplacementString() << ") matched " << rule->getMatchesCount()
<< " times" << std::endl;
}
}
#endif
2019-09-03 20:27:46 +00:00
size_t SensitiveDataMasker::rulesCount() const
2019-06-20 07:17:21 +00:00
{
return all_masking_rules.size();
}
/// Returns a copy of `str` with the global masker (if one is installed) applied,
/// then shortened to at most `max_length` bytes followed by a
/// "... (truncated N chars)" suffix, where N is the number of bytes removed.
/// A `max_length` of 0 disables truncation.
std::string wipeSensitiveDataAndCutToLength(const std::string & str, size_t max_length)
{
    std::string res = str;

    if (auto * masker = SensitiveDataMasker::getInstance())
        masker->wipeSensitiveData(res);

    const size_t masked_length = res.length();
    if (max_length == 0 || masked_length <= max_length)
        return res;

    /// Compute the dropped amount before resize() changes the length.
    const std::string dropped = std::to_string(masked_length - max_length);
    res.resize(max_length);
    res += "... (truncated " + dropped + " chars)";
    return res;
}
2019-06-20 07:17:21 +00:00
}