From 82ce2d76c31547539cdb6ed4bec6da065e13e7cb Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 3 Jun 2022 12:06:31 +0300 Subject: [PATCH 01/60] Add KerberosInit class; add kerberos_init console example; modify HDFSCommon.cpp --- src/Access/CMakeLists.txt | 3 + src/Access/KerberosInit.cpp | 162 ++++++++++++++++++++++++++ src/Access/KerberosInit.h | 37 ++++++ src/Access/examples/CMakeLists.txt | 4 + src/Access/examples/kerberos_init.cpp | 36 ++++++ src/Storages/HDFS/HDFSCommon.cpp | 15 ++- 6 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 src/Access/KerberosInit.cpp create mode 100644 src/Access/KerberosInit.h create mode 100644 src/Access/examples/CMakeLists.txt create mode 100644 src/Access/examples/kerberos_init.cpp diff --git a/src/Access/CMakeLists.txt b/src/Access/CMakeLists.txt index e69de29bb2d..83bbe418246 100644 --- a/src/Access/CMakeLists.txt +++ b/src/Access/CMakeLists.txt @@ -0,0 +1,3 @@ +if (ENABLE_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp new file mode 100644 index 00000000000..8954eccf136 --- /dev/null +++ b/src/Access/KerberosInit.cpp @@ -0,0 +1,162 @@ +#include +#include + +#include +#include +#include + + +int KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) +{ + auto adqm_log = &Poco::Logger::get("ADQM"); + LOG_DEBUG(adqm_log,"KerberosInit: begin"); + + krb5_error_code ret; + + // todo: use deftype + //const char *deftype = nullptr; + int flags = 0; + principal_name = new char[256]; + std::copy(principal.begin(), principal.end(), principal_name); + principal_name[principal.size()] = '\0'; + + //memset(&k5, 0, sizeof(k5)); + memset(&k5d, 0, sizeof(k5d)); + k5 = &k5d; + //memset(k5, 0, sizeof(k5_data)); + // begin + ret = krb5_init_context(&k5->ctx); + if (ret) + throw DB::Exception(0, "Error while initializing Kerberos 5 library"); + + if (!cache_name.empty()) + { + ret = krb5_cc_resolve(k5->ctx, cache_name.c_str(), &k5->out_cc); + // todo: analyze return code + LOG_DEBUG(adqm_log,"Resolved cache"); + } + else + { + // Resolve the default ccache and get its type and default principal (if it is initialized). + ret = krb5_cc_default(k5->ctx, &defcache); + if (ret) + throw DB::Exception(0, "Error while getting default ccache"); + LOG_DEBUG(adqm_log,"Resolved default cache"); + // todo: deftype + /*deftype = */krb5_cc_get_type(k5->ctx, defcache); + if (krb5_cc_get_principal(k5->ctx, defcache, &defcache_princ) != 0) + defcache_princ = nullptr; + } + + // Use the specified principal name. 
+ ret = krb5_parse_name_flags(k5->ctx, principal_name, flags, &k5->me); + if (ret) + throw DB::Exception(0, "Error when parsing principal name " + String(principal_name)); + + + // to-do: add more cache init commands + + ret = krb5_unparse_name(k5->ctx, k5->me, &k5->name); + if (ret) + throw DB::Exception(0, "Error when unparsing name"); + + LOG_DEBUG(adqm_log,"KerberosInit: Using principal: {}", k5->name); + + principal_name = k5->name; + + // init: + memset(&my_creds, 0, sizeof(my_creds)); + + ret = krb5_get_init_creds_opt_alloc(k5->ctx, &options); + if (ret) + throw DB::Exception(0, "Error in options allocation"); + + // todo +/* +#ifndef _WIN32 + if (strncmp(opts->keytab_name, "KDB:", 4) == 0) { + ret = kinit_kdb_init(&k5->ctx, k5->me->realm.data); + errctx = k5->ctx; + if (ret) { + com_err(progname, ret, + _("while setting up KDB keytab for realm %s"), + k5->me->realm.data); + goto cleanup; + } + } +#endif +*/ + // Resolve keytab + ret = krb5_kt_resolve(k5->ctx, keytab_file.c_str(), &keytab); + if (ret) + throw DB::Exception(0, "Error resolving keytab "+keytab_file); + + LOG_DEBUG(adqm_log,"KerberosInit: Using keytab: {}", keytab_file); + + // todo: num_pa_opts + + + // todo: in_cc / ccache + + // action: init or renew + // todo: doing only init action: + ret = krb5_get_init_creds_keytab(k5->ctx, &my_creds, k5->me, keytab, 0, nullptr, options); + if (ret) + LOG_DEBUG(adqm_log,"Getting initial credentials"); + + // todo: implement renew action + + + LOG_DEBUG(adqm_log,"Authenticated to Kerberos v5"); + LOG_DEBUG(adqm_log,"KerberosInit: end"); + return 0; +} + +KerberosInit::~KerberosInit() +{ + if (k5->ctx) + { + //begin. cleanup: + if (defcache) + krb5_cc_close(k5->ctx, defcache); + //todo + krb5_free_principal(k5->ctx, defcache_princ); + + // init. 
cleanup: + //todo: + /* + #ifndef _WIN32 + kinit_kdb_fini(); + #endif + */ + if (options) + krb5_get_init_creds_opt_free(k5->ctx, options); + if (my_creds.client == k5->me) + my_creds.client = nullptr; + /* + if (opts->pa_opts) { + free(opts->pa_opts); + opts->pa_opts = NULL; + opts->num_pa_opts = 0; + } + */ + krb5_free_cred_contents(k5->ctx, &my_creds); + if (keytab) + krb5_kt_close(k5->ctx, keytab); + + + // end: + krb5_free_unparsed_name(k5->ctx, k5->name); + krb5_free_principal(k5->ctx, k5->me); + /* + if (k5->in_cc != NULL) + krb5_cc_close(k5->ctx, k5->in_cc); + if (k5->out_cc != NULL) + krb5_cc_close(k5->ctx, k5->out_cc); + */ + krb5_free_context(k5->ctx); + } + memset(k5, 0, sizeof(*k5)); + + delete[] principal_name; +} diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h new file mode 100644 index 00000000000..ed64ba3b4a8 --- /dev/null +++ b/src/Access/KerberosInit.h @@ -0,0 +1,37 @@ +#pragma once + +#include "config_core.h" + +#include + +//#include +//#include "k5-platform.h" +#include +//#include + +struct k5_data +{ + krb5_context ctx; + krb5_ccache in_cc, out_cc; + krb5_principal me; + char *name; + krb5_boolean switch_to_cache; +}; + +class KerberosInit +{ +public: + int init(const String & keytab_file, const String & principal, const String & cache_name = ""); + ~KerberosInit(); +private: + struct k5_data * k5 = nullptr; + //struct k5_data k5; + struct k5_data k5d; + krb5_ccache defcache = nullptr; + krb5_get_init_creds_opt *options = nullptr; + krb5_creds my_creds; + krb5_keytab keytab = nullptr; + char * principal_name; + krb5_principal defcache_princ = nullptr; +}; + diff --git a/src/Access/examples/CMakeLists.txt b/src/Access/examples/CMakeLists.txt new file mode 100644 index 00000000000..07f75ca0b47 --- /dev/null +++ b/src/Access/examples/CMakeLists.txt @@ -0,0 +1,4 @@ +if (TARGET ch_contrib::krb5) + add_executable (kerberos_init kerberos_init.cpp) + target_link_libraries (kerberos_init PRIVATE dbms ch_contrib::krb5) +endif() diff --git a/src/Access/examples/kerberos_init.cpp b/src/Access/examples/kerberos_init.cpp new file mode 100644 index 00000000000..fb2575dc581 --- /dev/null +++ b/src/Access/examples/kerberos_init.cpp @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include +#include + +using namespace DB; + +int main(int argc, char ** argv) +{ + std::cout << "Kerberos Init" << "\n"; + + if (argc < 3) + { + std::cout << "Usage:" << "\n" << " kerberos_init keytab principal [cache]" << "\n"; + return 0; + } + + String cache_name = ""; + if (argc == 4) + cache_name = argv[3]; + + Poco::AutoPtr app_channel(new Poco::ConsoleChannel(std::cerr)); + Poco::Logger::root().setChannel(app_channel); + Poco::Logger::root().setLevel("trace"); + + KerberosInit k_init; + try { + k_init.init(argv[1], argv[2], cache_name); + } catch (const Exception & e) { + std::cout << "KerberosInit failure: " << getExceptionMessage(e, false) << "\n"; + } + std::cout << "Done" << "\n"; + return 0; +} diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 2f7b03790ee..71989bfd549 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB @@ -101,7 +102,7 @@ String HDFSBuilderWrapper::getKinitCmd() } void HDFSBuilderWrapper::runKinit() -{ +{ /* String cmd = getKinitCmd(); LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "running kinit: {}", cmd); @@ -113,6 +114,18 @@ void HDFSBuilderWrapper::runKinit() { throw Exception("kinit failure: " + cmd, 
ErrorCodes::BAD_ARGUMENTS); } + */ + LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: running KerberosInit"); + std::unique_lock lck(kinit_mtx); + KerberosInit k_init; + try { + k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal); + } catch (const DB::Exception & e) { + String msg = "KerberosInit failure: " + DB::getExceptionMessage(e, false); + LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: {}",msg); + throw Exception(0, msg); + } + LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: finished KerberosInit"); } HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::AbstractConfiguration & config) From 8b5bf0292748c5cfb0875b66c1d3a02118fea39d Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 3 Jun 2022 18:07:18 +0300 Subject: [PATCH 02/60] Add support of cache commands in KerberosInit --- src/Access/KerberosInit.cpp | 88 +++++++++++++++------------ src/Access/KerberosInit.h | 1 - src/Access/examples/kerberos_init.cpp | 3 +- src/Storages/HDFS/HDFSCommon.cpp | 2 +- 4 files changed, 53 insertions(+), 41 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 8954eccf136..0c643e27bd2 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -1,10 +1,9 @@ #include #include - #include #include #include - +#include int KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) { @@ -14,16 +13,14 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con krb5_error_code ret; // todo: use deftype - //const char *deftype = nullptr; + const char *deftype = nullptr; int flags = 0; - principal_name = new char[256]; - std::copy(principal.begin(), principal.end(), principal_name); - principal_name[principal.size()] = '\0'; - //memset(&k5, 0, sizeof(k5)); + if (!std::filesystem::exists(keytab_file)) + throw DB::Exception(0, "Error keytab file does not exist"); + memset(&k5d, 0, sizeof(k5d)); k5 = &k5d; - //memset(k5, 0, sizeof(k5_data)); // begin ret = krb5_init_context(&k5->ctx); if (ret) @@ -42,19 +39,46 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con if (ret) throw DB::Exception(0, "Error while getting default ccache"); LOG_DEBUG(adqm_log,"Resolved default cache"); - // todo: deftype - /*deftype = */krb5_cc_get_type(k5->ctx, defcache); + deftype = krb5_cc_get_type(k5->ctx, defcache); if (krb5_cc_get_principal(k5->ctx, defcache, &defcache_princ) != 0) defcache_princ = nullptr; } // Use the specified principal name. - ret = krb5_parse_name_flags(k5->ctx, principal_name, flags, &k5->me); + ret = krb5_parse_name_flags(k5->ctx, principal.c_str(), flags, &k5->me); if (ret) - throw DB::Exception(0, "Error when parsing principal name " + String(principal_name)); + throw DB::Exception(0, "Error when parsing principal name " + principal); + // Cache related commands + if (k5->out_cc == nullptr && krb5_cc_support_switch(k5->ctx, deftype)) + { + // Use an existing cache for the client principal if we can. + ret = krb5_cc_cache_match(k5->ctx, k5->me, &k5->out_cc); + if (ret && ret != KRB5_CC_NOTFOUND) + throw DB::Exception(0, "Error while searching for ccache for " + principal); + if (!ret) + { + LOG_DEBUG(adqm_log,"Using default cache: {}", krb5_cc_get_name(k5->ctx, k5->out_cc)); + k5->switch_to_cache = 1; + } + else if (defcache_princ != nullptr) + { + // Create a new cache to avoid overwriting the initialized default cache. 
+ ret = krb5_cc_new_unique(k5->ctx, deftype, nullptr, &k5->out_cc); + if (ret) + throw DB::Exception(0, "Error while generating new ccache"); + LOG_DEBUG(adqm_log,"Using default cache: {}", krb5_cc_get_name(k5->ctx, k5->out_cc)); + k5->switch_to_cache = 1; + } + } - // to-do: add more cache init commands + // Use the default cache if we haven't picked one yet. + if (k5->out_cc == nullptr) + { + k5->out_cc = defcache; + defcache = nullptr; + LOG_DEBUG(adqm_log,"Using default cache: {}", krb5_cc_get_name(k5->ctx, k5->out_cc)); + } ret = krb5_unparse_name(k5->ctx, k5->me, &k5->name); if (ret) @@ -62,8 +86,6 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con LOG_DEBUG(adqm_log,"KerberosInit: Using principal: {}", k5->name); - principal_name = k5->name; - // init: memset(&my_creds, 0, sizeof(my_creds)); @@ -93,20 +115,23 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con LOG_DEBUG(adqm_log,"KerberosInit: Using keytab: {}", keytab_file); - // todo: num_pa_opts - - - // todo: in_cc / ccache + if (k5->in_cc) + { + ret = krb5_get_init_creds_opt_set_in_ccache(k5->ctx, options, k5->in_cc); + if (ret) + throw DB::Exception(0, "Error in setting input credential cache"); + } + ret = krb5_get_init_creds_opt_set_out_ccache(k5->ctx, options, k5->out_cc); + if (ret) + throw DB::Exception(0, "Error in setting output credential cache"); // action: init or renew + // todo: implement renew action // todo: doing only init action: ret = krb5_get_init_creds_keytab(k5->ctx, &my_creds, k5->me, keytab, 0, nullptr, options); if (ret) LOG_DEBUG(adqm_log,"Getting initial credentials"); - // todo: implement renew action - - LOG_DEBUG(adqm_log,"Authenticated to Kerberos v5"); LOG_DEBUG(adqm_log,"KerberosInit: end"); return 0; @@ -119,7 +144,6 @@ KerberosInit::~KerberosInit() //begin. cleanup: if (defcache) krb5_cc_close(k5->ctx, defcache); - //todo krb5_free_principal(k5->ctx, defcache_princ); // init. cleanup: @@ -133,30 +157,18 @@ KerberosInit::~KerberosInit() krb5_get_init_creds_opt_free(k5->ctx, options); if (my_creds.client == k5->me) my_creds.client = nullptr; - /* - if (opts->pa_opts) { - free(opts->pa_opts); - opts->pa_opts = NULL; - opts->num_pa_opts = 0; - } - */ krb5_free_cred_contents(k5->ctx, &my_creds); if (keytab) krb5_kt_close(k5->ctx, keytab); - - // end: + // end. 
cleanup: krb5_free_unparsed_name(k5->ctx, k5->name); krb5_free_principal(k5->ctx, k5->me); - /* - if (k5->in_cc != NULL) + if (k5->in_cc != nullptr) krb5_cc_close(k5->ctx, k5->in_cc); - if (k5->out_cc != NULL) + if (k5->out_cc != nullptr) krb5_cc_close(k5->ctx, k5->out_cc); - */ krb5_free_context(k5->ctx); } memset(k5, 0, sizeof(*k5)); - - delete[] principal_name; } diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index ed64ba3b4a8..07f619aaa0b 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -31,7 +31,6 @@ private: krb5_get_init_creds_opt *options = nullptr; krb5_creds my_creds; krb5_keytab keytab = nullptr; - char * principal_name; krb5_principal defcache_princ = nullptr; }; diff --git a/src/Access/examples/kerberos_init.cpp b/src/Access/examples/kerberos_init.cpp index fb2575dc581..554d57fb8b2 100644 --- a/src/Access/examples/kerberos_init.cpp +++ b/src/Access/examples/kerberos_init.cpp @@ -26,7 +26,8 @@ int main(int argc, char ** argv) Poco::Logger::root().setLevel("trace"); KerberosInit k_init; - try { + try + { k_init.init(argv[1], argv[2], cache_name); } catch (const Exception & e) { std::cout << "KerberosInit failure: " << getExceptionMessage(e, false) << "\n"; diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 71989bfd549..f2ec7af368a 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -119,7 +119,7 @@ void HDFSBuilderWrapper::runKinit() std::unique_lock lck(kinit_mtx); KerberosInit k_init; try { - k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal); + k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal,hadoop_security_kerberos_ticket_cache_path); } catch (const DB::Exception & e) { String msg = "KerberosInit failure: " + DB::getExceptionMessage(e, false); LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: {}",msg); From 323835f51d600f4b406492d5122304095b4b7cbb Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Mon, 6 Jun 2022 11:34:10 +0300 Subject: [PATCH 03/60] Add renew/init logic in KerberosInit --- src/Access/KerberosInit.cpp | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 0c643e27bd2..0142708c699 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -126,11 +126,35 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con throw DB::Exception(0, "Error in setting output credential cache"); // action: init or renew - // todo: implement renew action - // todo: doing only init action: - ret = krb5_get_init_creds_keytab(k5->ctx, &my_creds, k5->me, keytab, 0, nullptr, options); + LOG_DEBUG(adqm_log,"Trying to renew credentials"); + ret = krb5_get_renewed_creds(k5->ctx, &my_creds, k5->me, k5->out_cc, nullptr); if (ret) - LOG_DEBUG(adqm_log,"Getting initial credentials"); + { + LOG_DEBUG(adqm_log,"Renew failed, making init credentials"); + ret = krb5_get_init_creds_keytab(k5->ctx, &my_creds, k5->me, keytab, 0, nullptr, options); + if (ret) + throw DB::Exception(0, "Error in init"); + else + LOG_DEBUG(adqm_log,"Getting initial credentials"); + } + else + { + LOG_DEBUG(adqm_log,"Successfull reviewal"); + ret = krb5_cc_initialize(k5->ctx, k5->out_cc, k5->me); + if (ret) + throw DB::Exception(0, "Error when initializing cache"); + LOG_DEBUG(adqm_log,"Initialized cache"); + ret = krb5_cc_store_cred(k5->ctx, k5->out_cc, &my_creds); + if (ret) + LOG_DEBUG(adqm_log,"Error while storing 
credentials"); + LOG_DEBUG(adqm_log,"Stored credentials"); + } + + if (k5->switch_to_cache) { + ret = krb5_cc_switch(k5->ctx, k5->out_cc); + if (ret) + throw DB::Exception(0, "Error while switching to new ccache"); + } LOG_DEBUG(adqm_log,"Authenticated to Kerberos v5"); LOG_DEBUG(adqm_log,"KerberosInit: end"); From cb53aa15ec0a670eb98fd07738d95b74cbd1f73c Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 7 Jun 2022 12:06:22 +0300 Subject: [PATCH 04/60] Fix HDFSCommon and test_storage_kerberized_hdfs to make running integration tests --- src/Storages/HDFS/HDFSCommon.cpp | 4 +--- tests/integration/test_storage_kerberized_hdfs/test.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index f2ec7af368a..b13866d7a7b 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -121,9 +121,7 @@ void HDFSBuilderWrapper::runKinit() try { k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal,hadoop_security_kerberos_ticket_cache_path); } catch (const DB::Exception & e) { - String msg = "KerberosInit failure: " + DB::getExceptionMessage(e, false); - LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: {}",msg); - throw Exception(0, msg); + throw Exception("KerberosInit failure: "+ DB::getExceptionMessage(e, false), ErrorCodes::BAD_ARGUMENTS); } LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: finished KerberosInit"); } diff --git a/tests/integration/test_storage_kerberized_hdfs/test.py b/tests/integration/test_storage_kerberized_hdfs/test.py index fb00403b952..5ac8b4670f9 100644 --- a/tests/integration/test_storage_kerberized_hdfs/test.py +++ b/tests/integration/test_storage_kerberized_hdfs/test.py @@ -113,7 +113,7 @@ def test_read_table_expired(started_cluster): ) assert False, "Exception have to be thrown" except Exception as ex: - assert "DB::Exception: kinit failure:" in str(ex) + assert "DB::Exception: KerberosInit failure:" in str(ex) started_cluster.unpause_container("hdfskerberos") From a156a77890eeab8423f86a93e7a48e2784eeb0b0 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 7 Jun 2022 14:59:46 +0300 Subject: [PATCH 05/60] Add KerberosInit into StorageKafka --- src/Access/KerberosInit.cpp | 1 + src/Storages/HDFS/HDFSCommon.cpp | 3 ++- src/Storages/Kafka/StorageKafka.cpp | 21 +++++++++++++++++++ .../test_storage_kerberized_kafka/test.py | 4 +++- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 0142708c699..724aa223c2b 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -9,6 +9,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con { auto adqm_log = &Poco::Logger::get("ADQM"); LOG_DEBUG(adqm_log,"KerberosInit: begin"); + //LOG_DEBUG(adqm_log,"KerberosInit: do nothing"); return 0; krb5_error_code ret; diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index b13866d7a7b..35319ce1d83 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -118,7 +118,8 @@ void HDFSBuilderWrapper::runKinit() LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: running KerberosInit"); std::unique_lock lck(kinit_mtx); KerberosInit k_init; - try { + try + { k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal,hadoop_security_kerberos_ticket_cache_path); } catch (const DB::Exception & e) { throw Exception("KerberosInit failure: "+ DB::getExceptionMessage(e, false), ErrorCodes::BAD_ARGUMENTS); 
diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 2409f8dcb6e..f6850f02511 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -44,6 +44,8 @@ #include #include +#include + namespace CurrentMetrics { @@ -515,6 +517,25 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) if (config.has(config_prefix)) loadFromConfig(conf, config, config_prefix); + if (conf.has_property("sasl.kerberos.keytab") && conf.has_property("sasl.kerberos.principal")) + { + LOG_DEBUG(log, "ADQM: preparing KerberosInit"); + String keytab = conf.get("sasl.kerberos.keytab"); + String principal = conf.get("sasl.kerberos.principal"); + LOG_DEBUG(log, "ADQM: keytab: {}, principal: {}", keytab, principal); + LOG_DEBUG(log, "ADQM: running KerberosInit"); + KerberosInit k_init; + try + { + k_init.init(keytab,principal); + } catch (const DB::Exception & e) { + LOG_ERROR(log, "ADQM: KerberosInit failure: {}", DB::getExceptionMessage(e, false)); + } + LOG_DEBUG(log, "ADQM: finished KerberosInit"); + conf.set("sasl.kerberos.kinit.cmd",""); + conf.set("sasl.kerberos.min.time.before.relogin","0"); + } + // Update consumer topic-specific configuration for (const auto & topic : topics) { diff --git a/tests/integration/test_storage_kerberized_kafka/test.py b/tests/integration/test_storage_kerberized_kafka/test.py index 6347ba89c16..98d39dd5d4f 100644 --- a/tests/integration/test_storage_kerberized_kafka/test.py +++ b/tests/integration/test_storage_kerberized_kafka/test.py @@ -122,6 +122,7 @@ def test_kafka_json_as_string(kafka_cluster): {"t": 124, "e": {"x": "test"} } {"F1":"V1","F2":{"F21":"V21","F22":{},"F23":"V23","F24":"2019-12-24T16:28:04"},"F3":"V3"} """ + logging.debug("ADQM: logs: %s", instance.grep_in_log("ADQM")) assert TSV(result) == TSV(expected) assert instance.contains_in_log( "Parsing of message (topic: kafka_json_as_string, partition: 0, offset: 1) return no rows" @@ -182,7 +183,8 @@ def test_kafka_json_as_string_no_kdc(kafka_cluster): assert TSV(result) == TSV(expected) assert instance.contains_in_log("StorageKafka (kafka_no_kdc): Nothing to commit") assert instance.contains_in_log("Ticket expired") - assert instance.contains_in_log("Kerberos ticket refresh failed") + #~ assert instance.contains_in_log("Kerberos ticket refresh failed") + assert instance.contains_in_log("KerberosInit failure:") if __name__ == "__main__": From 2b76d0c6a992631b7727c5b051c98d4e077f4803 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 8 Jun 2022 12:26:35 +0300 Subject: [PATCH 06/60] Add new integration test for kerberized Kafka; remove old kinit code from HDFSCommon --- src/Access/KerberosInit.cpp | 1 - src/Storages/HDFS/HDFSCommon.cpp | 38 +---------------- src/Storages/HDFS/HDFSCommon.h | 2 - .../test_storage_kerberized_kafka/test.py | 41 ++++++++++++++++++- 4 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 724aa223c2b..1fa550a4ec1 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -13,7 +13,6 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con krb5_error_code ret; - // todo: use deftype const char *deftype = nullptr; int flags = 0; diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 35319ce1d83..a3554bafeff 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -77,44 +77,8 @@ void HDFSBuilderWrapper::loadFromConfig(const 
Poco::Util::AbstractConfiguration } } -String HDFSBuilderWrapper::getKinitCmd() -{ - - if (hadoop_kerberos_keytab.empty() || hadoop_kerberos_principal.empty()) - { - throw Exception("Not enough parameters to run kinit", - ErrorCodes::NO_ELEMENTS_IN_CONFIG); - } - - WriteBufferFromOwnString ss; - - String cache_name = hadoop_security_kerberos_ticket_cache_path.empty() ? - String() : - (String(" -c \"") + hadoop_security_kerberos_ticket_cache_path + "\""); - - // command to run looks like - // kinit -R -t /keytab_dir/clickhouse.keytab -k somebody@TEST.CLICKHOUSE.TECH || .. - ss << hadoop_kerberos_kinit_command << cache_name << - " -R -t \"" << hadoop_kerberos_keytab << "\" -k " << hadoop_kerberos_principal << - "|| " << hadoop_kerberos_kinit_command << cache_name << " -t \"" << - hadoop_kerberos_keytab << "\" -k " << hadoop_kerberos_principal; - return ss.str(); -} - void HDFSBuilderWrapper::runKinit() -{ /* - String cmd = getKinitCmd(); - LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "running kinit: {}", cmd); - - std::unique_lock lck(kinit_mtx); - - auto command = ShellCommand::execute(cmd); - auto status = command->tryWait(); - if (status) - { - throw Exception("kinit failure: " + cmd, ErrorCodes::BAD_ARGUMENTS); - } - */ +{ LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: running KerberosInit"); std::unique_lock lck(kinit_mtx); KerberosInit k_init; diff --git a/src/Storages/HDFS/HDFSCommon.h b/src/Storages/HDFS/HDFSCommon.h index 0523849abe5..773661b0745 100644 --- a/src/Storages/HDFS/HDFSCommon.h +++ b/src/Storages/HDFS/HDFSCommon.h @@ -69,8 +69,6 @@ public: private: void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const String & prefix, bool isUser = false); - String getKinitCmd(); - void runKinit(); // hdfs builder relies on an external config data storage diff --git a/tests/integration/test_storage_kerberized_kafka/test.py b/tests/integration/test_storage_kerberized_kafka/test.py index 98d39dd5d4f..091307a2b01 100644 --- a/tests/integration/test_storage_kerberized_kafka/test.py +++ b/tests/integration/test_storage_kerberized_kafka/test.py @@ -122,12 +122,51 @@ def test_kafka_json_as_string(kafka_cluster): {"t": 124, "e": {"x": "test"} } {"F1":"V1","F2":{"F21":"V21","F22":{},"F23":"V23","F24":"2019-12-24T16:28:04"},"F3":"V3"} """ - logging.debug("ADQM: logs: %s", instance.grep_in_log("ADQM")) assert TSV(result) == TSV(expected) assert instance.contains_in_log( "Parsing of message (topic: kafka_json_as_string, partition: 0, offset: 1) return no rows" ) +def test_kafka_json_as_string_request_new_ticket_after_expiration(kafka_cluster): + # Ticket should be expired after the wait time + # On run of SELECT query new ticket should be requested and SELECT query should run fine. 
+ + kafka_produce( + kafka_cluster, + "kafka_json_as_string", + [ + '{"t": 123, "e": {"x": "woof"} }', + "", + '{"t": 124, "e": {"x": "test"} }', + '{"F1":"V1","F2":{"F21":"V21","F22":{},"F23":"V23","F24":"2019-12-24T16:28:04"},"F3":"V3"}', + ], + ) + + instance.query( + """ + CREATE TABLE test.kafka (field String) + ENGINE = Kafka + SETTINGS kafka_broker_list = 'kerberized_kafka1:19092', + kafka_topic_list = 'kafka_json_as_string', + kafka_commit_on_select = 1, + kafka_group_name = 'kafka_json_as_string', + kafka_format = 'JSONAsString', + kafka_flush_interval_ms=1000; + """ + ) + + time.sleep(45) # wait for ticket expiration + + result = instance.query("SELECT * FROM test.kafka;") + expected = """\ +{"t": 123, "e": {"x": "woof"} } +{"t": 124, "e": {"x": "test"} } +{"F1":"V1","F2":{"F21":"V21","F22":{},"F23":"V23","F24":"2019-12-24T16:28:04"},"F3":"V3"} +""" + assert TSV(result) == TSV(expected) + assert instance.contains_in_log( + "Parsing of message (topic: kafka_json_as_string, partition: 0, offset: 1) return no rows" + ) def test_kafka_json_as_string_no_kdc(kafka_cluster): # When the test is run alone (not preceded by any other kerberized kafka test), From 3cfea6e76f88e26b24cf4e6a371f033216dfa37f Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 8 Jun 2022 17:57:45 +0300 Subject: [PATCH 07/60] Cleanup code in KerberosInit, HDFSCommon and StorageKafka; update English and Russian documentation. --- .../table-engines/integrations/hdfs.md | 4 +- .../table-engines/integrations/kafka.md | 2 +- .../table-engines/integrations/hdfs.md | 3 +- .../table-engines/integrations/kafka.md | 2 +- src/Access/KerberosInit.cpp | 188 ++++++++---------- src/Access/KerberosInit.h | 19 +- src/Access/examples/kerberos_init.cpp | 2 + src/Storages/HDFS/HDFSCommon.cpp | 11 +- src/Storages/HDFS/HDFSCommon.h | 3 - src/Storages/Kafka/StorageKafka.cpp | 4 +- 10 files changed, 104 insertions(+), 134 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 503bd779abf..146d1fcb3a6 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -186,7 +186,6 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us | - | - | |hadoop\_kerberos\_keytab | "" | |hadoop\_kerberos\_principal | "" | -|hadoop\_kerberos\_kinit\_command | kinit | |libhdfs3\_conf | "" | ### Limitations {#limitations} @@ -200,8 +199,7 @@ Note that due to libhdfs3 limitations only old-fashioned approach is supported, datanode communications are not secured by SASL (`HADOOP_SECURE_DN_USER` is a reliable indicator of such security approach). Use `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` for reference. -If `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` or `hadoop_kerberos_kinit_command` is specified, `kinit` will be invoked. `hadoop_kerberos_keytab` and `hadoop_kerberos_principal` are mandatory in this case. `kinit` tool and krb5 configuration files are required. - +If `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` or `hadoop_security_kerberos_ticket_cache_path` are specified, Kerberos authentication will be used. `hadoop_kerberos_keytab` and `hadoop_kerberos_principal` are mandatory in this case. ## HDFS Namenode HA support {#namenode-ha} libhdfs3 support HDFS namenode HA. 
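For illustration, the Kerberos parameters documented above are set in the `hdfs` section of the ClickHouse server configuration. A minimal sketch of such a configuration (keytab path, principal and ticket-cache path are placeholder values, not taken from this change) might look like:

```xml
<clickhouse>
    <hdfs>
        <!-- mandatory when Kerberos authentication is used -->
        <hadoop_kerberos_keytab>/path/to/clickhouse.keytab</hadoop_kerberos_keytab>
        <hadoop_kerberos_principal>clickhouse@EXAMPLE.COM</hadoop_kerberos_principal>
        <!-- optional and global-level only: explicit ticket cache path -->
        <hadoop_security_kerberos_ticket_cache_path>/tmp/krb5cc_clickhouse</hadoop_security_kerberos_ticket_cache_path>
    </hdfs>
</clickhouse>
```

With settings along these lines, `HDFSBuilderWrapper::runKinit()` obtains or renews the ticket through `KerberosInit` rather than spawning an external `kinit` process.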
diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index a9d13194a59..94fcbab51ad 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -168,7 +168,7 @@ For a list of possible configuration options, see the [librdkafka configuration ### Kerberos support {#kafka-kerberos-support} To deal with Kerberos-aware Kafka, add `security_protocol` child element with `sasl_plaintext` value. It is enough if Kerberos ticket-granting ticket is obtained and cached by OS facilities. -ClickHouse is able to maintain Kerberos credentials using a keytab file. Consider `sasl_kerberos_service_name`, `sasl_kerberos_keytab`, `sasl_kerberos_principal` and `sasl.kerberos.kinit.cmd` child elements. +ClickHouse is able to maintain Kerberos credentials using a keytab file. Consider `sasl_kerberos_service_name`, `sasl_kerberos_keytab` and `sasl_kerberos_principal` child elements. Example: diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 0857359e987..84f31c0afcc 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -183,7 +183,6 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 | - | - | |hadoop\_kerberos\_keytab | "" | |hadoop\_kerberos\_principal | "" | -|hadoop\_kerberos\_kinit\_command | kinit | ### Ограничения {#limitations} * `hadoop_security_kerberos_ticket_cache_path` и `libhdfs3_conf` могут быть определены только на глобальном, а не на пользовательском уровне @@ -196,7 +195,7 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 коммуникация с узлами данных не защищена SASL (`HADOOP_SECURE_DN_USER` надежный показатель такого подхода к безопасности). Используйте `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` для примера настроек. -Если `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` или `hadoop_kerberos_kinit_command` указаны в настройках, `kinit` будет вызван. `hadoop_kerberos_keytab` и `hadoop_kerberos_principal` обязательны в этом случае. Необходимо также будет установить `kinit` и файлы конфигурации krb5. +Если `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` или `hadoop_security_kerberos_ticket_cache_path` указаны в настройках, будет использоваться аутентификация с помощью Kerberos. `hadoop_kerberos_keytab` и `hadoop_kerberos_principal` обязательны в этом случае. ## Виртуальные столбцы {#virtual-columns} diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index b24a096015d..b51a0113302 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -167,7 +167,7 @@ Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format ### Поддержка Kerberos {#kafka-kerberos-support} Чтобы начать работу с Kafka с поддержкой Kerberos, добавьте дочерний элемент `security_protocol` со значением `sasl_plaintext`. Этого будет достаточно, если получен тикет на получение тикета (ticket-granting ticket) Kerberos и он кэшируется средствами ОС. -ClickHouse может поддерживать учетные данные Kerberos с помощью файла keytab. Рассмотрим дочерние элементы `sasl_kerberos_service_name`, `sasl_kerberos_keytab`, `sasl_kerberos_principal` и `sasl.kerberos.kinit.cmd`. 
+ClickHouse может поддерживать учетные данные Kerberos с помощью файла keytab. Рассмотрим дочерние элементы `sasl_kerberos_service_name`, `sasl_kerberos_keytab` и `sasl_kerberos_principal`. Пример: diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 1fa550a4ec1..1130447048b 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -5,11 +5,17 @@ #include #include +using namespace DB; + +std::mutex KerberosInit::kinit_mtx; + int KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) { - auto adqm_log = &Poco::Logger::get("ADQM"); - LOG_DEBUG(adqm_log,"KerberosInit: begin"); - //LOG_DEBUG(adqm_log,"KerberosInit: do nothing"); return 0; + // Using mutex to prevent cache file corruptions + std::unique_lock lck(kinit_mtx); + + auto log = &Poco::Logger::get("ADQM"); + LOG_DEBUG(log,"Trying to authenticate to Kerberos v5"); krb5_error_code ret; @@ -17,182 +23,152 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con int flags = 0; if (!std::filesystem::exists(keytab_file)) - throw DB::Exception(0, "Error keytab file does not exist"); + throw Exception("Error keytab file does not exist", ErrorCodes::KERBEROS_ERROR); - memset(&k5d, 0, sizeof(k5d)); - k5 = &k5d; - // begin - ret = krb5_init_context(&k5->ctx); + memset(&k5, 0, sizeof(k5)); + ret = krb5_init_context(&k5.ctx); if (ret) - throw DB::Exception(0, "Error while initializing Kerberos 5 library"); + throw Exception("Error while initializing Kerberos 5 library", ErrorCodes::KERBEROS_ERROR); if (!cache_name.empty()) { - ret = krb5_cc_resolve(k5->ctx, cache_name.c_str(), &k5->out_cc); - // todo: analyze return code - LOG_DEBUG(adqm_log,"Resolved cache"); + ret = krb5_cc_resolve(k5.ctx, cache_name.c_str(), &k5.out_cc); + if (ret) + throw Exception("Error in resolving cache", ErrorCodes::KERBEROS_ERROR); + LOG_DEBUG(log,"Resolved cache"); } else { // Resolve the default ccache and get its type and default principal (if it is initialized). - ret = krb5_cc_default(k5->ctx, &defcache); + ret = krb5_cc_default(k5.ctx, &defcache); if (ret) - throw DB::Exception(0, "Error while getting default ccache"); - LOG_DEBUG(adqm_log,"Resolved default cache"); - deftype = krb5_cc_get_type(k5->ctx, defcache); - if (krb5_cc_get_principal(k5->ctx, defcache, &defcache_princ) != 0) + throw Exception("Error while getting default ccache", ErrorCodes::KERBEROS_ERROR); + LOG_DEBUG(log,"Resolved default cache"); + deftype = krb5_cc_get_type(k5.ctx, defcache); + if (krb5_cc_get_principal(k5.ctx, defcache, &defcache_princ) != 0) defcache_princ = nullptr; } // Use the specified principal name. - ret = krb5_parse_name_flags(k5->ctx, principal.c_str(), flags, &k5->me); + ret = krb5_parse_name_flags(k5.ctx, principal.c_str(), flags, &k5.me); if (ret) - throw DB::Exception(0, "Error when parsing principal name " + principal); + throw Exception("Error when parsing principal name " + principal, ErrorCodes::KERBEROS_ERROR); // Cache related commands - if (k5->out_cc == nullptr && krb5_cc_support_switch(k5->ctx, deftype)) + if (k5.out_cc == nullptr && krb5_cc_support_switch(k5.ctx, deftype)) { // Use an existing cache for the client principal if we can. 
- ret = krb5_cc_cache_match(k5->ctx, k5->me, &k5->out_cc); + ret = krb5_cc_cache_match(k5.ctx, k5.me, &k5.out_cc); if (ret && ret != KRB5_CC_NOTFOUND) - throw DB::Exception(0, "Error while searching for ccache for " + principal); + throw Exception("Error while searching for cache for " + principal, ErrorCodes::KERBEROS_ERROR); if (!ret) { - LOG_DEBUG(adqm_log,"Using default cache: {}", krb5_cc_get_name(k5->ctx, k5->out_cc)); - k5->switch_to_cache = 1; + LOG_DEBUG(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); + k5.switch_to_cache = 1; } else if (defcache_princ != nullptr) { // Create a new cache to avoid overwriting the initialized default cache. - ret = krb5_cc_new_unique(k5->ctx, deftype, nullptr, &k5->out_cc); + ret = krb5_cc_new_unique(k5.ctx, deftype, nullptr, &k5.out_cc); if (ret) - throw DB::Exception(0, "Error while generating new ccache"); - LOG_DEBUG(adqm_log,"Using default cache: {}", krb5_cc_get_name(k5->ctx, k5->out_cc)); - k5->switch_to_cache = 1; + throw Exception("Error while generating new cache", ErrorCodes::KERBEROS_ERROR); + LOG_DEBUG(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); + k5.switch_to_cache = 1; } } // Use the default cache if we haven't picked one yet. - if (k5->out_cc == nullptr) + if (k5.out_cc == nullptr) { - k5->out_cc = defcache; + k5.out_cc = defcache; defcache = nullptr; - LOG_DEBUG(adqm_log,"Using default cache: {}", krb5_cc_get_name(k5->ctx, k5->out_cc)); + LOG_DEBUG(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); } - ret = krb5_unparse_name(k5->ctx, k5->me, &k5->name); + ret = krb5_unparse_name(k5.ctx, k5.me, &k5.name); if (ret) - throw DB::Exception(0, "Error when unparsing name"); + throw Exception("Error when unparsing name", ErrorCodes::KERBEROS_ERROR); + LOG_DEBUG(log,"Using principal: {}", k5.name); - LOG_DEBUG(adqm_log,"KerberosInit: Using principal: {}", k5->name); - - // init: memset(&my_creds, 0, sizeof(my_creds)); - - ret = krb5_get_init_creds_opt_alloc(k5->ctx, &options); + ret = krb5_get_init_creds_opt_alloc(k5.ctx, &options); if (ret) - throw DB::Exception(0, "Error in options allocation"); + throw Exception("Error in options allocation", ErrorCodes::KERBEROS_ERROR); - // todo -/* -#ifndef _WIN32 - if (strncmp(opts->keytab_name, "KDB:", 4) == 0) { - ret = kinit_kdb_init(&k5->ctx, k5->me->realm.data); - errctx = k5->ctx; - if (ret) { - com_err(progname, ret, - _("while setting up KDB keytab for realm %s"), - k5->me->realm.data); - goto cleanup; - } - } -#endif -*/ // Resolve keytab - ret = krb5_kt_resolve(k5->ctx, keytab_file.c_str(), &keytab); + ret = krb5_kt_resolve(k5.ctx, keytab_file.c_str(), &keytab); if (ret) - throw DB::Exception(0, "Error resolving keytab "+keytab_file); + throw Exception("Error in resolving keytab "+keytab_file, ErrorCodes::KERBEROS_ERROR); + LOG_DEBUG(log,"Using keytab: {}", keytab_file); - LOG_DEBUG(adqm_log,"KerberosInit: Using keytab: {}", keytab_file); - - if (k5->in_cc) + if (k5.in_cc) { - ret = krb5_get_init_creds_opt_set_in_ccache(k5->ctx, options, k5->in_cc); + ret = krb5_get_init_creds_opt_set_in_ccache(k5.ctx, options, k5.in_cc); if (ret) - throw DB::Exception(0, "Error in setting input credential cache"); + throw Exception("Error in setting input credential cache", ErrorCodes::KERBEROS_ERROR); } - ret = krb5_get_init_creds_opt_set_out_ccache(k5->ctx, options, k5->out_cc); + ret = krb5_get_init_creds_opt_set_out_ccache(k5.ctx, options, k5.out_cc); if (ret) - throw DB::Exception(0, "Error in setting output credential cache"); + throw 
Exception("Error in setting output credential cache", ErrorCodes::KERBEROS_ERROR); - // action: init or renew - LOG_DEBUG(adqm_log,"Trying to renew credentials"); - ret = krb5_get_renewed_creds(k5->ctx, &my_creds, k5->me, k5->out_cc, nullptr); + // Action: init or renew + LOG_DEBUG(log,"Trying to renew credentials"); + ret = krb5_get_renewed_creds(k5.ctx, &my_creds, k5.me, k5.out_cc, nullptr); if (ret) { - LOG_DEBUG(adqm_log,"Renew failed, making init credentials"); - ret = krb5_get_init_creds_keytab(k5->ctx, &my_creds, k5->me, keytab, 0, nullptr, options); + LOG_DEBUG(log,"Renew failed, trying to get initial credentials"); + ret = krb5_get_init_creds_keytab(k5.ctx, &my_creds, k5.me, keytab, 0, nullptr, options); if (ret) - throw DB::Exception(0, "Error in init"); + throw Exception("Error in getting initial credentials", ErrorCodes::KERBEROS_ERROR); else - LOG_DEBUG(adqm_log,"Getting initial credentials"); + LOG_DEBUG(log,"Got initial credentials"); } else { - LOG_DEBUG(adqm_log,"Successfull reviewal"); - ret = krb5_cc_initialize(k5->ctx, k5->out_cc, k5->me); + LOG_DEBUG(log,"Successfull renewal"); + ret = krb5_cc_initialize(k5.ctx, k5.out_cc, k5.me); if (ret) - throw DB::Exception(0, "Error when initializing cache"); - LOG_DEBUG(adqm_log,"Initialized cache"); - ret = krb5_cc_store_cred(k5->ctx, k5->out_cc, &my_creds); + throw Exception("Error when initializing cache", ErrorCodes::KERBEROS_ERROR); + LOG_DEBUG(log,"Initialized cache"); + ret = krb5_cc_store_cred(k5.ctx, k5.out_cc, &my_creds); if (ret) - LOG_DEBUG(adqm_log,"Error while storing credentials"); - LOG_DEBUG(adqm_log,"Stored credentials"); + LOG_DEBUG(log,"Error while storing credentials"); + LOG_DEBUG(log,"Stored credentials"); } - if (k5->switch_to_cache) { - ret = krb5_cc_switch(k5->ctx, k5->out_cc); + if (k5.switch_to_cache) { + ret = krb5_cc_switch(k5.ctx, k5.out_cc); if (ret) - throw DB::Exception(0, "Error while switching to new ccache"); + throw Exception("Error while switching to new ccache", ErrorCodes::KERBEROS_ERROR); } - LOG_DEBUG(adqm_log,"Authenticated to Kerberos v5"); - LOG_DEBUG(adqm_log,"KerberosInit: end"); + LOG_DEBUG(log,"Authenticated to Kerberos v5"); return 0; } KerberosInit::~KerberosInit() { - if (k5->ctx) + std::unique_lock lck(kinit_mtx); + if (k5.ctx) { - //begin. cleanup: if (defcache) - krb5_cc_close(k5->ctx, defcache); - krb5_free_principal(k5->ctx, defcache_princ); + krb5_cc_close(k5.ctx, defcache); + krb5_free_principal(k5.ctx, defcache_princ); - // init. cleanup: - //todo: - /* - #ifndef _WIN32 - kinit_kdb_fini(); - #endif - */ if (options) - krb5_get_init_creds_opt_free(k5->ctx, options); - if (my_creds.client == k5->me) + krb5_get_init_creds_opt_free(k5.ctx, options); + if (my_creds.client == k5.me) my_creds.client = nullptr; - krb5_free_cred_contents(k5->ctx, &my_creds); + krb5_free_cred_contents(k5.ctx, &my_creds); if (keytab) - krb5_kt_close(k5->ctx, keytab); + krb5_kt_close(k5.ctx, keytab); - // end. 
cleanup: - krb5_free_unparsed_name(k5->ctx, k5->name); - krb5_free_principal(k5->ctx, k5->me); - if (k5->in_cc != nullptr) - krb5_cc_close(k5->ctx, k5->in_cc); - if (k5->out_cc != nullptr) - krb5_cc_close(k5->ctx, k5->out_cc); - krb5_free_context(k5->ctx); + krb5_free_unparsed_name(k5.ctx, k5.name); + krb5_free_principal(k5.ctx, k5.me); + if (k5.in_cc != nullptr) + krb5_cc_close(k5.ctx, k5.in_cc); + if (k5.out_cc != nullptr) + krb5_cc_close(k5.ctx, k5.out_cc); + krb5_free_context(k5.ctx); } - memset(k5, 0, sizeof(*k5)); } diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index 07f619aaa0b..48401cbac1b 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -4,10 +4,16 @@ #include -//#include -//#include "k5-platform.h" #include -//#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int KERBEROS_ERROR; +} +} struct k5_data { @@ -24,13 +30,14 @@ public: int init(const String & keytab_file, const String & principal, const String & cache_name = ""); ~KerberosInit(); private: - struct k5_data * k5 = nullptr; - //struct k5_data k5; - struct k5_data k5d; + //struct k5_data * k5 = nullptr; + struct k5_data k5; + //struct k5_data k5d; krb5_ccache defcache = nullptr; krb5_get_init_creds_opt *options = nullptr; krb5_creds my_creds; krb5_keytab keytab = nullptr; krb5_principal defcache_princ = nullptr; + static std::mutex kinit_mtx; }; diff --git a/src/Access/examples/kerberos_init.cpp b/src/Access/examples/kerberos_init.cpp index 554d57fb8b2..49923e81fd7 100644 --- a/src/Access/examples/kerberos_init.cpp +++ b/src/Access/examples/kerberos_init.cpp @@ -13,6 +13,7 @@ int main(int argc, char ** argv) if (argc < 3) { + std::cout << "kerberos_init obtains and caches an initial ticket-granting ticket for principal." 
<< "\n\n"; std::cout << "Usage:" << "\n" << " kerberos_init keytab principal [cache]" << "\n"; return 0; } @@ -31,6 +32,7 @@ int main(int argc, char ** argv) k_init.init(argv[1], argv[2], cache_name); } catch (const Exception & e) { std::cout << "KerberosInit failure: " << getExceptionMessage(e, false) << "\n"; + return -1; } std::cout << "Done" << "\n"; return 0; diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index a3554bafeff..e3592a7e53b 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -52,12 +52,6 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration hdfsBuilderSetPrincipal(hdfs_builder, hadoop_kerberos_principal.c_str()); continue; } - else if (key == "hadoop_kerberos_kinit_command") - { - need_kinit = true; - hadoop_kerberos_kinit_command = config.getString(key_path); - continue; - } else if (key == "hadoop_security_kerberos_ticket_cache_path") { if (isUser) @@ -80,13 +74,12 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration void HDFSBuilderWrapper::runKinit() { LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: running KerberosInit"); - std::unique_lock lck(kinit_mtx); KerberosInit k_init; try { k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal,hadoop_security_kerberos_ticket_cache_path); } catch (const DB::Exception & e) { - throw Exception("KerberosInit failure: "+ DB::getExceptionMessage(e, false), ErrorCodes::BAD_ARGUMENTS); + throw Exception("KerberosInit failure: "+ getExceptionMessage(e, false), ErrorCodes::KERBEROS_ERROR); } LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: finished KerberosInit"); } @@ -168,8 +161,6 @@ HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::A return builder; } -std::mutex HDFSBuilderWrapper::kinit_mtx; - HDFSFSPtr createHDFSFS(hdfsBuilder * builder) { HDFSFSPtr fs(hdfsBuilderConnect(builder)); diff --git a/src/Storages/HDFS/HDFSCommon.h b/src/Storages/HDFS/HDFSCommon.h index 773661b0745..cacb986a091 100644 --- a/src/Storages/HDFS/HDFSCommon.h +++ b/src/Storages/HDFS/HDFSCommon.h @@ -9,7 +9,6 @@ #include #include -#include #include #include @@ -80,10 +79,8 @@ private: hdfsBuilder * hdfs_builder; String hadoop_kerberos_keytab; String hadoop_kerberos_principal; - String hadoop_kerberos_kinit_command = "kinit"; String hadoop_security_kerberos_ticket_cache_path; - static std::mutex kinit_mtx; std::vector> config_stor; bool need_kinit{false}; }; diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index f6850f02511..d8018aebbcc 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -528,8 +528,8 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) try { k_init.init(keytab,principal); - } catch (const DB::Exception & e) { - LOG_ERROR(log, "ADQM: KerberosInit failure: {}", DB::getExceptionMessage(e, false)); + } catch (const Exception & e) { + LOG_ERROR(log, "ADQM: KerberosInit failure: {}", getExceptionMessage(e, false)); } LOG_DEBUG(log, "ADQM: finished KerberosInit"); conf.set("sasl.kerberos.kinit.cmd",""); From d1d6d874322a012fc5f16975d668dfd0dd554cad Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 9 Jun 2022 11:51:15 +0300 Subject: [PATCH 08/60] Cleanup code in KerberosInit --- src/Access/KerberosInit.cpp | 10 +++++----- src/Storages/HDFS/HDFSCommon.cpp | 4 ++-- src/Storages/Kafka/StorageKafka.cpp | 8 +++----- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git 
a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 1130447048b..549520e63ea 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -14,7 +14,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Using mutex to prevent cache file corruptions std::unique_lock lck(kinit_mtx); - auto log = &Poco::Logger::get("ADQM"); + auto log = &Poco::Logger::get("KerberosInit"); LOG_DEBUG(log,"Trying to authenticate to Kerberos v5"); krb5_error_code ret; @@ -39,10 +39,10 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con } else { - // Resolve the default ccache and get its type and default principal (if it is initialized). + // Resolve the default cache and get its type and default principal (if it is initialized). ret = krb5_cc_default(k5.ctx, &defcache); if (ret) - throw Exception("Error while getting default ccache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error while getting default cache", ErrorCodes::KERBEROS_ERROR); LOG_DEBUG(log,"Resolved default cache"); deftype = krb5_cc_get_type(k5.ctx, defcache); if (krb5_cc_get_principal(k5.ctx, defcache, &defcache_princ) != 0) @@ -116,7 +116,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con ret = krb5_get_renewed_creds(k5.ctx, &my_creds, k5.me, k5.out_cc, nullptr); if (ret) { - LOG_DEBUG(log,"Renew failed, trying to get initial credentials"); + LOG_DEBUG(log,"Renew failed ({}). Trying to get initial credentials", ret); ret = krb5_get_init_creds_keytab(k5.ctx, &my_creds, k5.me, keytab, 0, nullptr, options); if (ret) throw Exception("Error in getting initial credentials", ErrorCodes::KERBEROS_ERROR); @@ -139,7 +139,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con if (k5.switch_to_cache) { ret = krb5_cc_switch(k5.ctx, k5.out_cc); if (ret) - throw Exception("Error while switching to new ccache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error while switching to new cache", ErrorCodes::KERBEROS_ERROR); } LOG_DEBUG(log,"Authenticated to Kerberos v5"); diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index e3592a7e53b..160c2279d17 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -73,7 +73,7 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration void HDFSBuilderWrapper::runKinit() { - LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: running KerberosInit"); + LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Running KerberosInit"); KerberosInit k_init; try { @@ -81,7 +81,7 @@ void HDFSBuilderWrapper::runKinit() } catch (const DB::Exception & e) { throw Exception("KerberosInit failure: "+ getExceptionMessage(e, false), ErrorCodes::KERBEROS_ERROR); } - LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "ADQM: finished KerberosInit"); + LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Finished KerberosInit"); } HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::AbstractConfiguration & config) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index d8018aebbcc..b473b9fee56 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -519,19 +519,17 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) if (conf.has_property("sasl.kerberos.keytab") && conf.has_property("sasl.kerberos.principal")) { - LOG_DEBUG(log, "ADQM: preparing KerberosInit"); String keytab = 
conf.get("sasl.kerberos.keytab"); String principal = conf.get("sasl.kerberos.principal"); - LOG_DEBUG(log, "ADQM: keytab: {}, principal: {}", keytab, principal); - LOG_DEBUG(log, "ADQM: running KerberosInit"); + LOG_DEBUG(log, "Running KerberosInit"); KerberosInit k_init; try { k_init.init(keytab,principal); } catch (const Exception & e) { - LOG_ERROR(log, "ADQM: KerberosInit failure: {}", getExceptionMessage(e, false)); + LOG_ERROR(log, "KerberosInit failure: {}", getExceptionMessage(e, false)); } - LOG_DEBUG(log, "ADQM: finished KerberosInit"); + LOG_DEBUG(log, "Finished KerberosInit"); conf.set("sasl.kerberos.kinit.cmd",""); conf.set("sasl.kerberos.min.time.before.relogin","0"); } From 4c560584c700cdffbd66beab89a7e06e7546f646 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 10 Jun 2022 12:38:39 +0300 Subject: [PATCH 09/60] Code cleanup in KerberosInit and kafka integration tests --- src/Access/KerberosInit.h | 6 ++---- tests/integration/test_storage_kerberized_kafka/test.py | 5 +++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index 48401cbac1b..f5269793c97 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -20,7 +20,7 @@ struct k5_data krb5_context ctx; krb5_ccache in_cc, out_cc; krb5_principal me; - char *name; + char * name; krb5_boolean switch_to_cache; }; @@ -30,11 +30,9 @@ public: int init(const String & keytab_file, const String & principal, const String & cache_name = ""); ~KerberosInit(); private: - //struct k5_data * k5 = nullptr; struct k5_data k5; - //struct k5_data k5d; krb5_ccache defcache = nullptr; - krb5_get_init_creds_opt *options = nullptr; + krb5_get_init_creds_opt * options = nullptr; krb5_creds my_creds; krb5_keytab keytab = nullptr; krb5_principal defcache_princ = nullptr; diff --git a/tests/integration/test_storage_kerberized_kafka/test.py b/tests/integration/test_storage_kerberized_kafka/test.py index 091307a2b01..7856361deda 100644 --- a/tests/integration/test_storage_kerberized_kafka/test.py +++ b/tests/integration/test_storage_kerberized_kafka/test.py @@ -127,6 +127,7 @@ def test_kafka_json_as_string(kafka_cluster): "Parsing of message (topic: kafka_json_as_string, partition: 0, offset: 1) return no rows" ) + def test_kafka_json_as_string_request_new_ticket_after_expiration(kafka_cluster): # Ticket should be expired after the wait time # On run of SELECT query new ticket should be requested and SELECT query should run fine. 
@@ -155,7 +156,7 @@ def test_kafka_json_as_string_request_new_ticket_after_expiration(kafka_cluster) """ ) - time.sleep(45) # wait for ticket expiration + time.sleep(45) # wait for ticket expiration result = instance.query("SELECT * FROM test.kafka;") expected = """\ @@ -168,6 +169,7 @@ def test_kafka_json_as_string_request_new_ticket_after_expiration(kafka_cluster) "Parsing of message (topic: kafka_json_as_string, partition: 0, offset: 1) return no rows" ) + def test_kafka_json_as_string_no_kdc(kafka_cluster): # When the test is run alone (not preceded by any other kerberized kafka test), # we need a ticket to @@ -222,7 +224,6 @@ def test_kafka_json_as_string_no_kdc(kafka_cluster): assert TSV(result) == TSV(expected) assert instance.contains_in_log("StorageKafka (kafka_no_kdc): Nothing to commit") assert instance.contains_in_log("Ticket expired") - #~ assert instance.contains_in_log("Kerberos ticket refresh failed") assert instance.contains_in_log("KerberosInit failure:") From 9bf6b9d491fa5e8bba23a61fcca0a004b6c66451 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 15 Jun 2022 11:37:02 +0300 Subject: [PATCH 10/60] Add kinit presence handling in StorageKafka; Cleanup code in HDFSCommon --- src/Storages/HDFS/HDFSCommon.cpp | 4 +++- src/Storages/Kafka/StorageKafka.cpp | 12 +++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 160c2279d17..3ade1b1cfdf 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -78,7 +78,9 @@ void HDFSBuilderWrapper::runKinit() try { k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal,hadoop_security_kerberos_ticket_cache_path); - } catch (const DB::Exception & e) { + } + catch (const DB::Exception & e) + { throw Exception("KerberosInit failure: "+ getExceptionMessage(e, false), ErrorCodes::KERBEROS_ERROR); } LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Finished KerberosInit"); diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index b473b9fee56..39b8adfbec4 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -517,6 +517,12 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) if (config.has(config_prefix)) loadFromConfig(conf, config, config_prefix); + if (conf.has_property("sasl.kerberos.kinit.cmd")) + LOG_WARNING(log, "kinit executable is not allowed."); + + conf.set("sasl.kerberos.kinit.cmd",""); + conf.set("sasl.kerberos.min.time.before.relogin","0"); + if (conf.has_property("sasl.kerberos.keytab") && conf.has_property("sasl.kerberos.principal")) { String keytab = conf.get("sasl.kerberos.keytab"); @@ -526,12 +532,12 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) try { k_init.init(keytab,principal); - } catch (const Exception & e) { + } + catch (const Exception & e) + { LOG_ERROR(log, "KerberosInit failure: {}", getExceptionMessage(e, false)); } LOG_DEBUG(log, "Finished KerberosInit"); - conf.set("sasl.kerberos.kinit.cmd",""); - conf.set("sasl.kerberos.min.time.before.relogin","0"); } // Update consumer topic-specific configuration From dd5b0ee0656c6b10f25920f8557c2ef9f2b08260 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 15 Jun 2022 17:02:53 +0300 Subject: [PATCH 11/60] Add kerberosInit() function to call KeberosInit --- src/Access/KerberosInit.cpp | 36 +++++++++++++++++++++++---- src/Access/KerberosInit.h | 25 +------------------ src/Access/examples/kerberos_init.cpp | 3 +-- 
src/Storages/HDFS/HDFSCommon.cpp | 3 +-- src/Storages/Kafka/StorageKafka.cpp | 3 +-- 5 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 549520e63ea..95633dea077 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -7,13 +7,31 @@ using namespace DB; -std::mutex KerberosInit::kinit_mtx; +struct k5_data +{ + krb5_context ctx; + krb5_ccache in_cc, out_cc; + krb5_principal me; + char * name; + krb5_boolean switch_to_cache; +}; + +class KerberosInit +{ +public: + int init(const String & keytab_file, const String & principal, const String & cache_name = ""); + ~KerberosInit(); +private: + struct k5_data k5; + krb5_ccache defcache = nullptr; + krb5_get_init_creds_opt * options = nullptr; + krb5_creds my_creds; + krb5_keytab keytab = nullptr; + krb5_principal defcache_princ = nullptr; +}; int KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) { - // Using mutex to prevent cache file corruptions - std::unique_lock lck(kinit_mtx); - auto log = &Poco::Logger::get("KerberosInit"); LOG_DEBUG(log,"Trying to authenticate to Kerberos v5"); @@ -148,7 +166,6 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con KerberosInit::~KerberosInit() { - std::unique_lock lck(kinit_mtx); if (k5.ctx) { if (defcache) @@ -172,3 +189,12 @@ KerberosInit::~KerberosInit() krb5_free_context(k5.ctx); } } + +int kerberosInit(const String & keytab_file, const String & principal, const String & cache_name) +{ + // Using mutex to prevent cache file corruptions + static std::mutex kinit_mtx; + std::unique_lock lck(kinit_mtx); + KerberosInit k_init; + return k_init.init(keytab_file, principal, cache_name); +} diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index f5269793c97..94910661056 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -15,27 +15,4 @@ namespace ErrorCodes } } -struct k5_data -{ - krb5_context ctx; - krb5_ccache in_cc, out_cc; - krb5_principal me; - char * name; - krb5_boolean switch_to_cache; -}; - -class KerberosInit -{ -public: - int init(const String & keytab_file, const String & principal, const String & cache_name = ""); - ~KerberosInit(); -private: - struct k5_data k5; - krb5_ccache defcache = nullptr; - krb5_get_init_creds_opt * options = nullptr; - krb5_creds my_creds; - krb5_keytab keytab = nullptr; - krb5_principal defcache_princ = nullptr; - static std::mutex kinit_mtx; -}; - +int kerberosInit(const String & keytab_file, const String & principal, const String & cache_name = ""); diff --git a/src/Access/examples/kerberos_init.cpp b/src/Access/examples/kerberos_init.cpp index 49923e81fd7..9a42a9e00e5 100644 --- a/src/Access/examples/kerberos_init.cpp +++ b/src/Access/examples/kerberos_init.cpp @@ -26,10 +26,9 @@ int main(int argc, char ** argv) Poco::Logger::root().setChannel(app_channel); Poco::Logger::root().setLevel("trace"); - KerberosInit k_init; try { - k_init.init(argv[1], argv[2], cache_name); + kerberosInit(argv[1], argv[2], cache_name); } catch (const Exception & e) { std::cout << "KerberosInit failure: " << getExceptionMessage(e, false) << "\n"; return -1; diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 3ade1b1cfdf..9d3cb2353b6 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -74,10 +74,9 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration void 
HDFSBuilderWrapper::runKinit() { LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Running KerberosInit"); - KerberosInit k_init; try { - k_init.init(hadoop_kerberos_keytab,hadoop_kerberos_principal,hadoop_security_kerberos_ticket_cache_path); + kerberosInit(hadoop_kerberos_keytab,hadoop_kerberos_principal,hadoop_security_kerberos_ticket_cache_path); } catch (const DB::Exception & e) { diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 39b8adfbec4..789df36fcf0 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -528,10 +528,9 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) String keytab = conf.get("sasl.kerberos.keytab"); String principal = conf.get("sasl.kerberos.principal"); LOG_DEBUG(log, "Running KerberosInit"); - KerberosInit k_init; try { - k_init.init(keytab,principal); + kerberosInit(keytab,principal); } catch (const Exception & e) { From 1c26424371adc53f8eefd8d0e3cc604954be089e Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 15 Jun 2022 19:35:21 +0300 Subject: [PATCH 12/60] Change message in StorageKafka; Code style correction --- src/Access/KerberosInit.cpp | 3 ++- src/Storages/Kafka/StorageKafka.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 95633dea077..6c4927dd4fa 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -154,7 +154,8 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con LOG_DEBUG(log,"Stored credentials"); } - if (k5.switch_to_cache) { + if (k5.switch_to_cache) + { ret = krb5_cc_switch(k5.ctx, k5.out_cc); if (ret) throw Exception("Error while switching to new cache", ErrorCodes::KERBEROS_ERROR); diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 789df36fcf0..b4072cd6c0e 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -518,7 +518,7 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) loadFromConfig(conf, config, config_prefix); if (conf.has_property("sasl.kerberos.kinit.cmd")) - LOG_WARNING(log, "kinit executable is not allowed."); + LOG_WARNING(log, "sasl.kerberos.kinit.cmd configuration parameter is ignored."); conf.set("sasl.kerberos.kinit.cmd",""); conf.set("sasl.kerberos.min.time.before.relogin","0"); From 89a659e738a6a7236300f918b508ad0273745304 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 15 Jun 2022 20:08:16 +0300 Subject: [PATCH 13/60] Move krb header files from KerberosInit.h to KerberosInit.cpp --- src/Access/KerberosInit.cpp | 2 ++ src/Access/KerberosInit.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 6c4927dd4fa..4fcb280f033 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include using namespace DB; diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index 94910661056..60f959ab4ba 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -4,9 +4,6 @@ #include -#include -#include - namespace DB { namespace ErrorCodes From 344fbe8de4038ec6f93d67eb3455eda1ad82ce87 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 15 Jun 2022 20:26:42 +0300 Subject: [PATCH 14/60] Fix code style --- src/Access/KerberosInit.cpp | 10 +++++++++- src/Access/KerberosInit.h | 8 -------- 
src/Access/examples/kerberos_init.cpp | 6 ++++-- src/Storages/HDFS/HDFSCommon.cpp | 1 + 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 4fcb280f033..304bb69d424 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -9,6 +9,14 @@ using namespace DB; +namespace DB +{ +namespace ErrorCodes +{ + extern const int KERBEROS_ERROR; +} +} + struct k5_data { krb5_context ctx; @@ -145,7 +153,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con } else { - LOG_DEBUG(log,"Successfull renewal"); + LOG_DEBUG(log,"Successful renewal"); ret = krb5_cc_initialize(k5.ctx, k5.out_cc, k5.me); if (ret) throw Exception("Error when initializing cache", ErrorCodes::KERBEROS_ERROR); diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index 60f959ab4ba..cfbbfc2e7d8 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -4,12 +4,4 @@ #include -namespace DB -{ -namespace ErrorCodes -{ - extern const int KERBEROS_ERROR; -} -} - int kerberosInit(const String & keytab_file, const String & principal, const String & cache_name = ""); diff --git a/src/Access/examples/kerberos_init.cpp b/src/Access/examples/kerberos_init.cpp index 9a42a9e00e5..4868182fba2 100644 --- a/src/Access/examples/kerberos_init.cpp +++ b/src/Access/examples/kerberos_init.cpp @@ -29,8 +29,10 @@ int main(int argc, char ** argv) try { kerberosInit(argv[1], argv[2], cache_name); - } catch (const Exception & e) { - std::cout << "KerberosInit failure: " << getExceptionMessage(e, false) << "\n"; + } + catch (const Exception & e) + { + std::cout << "KerberosInit failure: " << getExceptionMessage(e, false) << "\n"; return -1; } std::cout << "Done" << "\n"; diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 9d3cb2353b6..8134aaf4981 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -21,6 +21,7 @@ namespace ErrorCodes extern const int NETWORK_ERROR; extern const int EXCESSIVE_ELEMENT_IN_CONFIG; extern const int NO_ELEMENTS_IN_CONFIG; + extern const int KERBEROS_ERROR; } const String HDFSBuilderWrapper::CONFIG_PREFIX = "hdfs"; From d93fd3bd2dcdfec6d1eba790508e393d91858999 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 16 Jun 2022 09:30:40 +0000 Subject: [PATCH 15/60] Add complilation support for case when krb5 is not used --- src/Access/KerberosInit.cpp | 2 ++ src/Access/KerberosInit.h | 4 ++++ src/Storages/HDFS/HDFSCommon.cpp | 5 ++++- src/Storages/Kafka/StorageKafka.cpp | 6 ++++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 304bb69d424..8d4e2339bbf 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -4,6 +4,7 @@ #include #include #include +#if USE_KRB5 #include #include @@ -209,3 +210,4 @@ int kerberosInit(const String & keytab_file, const String & principal, const Str KerberosInit k_init; return k_init.init(keytab_file, principal, cache_name); } +#endif // USE_KRB5 diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index cfbbfc2e7d8..4731baf706f 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -4,4 +4,8 @@ #include +#if USE_KRB5 + int kerberosInit(const String & keytab_file, const String & principal, const String & cache_name = ""); + +#endif // USE_KRB5 diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 8134aaf4981..bee8ac21acb 
100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -10,8 +10,9 @@ #include #include #include +#if USE_KRB5 #include - +#endif // USE_KRB5 namespace DB { @@ -74,6 +75,7 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration void HDFSBuilderWrapper::runKinit() { + #if USE_KRB5 LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Running KerberosInit"); try { @@ -84,6 +86,7 @@ void HDFSBuilderWrapper::runKinit() throw Exception("KerberosInit failure: "+ getExceptionMessage(e, false), ErrorCodes::KERBEROS_ERROR); } LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Finished KerberosInit"); + #endif // USE_KRB5 } HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::AbstractConfiguration & config) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index b4072cd6c0e..a24630fa3b1 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -43,9 +43,9 @@ #include #include - +#if USE_KRB5 #include - +#endif // USE_KRB5 namespace CurrentMetrics { @@ -517,6 +517,7 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) if (config.has(config_prefix)) loadFromConfig(conf, config, config_prefix); + #if USE_KRB5 if (conf.has_property("sasl.kerberos.kinit.cmd")) LOG_WARNING(log, "sasl.kerberos.kinit.cmd configuration parameter is ignored."); @@ -538,6 +539,7 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) } LOG_DEBUG(log, "Finished KerberosInit"); } + #endif // USE_KRB5 // Update consumer topic-specific configuration for (const auto & topic : topics) From 6e28275569d9ede757ab490f110e578524775458 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 16 Jun 2022 14:21:04 +0300 Subject: [PATCH 16/60] Add warnings about using krb5 parameters --- src/Storages/HDFS/HDFSCommon.cpp | 19 ++++++++++++++++--- src/Storages/HDFS/HDFSCommon.h | 9 +++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index bee8ac21acb..87b2d02ada0 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -20,9 +20,10 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int NETWORK_ERROR; + #if USE_KRB5 extern const int EXCESSIVE_ELEMENT_IN_CONFIG; - extern const int NO_ELEMENTS_IN_CONFIG; extern const int KERBEROS_ERROR; + #endif // USE_KRB5 } const String HDFSBuilderWrapper::CONFIG_PREFIX = "hdfs"; @@ -43,19 +44,28 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration String key_name; if (key == "hadoop_kerberos_keytab") { + #if USE_KRB5 need_kinit = true; hadoop_kerberos_keytab = config.getString(key_path); + #else // USE_KRB5 + LOG_WARNING(&Poco::Logger::get("HDFSClient"), "hadoop_kerberos_keytab parameter is ignored because ClickHouse was built without support of krb5 library."); + #endif // USE_KRB5 continue; } else if (key == "hadoop_kerberos_principal") { + #if USE_KRB5 need_kinit = true; hadoop_kerberos_principal = config.getString(key_path); hdfsBuilderSetPrincipal(hdfs_builder, hadoop_kerberos_principal.c_str()); + #else // USE_KRB5 + LOG_WARNING(&Poco::Logger::get("HDFSClient"), "hadoop_kerberos_principal parameter is ignored because ClickHouse was built without support of krb5 library."); + #endif // USE_KRB5 continue; } else if (key == "hadoop_security_kerberos_ticket_cache_path") { + #if USE_KRB5 if (isUser) { throw Exception("hadoop.security.kerberos.ticket.cache.path cannot 
be set per user", @@ -64,6 +74,9 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration hadoop_security_kerberos_ticket_cache_path = config.getString(key_path); // standard param - pass further + #else // USE_KRB5 + LOG_WARNING(&Poco::Logger::get("HDFSClient"), "hadoop.security.kerberos.ticket.cache.path parameter is ignored because ClickHouse was built without support of krb5 library."); + #endif // USE_KRB5 } key_name = boost::replace_all_copy(key, "_", "."); @@ -73,9 +86,9 @@ void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration } } +#if USE_KRB5 void HDFSBuilderWrapper::runKinit() { - #if USE_KRB5 LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Running KerberosInit"); try { @@ -86,8 +99,8 @@ void HDFSBuilderWrapper::runKinit() throw Exception("KerberosInit failure: "+ getExceptionMessage(e, false), ErrorCodes::KERBEROS_ERROR); } LOG_DEBUG(&Poco::Logger::get("HDFSClient"), "Finished KerberosInit"); - #endif // USE_KRB5 } +#endif // USE_KRB5 HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::AbstractConfiguration & config) { diff --git a/src/Storages/HDFS/HDFSCommon.h b/src/Storages/HDFS/HDFSCommon.h index cacb986a091..9eb2dfd3e46 100644 --- a/src/Storages/HDFS/HDFSCommon.h +++ b/src/Storages/HDFS/HDFSCommon.h @@ -68,8 +68,6 @@ public: private: void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const String & prefix, bool isUser = false); - void runKinit(); - // hdfs builder relies on an external config data storage std::pair& keep(const String & k, const String & v) { @@ -77,12 +75,15 @@ private: } hdfsBuilder * hdfs_builder; + std::vector> config_stor; + + #if USE_KRB5 + void runKinit(); String hadoop_kerberos_keytab; String hadoop_kerberos_principal; String hadoop_security_kerberos_ticket_cache_path; - - std::vector> config_stor; bool need_kinit{false}; + #endif // USE_KRB5 }; using HDFSFSPtr = std::unique_ptr, detail::HDFSFsDeleter>; From 0ab6bfd5d805b358cb61ae053909527a9253e47b Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 16 Jun 2022 14:25:37 +0300 Subject: [PATCH 17/60] Add warning about using krb5 parameter in StorageKafka.cpp --- src/Storages/Kafka/StorageKafka.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index a24630fa3b1..e65ac27b096 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -539,6 +539,9 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf) } LOG_DEBUG(log, "Finished KerberosInit"); } + #else // USE_KRB5 + if (conf.has_property("sasl.kerberos.keytab") || conf.has_property("sasl.kerberos.principal")) + LOG_WARNING(log, "Kerberos-related parameters are ignored because ClickHouse was built without support of krb5 library."); #endif // USE_KRB5 // Update consumer topic-specific configuration From ed3fe84b63d14cb05898cf12eac8024ae3921110 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 16 Jun 2022 14:45:27 +0300 Subject: [PATCH 18/60] Fix runKinit() is called only for USE_KRB5 --- src/Storages/HDFS/HDFSCommon.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 87b2d02ada0..3ac9f50231d 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -171,10 +171,12 @@ HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::A } } + #if USE_KRB5 if (builder.need_kinit) { builder.runKinit(); } + 
#endif // USE_KRB5 return builder; } From 53d656d89f9f1dafd89c21d32e3f73ed741f72db Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 20 Jun 2022 17:35:24 +0000 Subject: [PATCH 19/60] Fix deadlock in OvercommitTracker logging --- src/Common/MemoryTracker.cpp | 7 +-- src/Common/OvercommitTracker.cpp | 31 ++++++++---- src/Interpreters/InternalTextLogsQueue.cpp | 1 - src/Interpreters/InternalTextLogsQueue.h | 56 ---------------------- 4 files changed, 23 insertions(+), 72 deletions(-) diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index 51f4c83dc23..a9aef7d8465 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -338,11 +338,8 @@ void MemoryTracker::free(Int64 size) accounted_size += new_amount; } } - if (!OvercommitTrackerBlockerInThread::isBlocked()) - { - if (auto * overcommit_tracker_ptr = overcommit_tracker.load(std::memory_order_relaxed); overcommit_tracker_ptr) - overcommit_tracker_ptr->tryContinueQueryExecutionAfterFree(accounted_size); - } + if (auto * overcommit_tracker_ptr = overcommit_tracker.load(std::memory_order_relaxed)) + overcommit_tracker_ptr->tryContinueQueryExecutionAfterFree(accounted_size); if (auto * loaded_next = parent.load(std::memory_order_relaxed)) loaded_next->free(size); diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index 3cef72eb8b4..097b6637569 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -23,8 +23,16 @@ OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_) , allow_release(true) {} +#define LOG_DEBUG_SAFE(...) \ + { \ + OvercommitTrackerBlockerInThread blocker; \ + LOG_DEBUG(__VA_ARGS__); \ + } + OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) { + if (OvercommitTrackerBlockerInThread::isBlocked()) + return OvercommitResult::NONE; // NOTE: Do not change the order of locks // // global_mutex must be acquired before overcommit_m, because @@ -73,7 +81,7 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int }); auto wait_end_time = std::chrono::system_clock::now(); ProfileEvents::increment(ProfileEvents::MemoryOvercommitWaitTimeMicroseconds, (wait_end_time - wait_start_time) / 1us); - LOG_DEBUG(getLogger(), "Memory was{} freed within timeout", (timeout ? " not" : "")); + LOG_DEBUG_SAFE(getLogger(), "Memory was{} freed within timeout", (timeout ? " not" : "")) required_memory -= amount; Int64 still_need = required_per_thread[tracker]; // If enough memory is freed it will be 0 @@ -98,6 +106,9 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int void OvercommitTracker::tryContinueQueryExecutionAfterFree(Int64 amount) { + if (OvercommitTrackerBlockerInThread::isBlocked()) + return; + std::lock_guard guard(overcommit_m); if (cancellation_state != QueryCancellationState::NONE) { @@ -112,7 +123,7 @@ void OvercommitTracker::onQueryStop(MemoryTracker * tracker) std::unique_lock lk(overcommit_m); if (picked_tracker == tracker) { - LOG_DEBUG(getLogger(), "Picked query stopped"); + LOG_DEBUG_SAFE(getLogger(), "Picked query stopped") reset(); cv.notify_all(); @@ -140,7 +151,7 @@ void UserOvercommitTracker::pickQueryToExcludeImpl() // At this moment query list must be read only. // This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery. 
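/*
 * Background for the LOG_DEBUG_SAFE / blocker changes in this patch: writing to the
 * text log can itself allocate, and that allocation path calls back into the
 * OvercommitTracker while overcommit_m is already held - the deadlock being fixed here.
 * The guard is a thread-local RAII blocker checked at the tracker entry points.
 * A self-contained sketch of that pattern (names are illustrative, not the actual
 * ClickHouse classes):
 */
#include <cstdint>

class ScopedTrackerBlocker
{
public:
    ScopedTrackerBlocker() { ++counter; }
    ~ScopedTrackerBlocker() { --counter; }
    static bool isBlocked() { return counter > 0; }

private:
    static thread_local uint64_t counter;
};

thread_local uint64_t ScopedTrackerBlocker::counter = 0;

// Usage, mirroring LOG_DEBUG_SAFE: block re-entrancy for the duration of the log call.
//     {
//         ScopedTrackerBlocker blocker;
//         LOG_DEBUG(logger, "...");   // any tracker call triggered here sees isBlocked() == true
//     }
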
auto & queries = user_process_list->queries; - LOG_DEBUG(logger, "Trying to choose query to stop from {} queries", queries.size()); + LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", queries.size()) for (auto const & query : queries) { if (query.second->isKilled()) @@ -151,15 +162,15 @@ void UserOvercommitTracker::pickQueryToExcludeImpl() continue; auto ratio = memory_tracker->getOvercommitRatio(); - LOG_DEBUG(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit); + LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit) if (ratio.soft_limit != 0 && current_ratio < ratio) { query_tracker = memory_tracker; current_ratio = ratio; } } - LOG_DEBUG(logger, "Selected to stop query with overcommit ratio {}/{}", - current_ratio.committed, current_ratio.soft_limit); + LOG_DEBUG_SAFE(logger, "Selected to stop query with overcommit ratio {}/{}", + current_ratio.committed, current_ratio.soft_limit) picked_tracker = query_tracker; } @@ -174,7 +185,7 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl() OvercommitRatio current_ratio{0, 0}; // At this moment query list must be read only. // This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery. - LOG_DEBUG(logger, "Trying to choose query to stop from {} queries", process_list->size()); + LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", process_list->size()) for (auto const & query : process_list->processes) { if (query.isKilled()) @@ -190,15 +201,15 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl() if (!memory_tracker) continue; auto ratio = memory_tracker->getOvercommitRatio(user_soft_limit); - LOG_DEBUG(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit); + LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit) if (current_ratio < ratio) { query_tracker = memory_tracker; current_ratio = ratio; } } - LOG_DEBUG(logger, "Selected to stop query with overcommit ratio {}/{}", - current_ratio.committed, current_ratio.soft_limit); + LOG_DEBUG_SAFE(logger, "Selected to stop query with overcommit ratio {}/{}", + current_ratio.committed, current_ratio.soft_limit) picked_tracker = query_tracker; } diff --git a/src/Interpreters/InternalTextLogsQueue.cpp b/src/Interpreters/InternalTextLogsQueue.cpp index 6176e3cc865..2172a6f4261 100644 --- a/src/Interpreters/InternalTextLogsQueue.cpp +++ b/src/Interpreters/InternalTextLogsQueue.cpp @@ -38,7 +38,6 @@ MutableColumns InternalTextLogsQueue::getSampleColumns() void InternalTextLogsQueue::pushBlock(Block && log_block) { - OvercommitTrackerBlockerInThread blocker; static Block sample_block = getSampleBlock(); if (blocksHaveEqualStructure(sample_block, log_block)) diff --git a/src/Interpreters/InternalTextLogsQueue.h b/src/Interpreters/InternalTextLogsQueue.h index a7193a55178..53710fa3bd2 100644 --- a/src/Interpreters/InternalTextLogsQueue.h +++ b/src/Interpreters/InternalTextLogsQueue.h @@ -18,62 +18,6 @@ public: static Block getSampleBlock(); static MutableColumns getSampleColumns(); - template - bool push(Args &&... args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::push(std::forward(args)...); - } - - template - bool emplace(Args &&... args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::emplace(std::forward(args)...); - } - - template - bool pop(Args &&... 
args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::pop(std::forward(args)...); - } - - template - bool tryPush(Args &&... args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::tryPush(std::forward(args)...); - } - - template - bool tryEmplace(Args &&... args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::tryEmplace(std::forward(args)...); - } - - template - bool tryPop(Args &&... args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::tryPop(std::forward(args)...); - } - - template - void clear(Args &&... args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::clear(std::forward(args)...); - } - - template - void clearAndFinish(Args &&... args) - { - OvercommitTrackerBlockerInThread blocker; - return ConcurrentBoundedQueue::clearAndFinish(std::forward(args)...); - } - /// Is used to pass block from remote server to the client void pushBlock(Block && log_block); From 9ca368ac20a35e6e49f3162f00f06a6206b9e6dd Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 20 Jun 2022 23:10:40 +0000 Subject: [PATCH 20/60] Use do while(false) in macro --- src/Common/OvercommitTracker.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index 097b6637569..1e36c99b41c 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -24,10 +24,10 @@ OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_) {} #define LOG_DEBUG_SAFE(...) \ - { \ + do { \ OvercommitTrackerBlockerInThread blocker; \ LOG_DEBUG(__VA_ARGS__); \ - } + } while (false) OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) { @@ -81,7 +81,7 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int }); auto wait_end_time = std::chrono::system_clock::now(); ProfileEvents::increment(ProfileEvents::MemoryOvercommitWaitTimeMicroseconds, (wait_end_time - wait_start_time) / 1us); - LOG_DEBUG_SAFE(getLogger(), "Memory was{} freed within timeout", (timeout ? " not" : "")) + LOG_DEBUG_SAFE(getLogger(), "Memory was{} freed within timeout", (timeout ? " not" : "")); required_memory -= amount; Int64 still_need = required_per_thread[tracker]; // If enough memory is freed it will be 0 @@ -123,7 +123,7 @@ void OvercommitTracker::onQueryStop(MemoryTracker * tracker) std::unique_lock lk(overcommit_m); if (picked_tracker == tracker) { - LOG_DEBUG_SAFE(getLogger(), "Picked query stopped") + LOG_DEBUG_SAFE(getLogger(), "Picked query stopped"); reset(); cv.notify_all(); @@ -151,7 +151,7 @@ void UserOvercommitTracker::pickQueryToExcludeImpl() // At this moment query list must be read only. // This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery. 
auto & queries = user_process_list->queries; - LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", queries.size()) + LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", queries.size()); for (auto const & query : queries) { if (query.second->isKilled()) @@ -162,7 +162,7 @@ void UserOvercommitTracker::pickQueryToExcludeImpl() continue; auto ratio = memory_tracker->getOvercommitRatio(); - LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit) + LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit); if (ratio.soft_limit != 0 && current_ratio < ratio) { query_tracker = memory_tracker; @@ -170,7 +170,7 @@ void UserOvercommitTracker::pickQueryToExcludeImpl() } } LOG_DEBUG_SAFE(logger, "Selected to stop query with overcommit ratio {}/{}", - current_ratio.committed, current_ratio.soft_limit) + current_ratio.committed, current_ratio.soft_limit); picked_tracker = query_tracker; } @@ -185,7 +185,7 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl() OvercommitRatio current_ratio{0, 0}; // At this moment query list must be read only. // This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery. - LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", process_list->size()) + LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", process_list->size()); for (auto const & query : process_list->processes) { if (query.isKilled()) @@ -201,7 +201,7 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl() if (!memory_tracker) continue; auto ratio = memory_tracker->getOvercommitRatio(user_soft_limit); - LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit) + LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit); if (current_ratio < ratio) { query_tracker = memory_tracker; @@ -209,7 +209,7 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl() } } LOG_DEBUG_SAFE(logger, "Selected to stop query with overcommit ratio {}/{}", - current_ratio.committed, current_ratio.soft_limit) + current_ratio.committed, current_ratio.soft_limit); picked_tracker = query_tracker; } From e10f079bd35c377170abb2a1b984e83f1ce7c0a3 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 21 Jun 2022 10:15:33 +0000 Subject: [PATCH 21/60] Get rid of allocations in OvercommitTracker --- src/Common/OvercommitTracker.cpp | 19 ++++++++++--------- src/Common/OvercommitTracker.h | 10 ++++++++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index 1e36c99b41c..6f688ca28ff 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -20,6 +20,8 @@ OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_) , global_mutex(global_mutex_) , freed_memory(0) , required_memory(0) + , next_id(0) + , id_to_release(0) , allow_release(true) {} @@ -42,6 +44,8 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int std::unique_lock global_lock(global_mutex); std::unique_lock lk(overcommit_m); + size_t id = next_id++; + auto max_wait_time = tracker->getOvercommitWaitingTime(); if (max_wait_time == ZERO_MICROSEC) @@ -73,23 +77,21 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int allow_release = true; required_memory += amount; - required_per_thread[tracker] = amount; auto wait_start_time = std::chrono::system_clock::now(); - bool timeout = !cv.wait_for(lk, 
max_wait_time, [this, tracker]() + bool timeout = !cv.wait_for(lk, max_wait_time, [this, id]() { - return required_per_thread[tracker] == 0 || cancellation_state == QueryCancellationState::NONE; + return id < id_to_release || cancellation_state == QueryCancellationState::NONE; }); auto wait_end_time = std::chrono::system_clock::now(); ProfileEvents::increment(ProfileEvents::MemoryOvercommitWaitTimeMicroseconds, (wait_end_time - wait_start_time) / 1us); LOG_DEBUG_SAFE(getLogger(), "Memory was{} freed within timeout", (timeout ? " not" : "")); required_memory -= amount; - Int64 still_need = required_per_thread[tracker]; // If enough memory is freed it will be 0 - required_per_thread.erase(tracker); + bool still_need = !(id < id_to_release); // True if thread wasn't released // If threads where not released since last call of this method, // we can release them now. - if (allow_release && required_memory <= freed_memory && still_need != 0) + if (allow_release && required_memory <= freed_memory && still_need) releaseThreads(); // All required amount of memory is free now and selected query to stop doesn't know about it. @@ -98,7 +100,7 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int reset(); if (timeout) return OvercommitResult::TIMEOUTED; - if (still_need != 0) + if (still_need) return OvercommitResult::NOT_ENOUGH_FREED; else return OvercommitResult::MEMORY_FREED; @@ -132,8 +134,7 @@ void OvercommitTracker::onQueryStop(MemoryTracker * tracker) void OvercommitTracker::releaseThreads() { - for (auto & required : required_per_thread) - required.second = 0; + id_to_release = next_id; freed_memory = 0; allow_release = false; // To avoid repeating call of this method in OvercommitTracker::needToStopQuery cv.notify_all(); diff --git a/src/Common/OvercommitTracker.h b/src/Common/OvercommitTracker.h index 80aaed68e37..6a03c679422 100644 --- a/src/Common/OvercommitTracker.h +++ b/src/Common/OvercommitTracker.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,10 @@ private: picked_tracker = nullptr; cancellation_state = QueryCancellationState::NONE; freed_memory = 0; + + next_id = 0; + id_to_release = 0; + allow_release = true; } @@ -111,8 +116,6 @@ private: QueryCancellationState cancellation_state; - std::unordered_map required_per_thread; - // Global mutex which is used in ProcessList to synchronize // insertion and deletion of queries. // OvercommitTracker::pickQueryToExcludeImpl() implementations @@ -122,6 +125,9 @@ private: Int64 freed_memory; Int64 required_memory; + size_t next_id; // Id provided to the next thread to come in OvercommitTracker + size_t id_to_release; // We can release all threads with id smaller than this + bool allow_release; }; From 099055c1833224bf9212897a7a283a7cf529c803 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 21 Jun 2022 12:26:13 +0000 Subject: [PATCH 22/60] Explicitly forbid allocations in OvercommitTracker --- src/Common/OvercommitTracker.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index 6f688ca28ff..74235748345 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -25,14 +25,24 @@ OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_) , allow_release(true) {} -#define LOG_DEBUG_SAFE(...) \ - do { \ - OvercommitTrackerBlockerInThread blocker; \ - LOG_DEBUG(__VA_ARGS__); \ +#define LOG_DEBUG_SAFE(...) 
\ + do { \ + OvercommitTrackerBlockerInThread blocker; \ + try \ + { \ + ALLOW_ALLOCATIONS_IN_SCOPE; \ + LOG_DEBUG(__VA_ARGS__); \ + } \ + catch (std::bad_alloc const &) \ + { \ + fprintf(stderr, "Allocation failed during writing to log in OvercommitTracker\n"); \ + } \ } while (false) OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) { + DENY_ALLOCATIONS_IN_SCOPE; + if (OvercommitTrackerBlockerInThread::isBlocked()) return OvercommitResult::NONE; // NOTE: Do not change the order of locks @@ -108,6 +118,8 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int void OvercommitTracker::tryContinueQueryExecutionAfterFree(Int64 amount) { + DENY_ALLOCATIONS_IN_SCOPE; + if (OvercommitTrackerBlockerInThread::isBlocked()) return; @@ -122,6 +134,8 @@ void OvercommitTracker::tryContinueQueryExecutionAfterFree(Int64 amount) void OvercommitTracker::onQueryStop(MemoryTracker * tracker) { + DENY_ALLOCATIONS_IN_SCOPE; + std::unique_lock lk(overcommit_m); if (picked_tracker == tracker) { From 1d6eea6cfa8068284ab06f2c7d5f8dfc8e091783 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 21 Jun 2022 18:55:17 +0300 Subject: [PATCH 23/60] Replace LOG_DEBUG by LOG_TRACE in KerberosInit; Correct exception message; Prohibit making a copy of KerberosInit --- src/Access/KerberosInit.cpp | 40 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 8d4e2339bbf..e78830bdaa5 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #if USE_KRB5 #include #include @@ -27,7 +28,10 @@ struct k5_data krb5_boolean switch_to_cache; }; -class KerberosInit +/** + * This class implements programmatic implementation of kinit. 
+ */ +class KerberosInit : boost::noncopyable { public: int init(const String & keytab_file, const String & principal, const String & cache_name = ""); @@ -44,7 +48,7 @@ private: int KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) { auto log = &Poco::Logger::get("KerberosInit"); - LOG_DEBUG(log,"Trying to authenticate to Kerberos v5"); + LOG_TRACE(log,"Trying to authenticate to Kerberos v5"); krb5_error_code ret; @@ -52,7 +56,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con int flags = 0; if (!std::filesystem::exists(keytab_file)) - throw Exception("Error keytab file does not exist", ErrorCodes::KERBEROS_ERROR); + throw Exception("Keytab file does not exist", ErrorCodes::KERBEROS_ERROR); memset(&k5, 0, sizeof(k5)); ret = krb5_init_context(&k5.ctx); @@ -64,7 +68,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con ret = krb5_cc_resolve(k5.ctx, cache_name.c_str(), &k5.out_cc); if (ret) throw Exception("Error in resolving cache", ErrorCodes::KERBEROS_ERROR); - LOG_DEBUG(log,"Resolved cache"); + LOG_TRACE(log,"Resolved cache"); } else { @@ -72,7 +76,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con ret = krb5_cc_default(k5.ctx, &defcache); if (ret) throw Exception("Error while getting default cache", ErrorCodes::KERBEROS_ERROR); - LOG_DEBUG(log,"Resolved default cache"); + LOG_TRACE(log,"Resolved default cache"); deftype = krb5_cc_get_type(k5.ctx, defcache); if (krb5_cc_get_principal(k5.ctx, defcache, &defcache_princ) != 0) defcache_princ = nullptr; @@ -92,7 +96,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con throw Exception("Error while searching for cache for " + principal, ErrorCodes::KERBEROS_ERROR); if (!ret) { - LOG_DEBUG(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); + LOG_TRACE(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); k5.switch_to_cache = 1; } else if (defcache_princ != nullptr) @@ -101,7 +105,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con ret = krb5_cc_new_unique(k5.ctx, deftype, nullptr, &k5.out_cc); if (ret) throw Exception("Error while generating new cache", ErrorCodes::KERBEROS_ERROR); - LOG_DEBUG(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); + LOG_TRACE(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); k5.switch_to_cache = 1; } } @@ -111,13 +115,13 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con { k5.out_cc = defcache; defcache = nullptr; - LOG_DEBUG(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); + LOG_TRACE(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); } ret = krb5_unparse_name(k5.ctx, k5.me, &k5.name); if (ret) throw Exception("Error when unparsing name", ErrorCodes::KERBEROS_ERROR); - LOG_DEBUG(log,"Using principal: {}", k5.name); + LOG_TRACE(log,"Using principal: {}", k5.name); memset(&my_creds, 0, sizeof(my_creds)); ret = krb5_get_init_creds_opt_alloc(k5.ctx, &options); @@ -128,7 +132,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con ret = krb5_kt_resolve(k5.ctx, keytab_file.c_str(), &keytab); if (ret) throw Exception("Error in resolving keytab "+keytab_file, ErrorCodes::KERBEROS_ERROR); - LOG_DEBUG(log,"Using keytab: {}", keytab_file); + LOG_TRACE(log,"Using keytab: {}", keytab_file); if (k5.in_cc) { @@ -141,28 +145,28 @@ int 
KerberosInit::init(const String & keytab_file, const String & principal, con throw Exception("Error in setting output credential cache", ErrorCodes::KERBEROS_ERROR); // Action: init or renew - LOG_DEBUG(log,"Trying to renew credentials"); + LOG_TRACE(log,"Trying to renew credentials"); ret = krb5_get_renewed_creds(k5.ctx, &my_creds, k5.me, k5.out_cc, nullptr); if (ret) { - LOG_DEBUG(log,"Renew failed ({}). Trying to get initial credentials", ret); + LOG_TRACE(log,"Renew failed ({}). Trying to get initial credentials", ret); ret = krb5_get_init_creds_keytab(k5.ctx, &my_creds, k5.me, keytab, 0, nullptr, options); if (ret) throw Exception("Error in getting initial credentials", ErrorCodes::KERBEROS_ERROR); else - LOG_DEBUG(log,"Got initial credentials"); + LOG_TRACE(log,"Got initial credentials"); } else { - LOG_DEBUG(log,"Successful renewal"); + LOG_TRACE(log,"Successful renewal"); ret = krb5_cc_initialize(k5.ctx, k5.out_cc, k5.me); if (ret) throw Exception("Error when initializing cache", ErrorCodes::KERBEROS_ERROR); - LOG_DEBUG(log,"Initialized cache"); + LOG_TRACE(log,"Initialized cache"); ret = krb5_cc_store_cred(k5.ctx, k5.out_cc, &my_creds); if (ret) - LOG_DEBUG(log,"Error while storing credentials"); - LOG_DEBUG(log,"Stored credentials"); + LOG_TRACE(log,"Error while storing credentials"); + LOG_TRACE(log,"Stored credentials"); } if (k5.switch_to_cache) @@ -172,7 +176,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con throw Exception("Error while switching to new cache", ErrorCodes::KERBEROS_ERROR); } - LOG_DEBUG(log,"Authenticated to Kerberos v5"); + LOG_TRACE(log,"Authenticated to Kerberos v5"); return 0; } From f2811995887893e800bea3d0fe87cbd89d187d27 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 22 Jun 2022 11:28:00 +0300 Subject: [PATCH 24/60] Fix code style in KerberosInit; Add anonymous namespace; Add comment about using kerberos_init --- src/Access/KerberosInit.cpp | 9 ++++++--- src/Access/examples/kerberos_init.cpp | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index e78830bdaa5..435a8126b05 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -19,7 +19,9 @@ namespace ErrorCodes } } -struct k5_data +namespace +{ +struct K5Data { krb5_context ctx; krb5_ccache in_cc, out_cc; @@ -37,17 +39,18 @@ public: int init(const String & keytab_file, const String & principal, const String & cache_name = ""); ~KerberosInit(); private: - struct k5_data k5; + struct K5Data k5; krb5_ccache defcache = nullptr; krb5_get_init_creds_opt * options = nullptr; krb5_creds my_creds; krb5_keytab keytab = nullptr; krb5_principal defcache_princ = nullptr; }; +} int KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) { - auto log = &Poco::Logger::get("KerberosInit"); + auto * log = &Poco::Logger::get("KerberosInit"); LOG_TRACE(log,"Trying to authenticate to Kerberos v5"); krb5_error_code ret; diff --git a/src/Access/examples/kerberos_init.cpp b/src/Access/examples/kerberos_init.cpp index 4868182fba2..5dbe92a5b57 100644 --- a/src/Access/examples/kerberos_init.cpp +++ b/src/Access/examples/kerberos_init.cpp @@ -5,6 +5,13 @@ #include #include +/** The example demonstrates using of kerberosInit function to obtain and cache Kerberos ticket-granting ticket. + * The first argument specifies keytab file. The second argument specifies principal name. + * The third argument is optional. 
It specifies credentials cache location. + * After successful run of kerberos_init it is useful to call klist command to list cached Kerberos tickets. + * It is also useful to run kdestroy to destroy Kerberos tickets if needed. + */ + using namespace DB; int main(int argc, char ** argv) From 7bd65c8c24b444d23514758a27269b614560a96c Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 22 Jun 2022 16:31:48 +0300 Subject: [PATCH 25/60] Add comments to KerberosInit; Remove input cache and flags from KerberosInit --- src/Access/KerberosInit.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 435a8126b05..d3be559cba7 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -24,7 +24,7 @@ namespace struct K5Data { krb5_context ctx; - krb5_ccache in_cc, out_cc; + krb5_ccache out_cc; krb5_principal me; char * name; krb5_boolean switch_to_cache; @@ -42,6 +42,7 @@ private: struct K5Data k5; krb5_ccache defcache = nullptr; krb5_get_init_creds_opt * options = nullptr; + // Credentials structure including ticket, session key, and lifetime info. krb5_creds my_creds; krb5_keytab keytab = nullptr; krb5_principal defcache_princ = nullptr; @@ -56,7 +57,6 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con krb5_error_code ret; const char *deftype = nullptr; - int flags = 0; if (!std::filesystem::exists(keytab_file)) throw Exception("Keytab file does not exist", ErrorCodes::KERBEROS_ERROR); @@ -86,7 +86,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con } // Use the specified principal name. - ret = krb5_parse_name_flags(k5.ctx, principal.c_str(), flags, &k5.me); + ret = krb5_parse_name_flags(k5.ctx, principal.c_str(), 0, &k5.me); if (ret) throw Exception("Error when parsing principal name " + principal, ErrorCodes::KERBEROS_ERROR); @@ -126,7 +126,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con throw Exception("Error when unparsing name", ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Using principal: {}", k5.name); - memset(&my_creds, 0, sizeof(my_creds)); + // Allocate a new initial credential options structure. ret = krb5_get_init_creds_opt_alloc(k5.ctx, &options); if (ret) throw Exception("Error in options allocation", ErrorCodes::KERBEROS_ERROR); @@ -137,22 +137,20 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con throw Exception("Error in resolving keytab "+keytab_file, ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Using keytab: {}", keytab_file); - if (k5.in_cc) - { - ret = krb5_get_init_creds_opt_set_in_ccache(k5.ctx, options, k5.in_cc); - if (ret) - throw Exception("Error in setting input credential cache", ErrorCodes::KERBEROS_ERROR); - } + // Set an output credential cache in initial credential options. ret = krb5_get_init_creds_opt_set_out_ccache(k5.ctx, options, k5.out_cc); if (ret) throw Exception("Error in setting output credential cache", ErrorCodes::KERBEROS_ERROR); // Action: init or renew LOG_TRACE(log,"Trying to renew credentials"); + memset(&my_creds, 0, sizeof(my_creds)); + // Get renewed credential from KDC using an existing credential from output cache. ret = krb5_get_renewed_creds(k5.ctx, &my_creds, k5.me, k5.out_cc, nullptr); if (ret) { LOG_TRACE(log,"Renew failed ({}). Trying to get initial credentials", ret); + // Request KDC for an initial credentials using keytab. 
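/*
 * Taken together with the example program above, the public surface of this module is the
 * single kerberosInit() free function declared in KerberosInit.h. A minimal, illustrative
 * caller is sketched below (keytab path and principal are placeholders; the real call sites
 * in this series are HDFSBuilderWrapper::runKinit and StorageKafka::updateConfiguration):
 */
#include <iostream>
#include <Access/KerberosInit.h>
#include <Common/Exception.h>

void obtainTicket()
{
    try
    {
        // keytab file, principal, optional credential-cache name (system default cache when omitted)
        kerberosInit("/path/to/service.keytab", "user@EXAMPLE.COM");
    }
    catch (const DB::Exception & e)
    {
        std::cerr << "KerberosInit failure: " << DB::getExceptionMessage(e, false) << "\n";
    }
}
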
ret = krb5_get_init_creds_keytab(k5.ctx, &my_creds, k5.me, keytab, 0, nullptr, options); if (ret) throw Exception("Error in getting initial credentials", ErrorCodes::KERBEROS_ERROR); @@ -162,10 +160,12 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con else { LOG_TRACE(log,"Successful renewal"); + // Initialize a credential cache. Destroy any existing contents of cache and initialize it for the default principal. ret = krb5_cc_initialize(k5.ctx, k5.out_cc, k5.me); if (ret) throw Exception("Error when initializing cache", ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Initialized cache"); + // Store credentials in a credential cache. ret = krb5_cc_store_cred(k5.ctx, k5.out_cc, &my_creds); if (ret) LOG_TRACE(log,"Error while storing credentials"); @@ -174,6 +174,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con if (k5.switch_to_cache) { + // Make a credential cache the primary cache for its collection. ret = krb5_cc_switch(k5.ctx, k5.out_cc); if (ret) throw Exception("Error while switching to new cache", ErrorCodes::KERBEROS_ERROR); @@ -201,8 +202,6 @@ KerberosInit::~KerberosInit() krb5_free_unparsed_name(k5.ctx, k5.name); krb5_free_principal(k5.ctx, k5.me); - if (k5.in_cc != nullptr) - krb5_cc_close(k5.ctx, k5.in_cc); if (k5.out_cc != nullptr) krb5_cc_close(k5.ctx, k5.out_cc); krb5_free_context(k5.ctx); From 3544fbe5c49b13f81e5a1870b75b3950ab5f2647 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 22 Jun 2022 14:05:20 +0000 Subject: [PATCH 26/60] cleanup --- src/Common/OvercommitTracker.cpp | 25 +++++++++++++------------ src/Common/OvercommitTracker.h | 1 - 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index 74235748345..4faed833428 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -25,18 +25,19 @@ OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_) , allow_release(true) {} -#define LOG_DEBUG_SAFE(...) \ - do { \ - OvercommitTrackerBlockerInThread blocker; \ - try \ - { \ - ALLOW_ALLOCATIONS_IN_SCOPE; \ - LOG_DEBUG(__VA_ARGS__); \ - } \ - catch (std::bad_alloc const &) \ - { \ - fprintf(stderr, "Allocation failed during writing to log in OvercommitTracker\n"); \ - } \ +#define LOG_DEBUG_SAFE(...) \ + do { \ + OvercommitTrackerBlockerInThread blocker; \ + try \ + { \ + ALLOW_ALLOCATIONS_IN_SCOPE; \ + LOG_DEBUG(__VA_ARGS__); \ + } \ + catch (...) \ + { \ + if (fprintf(stderr, "Allocation failed during writing to log in OvercommitTracker\n") != -1) \ + ; \ + } \ } while (false) OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) diff --git a/src/Common/OvercommitTracker.h b/src/Common/OvercommitTracker.h index 6a03c679422..6f6788493a9 100644 --- a/src/Common/OvercommitTracker.h +++ b/src/Common/OvercommitTracker.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include From ffab75baa7ec4242ef1d689095a3f64d6c0d4da3 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 22 Jun 2022 23:33:20 +0200 Subject: [PATCH 27/60] Checkout full repositories for performance tests --- .github/workflows/backport_branches.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index b93c1b61ffd..e88a121b5d3 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -143,6 +143,8 @@ jobs: sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - name: Check out repository code uses: actions/checkout@v2 + with: + fetch-depth: 0 # For a proper version and performance artifacts - name: Build run: | git -C "$GITHUB_WORKSPACE" submodule sync --recursive @@ -188,6 +190,8 @@ jobs: sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - name: Check out repository code uses: actions/checkout@v2 + with: + fetch-depth: 0 # For a proper version and performance artifacts - name: Build run: | git -C "$GITHUB_WORKSPACE" submodule sync --recursive From 4bf1fc47188c7c158f6661352c608370d779909e Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 23 Jun 2022 10:28:31 +0300 Subject: [PATCH 28/60] Add error code and message displaying in exceptions of KerberosInit; Correct code style in KerberosInit --- src/Access/KerberosInit.cpp | 54 ++++++++++++++++++++++--------------- src/Access/KerberosInit.h | 2 +- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index d3be559cba7..9ca45b12531 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #if USE_KRB5 #include #include @@ -36,23 +37,34 @@ struct K5Data class KerberosInit : boost::noncopyable { public: - int init(const String & keytab_file, const String & principal, const String & cache_name = ""); + void init(const String & keytab_file, const String & principal, const String & cache_name = ""); ~KerberosInit(); private: - struct K5Data k5; + struct K5Data k5 {}; krb5_ccache defcache = nullptr; krb5_get_init_creds_opt * options = nullptr; // Credentials structure including ticket, session key, and lifetime info. 
krb5_creds my_creds; krb5_keytab keytab = nullptr; krb5_principal defcache_princ = nullptr; + String fmtError(krb5_error_code code); }; } -int KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) + +String KerberosInit::fmtError(krb5_error_code code) +{ + const char *msg; + msg = krb5_get_error_message(k5.ctx, code); + String fmt_error = fmt::format(" ({}, {})", code, msg); + krb5_free_error_message(k5.ctx, msg); + return fmt_error; +} + +void KerberosInit::init(const String & keytab_file, const String & principal, const String & cache_name) { auto * log = &Poco::Logger::get("KerberosInit"); - LOG_TRACE(log,"Trying to authenticate to Kerberos v5"); + LOG_TRACE(log,"Trying to authenticate with Kerberos v5"); krb5_error_code ret; @@ -61,16 +73,15 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con if (!std::filesystem::exists(keytab_file)) throw Exception("Keytab file does not exist", ErrorCodes::KERBEROS_ERROR); - memset(&k5, 0, sizeof(k5)); ret = krb5_init_context(&k5.ctx); if (ret) - throw Exception("Error while initializing Kerberos 5 library", ErrorCodes::KERBEROS_ERROR); + throw Exception(fmt::format("Error while initializing Kerberos 5 library ({})", ret), ErrorCodes::KERBEROS_ERROR); if (!cache_name.empty()) { ret = krb5_cc_resolve(k5.ctx, cache_name.c_str(), &k5.out_cc); if (ret) - throw Exception("Error in resolving cache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error in resolving cache" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Resolved cache"); } else @@ -78,7 +89,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Resolve the default cache and get its type and default principal (if it is initialized). ret = krb5_cc_default(k5.ctx, &defcache); if (ret) - throw Exception("Error while getting default cache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error while getting default cache" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Resolved default cache"); deftype = krb5_cc_get_type(k5.ctx, defcache); if (krb5_cc_get_principal(k5.ctx, defcache, &defcache_princ) != 0) @@ -88,7 +99,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Use the specified principal name. ret = krb5_parse_name_flags(k5.ctx, principal.c_str(), 0, &k5.me); if (ret) - throw Exception("Error when parsing principal name " + principal, ErrorCodes::KERBEROS_ERROR); + throw Exception("Error when parsing principal name " + principal + fmtError(ret), ErrorCodes::KERBEROS_ERROR); // Cache related commands if (k5.out_cc == nullptr && krb5_cc_support_switch(k5.ctx, deftype)) @@ -96,8 +107,8 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Use an existing cache for the client principal if we can. ret = krb5_cc_cache_match(k5.ctx, k5.me, &k5.out_cc); if (ret && ret != KRB5_CC_NOTFOUND) - throw Exception("Error while searching for cache for " + principal, ErrorCodes::KERBEROS_ERROR); - if (!ret) + throw Exception("Error while searching for cache for " + principal + fmtError(ret), ErrorCodes::KERBEROS_ERROR); + if (0 == ret) { LOG_TRACE(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); k5.switch_to_cache = 1; @@ -107,7 +118,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Create a new cache to avoid overwriting the initialized default cache. 
ret = krb5_cc_new_unique(k5.ctx, deftype, nullptr, &k5.out_cc); if (ret) - throw Exception("Error while generating new cache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error while generating new cache" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Using default cache: {}", krb5_cc_get_name(k5.ctx, k5.out_cc)); k5.switch_to_cache = 1; } @@ -123,24 +134,24 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con ret = krb5_unparse_name(k5.ctx, k5.me, &k5.name); if (ret) - throw Exception("Error when unparsing name", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error when unparsing name" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Using principal: {}", k5.name); // Allocate a new initial credential options structure. ret = krb5_get_init_creds_opt_alloc(k5.ctx, &options); if (ret) - throw Exception("Error in options allocation", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error in options allocation" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); // Resolve keytab ret = krb5_kt_resolve(k5.ctx, keytab_file.c_str(), &keytab); if (ret) - throw Exception("Error in resolving keytab "+keytab_file, ErrorCodes::KERBEROS_ERROR); + throw Exception("Error in resolving keytab "+keytab_file + fmtError(ret), ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Using keytab: {}", keytab_file); // Set an output credential cache in initial credential options. ret = krb5_get_init_creds_opt_set_out_ccache(k5.ctx, options, k5.out_cc); if (ret) - throw Exception("Error in setting output credential cache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error in setting output credential cache" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); // Action: init or renew LOG_TRACE(log,"Trying to renew credentials"); @@ -153,7 +164,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Request KDC for an initial credentials using keytab. ret = krb5_get_init_creds_keytab(k5.ctx, &my_creds, k5.me, keytab, 0, nullptr, options); if (ret) - throw Exception("Error in getting initial credentials", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error in getting initial credentials" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); else LOG_TRACE(log,"Got initial credentials"); } @@ -163,7 +174,7 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Initialize a credential cache. Destroy any existing contents of cache and initialize it for the default principal. ret = krb5_cc_initialize(k5.ctx, k5.out_cc, k5.me); if (ret) - throw Exception("Error when initializing cache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error when initializing cache" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); LOG_TRACE(log,"Initialized cache"); // Store credentials in a credential cache. ret = krb5_cc_store_cred(k5.ctx, k5.out_cc, &my_creds); @@ -177,11 +188,10 @@ int KerberosInit::init(const String & keytab_file, const String & principal, con // Make a credential cache the primary cache for its collection. 
ret = krb5_cc_switch(k5.ctx, k5.out_cc); if (ret) - throw Exception("Error while switching to new cache", ErrorCodes::KERBEROS_ERROR); + throw Exception("Error while switching to new cache" + fmtError(ret), ErrorCodes::KERBEROS_ERROR); } LOG_TRACE(log,"Authenticated to Kerberos v5"); - return 0; } KerberosInit::~KerberosInit() @@ -208,12 +218,12 @@ KerberosInit::~KerberosInit() } } -int kerberosInit(const String & keytab_file, const String & principal, const String & cache_name) +void kerberosInit(const String & keytab_file, const String & principal, const String & cache_name) { // Using mutex to prevent cache file corruptions static std::mutex kinit_mtx; std::unique_lock lck(kinit_mtx); KerberosInit k_init; - return k_init.init(keytab_file, principal, cache_name); + k_init.init(keytab_file, principal, cache_name); } #endif // USE_KRB5 diff --git a/src/Access/KerberosInit.h b/src/Access/KerberosInit.h index 4731baf706f..5a11a275529 100644 --- a/src/Access/KerberosInit.h +++ b/src/Access/KerberosInit.h @@ -6,6 +6,6 @@ #if USE_KRB5 -int kerberosInit(const String & keytab_file, const String & principal, const String & cache_name = ""); +void kerberosInit(const String & keytab_file, const String & principal, const String & cache_name = ""); #endif // USE_KRB5 From cb748cd8ec46b1962bfd28c41eeaab4b53e90c75 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 23 Jun 2022 16:11:48 +0300 Subject: [PATCH 29/60] Fix code style in KerberosInit --- src/Access/KerberosInit.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Access/KerberosInit.cpp b/src/Access/KerberosInit.cpp index 9ca45b12531..ace03a5e0b5 100644 --- a/src/Access/KerberosInit.cpp +++ b/src/Access/KerberosInit.cpp @@ -47,12 +47,12 @@ private: krb5_creds my_creds; krb5_keytab keytab = nullptr; krb5_principal defcache_princ = nullptr; - String fmtError(krb5_error_code code); + String fmtError(krb5_error_code code) const; }; } -String KerberosInit::fmtError(krb5_error_code code) +String KerberosInit::fmtError(krb5_error_code code) const { const char *msg; msg = krb5_get_error_message(k5.ctx, code); @@ -75,7 +75,7 @@ void KerberosInit::init(const String & keytab_file, const String & principal, co ret = krb5_init_context(&k5.ctx); if (ret) - throw Exception(fmt::format("Error while initializing Kerberos 5 library ({})", ret), ErrorCodes::KERBEROS_ERROR); + throw Exception(ErrorCodes::KERBEROS_ERROR, "Error while initializing Kerberos 5 library ({})", ret); if (!cache_name.empty()) { @@ -160,7 +160,8 @@ void KerberosInit::init(const String & keytab_file, const String & principal, co ret = krb5_get_renewed_creds(k5.ctx, &my_creds, k5.me, k5.out_cc, nullptr); if (ret) { - LOG_TRACE(log,"Renew failed ({}). Trying to get initial credentials", ret); + LOG_TRACE(log,"Renew failed {}", fmtError(ret)); + LOG_TRACE(log,"Trying to get initial credentials"); // Request KDC for an initial credentials using keytab. 
ret = krb5_get_init_creds_keytab(k5.ctx, &my_creds, k5.me, keytab, 0, nullptr, options); if (ret) From 7f7d082fb30e120ddd0cf94ec8fccd01b2442ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 23 Jun 2022 15:23:37 +0200 Subject: [PATCH 30/60] Add implicit_transaction setting --- src/Core/Settings.h | 1 + .../InterpreterTransactionControlQuery.h | 1 - src/Interpreters/executeQuery.cpp | 94 ++++++++++++++----- .../02345_implicit_transaction.reference | 14 +++ .../02345_implicit_transaction.sql | 92 ++++++++++++++++++ 5 files changed, 178 insertions(+), 24 deletions(-) create mode 100644 tests/queries/0_stateless/02345_implicit_transaction.reference create mode 100644 tests/queries/0_stateless/02345_implicit_transaction.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f1fd9d20f00..72ba2c0c13b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -601,6 +601,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, count_distinct_optimization, false, "Rewrite count distinct to subquery of group by", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ + M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \ M(Bool, throw_if_no_data_to_insert, true, "Enables or disables empty INSERTs, enabled by default", 0) \ M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \ diff --git a/src/Interpreters/InterpreterTransactionControlQuery.h b/src/Interpreters/InterpreterTransactionControlQuery.h index bf2dc7891a7..a66a740ce0c 100644 --- a/src/Interpreters/InterpreterTransactionControlQuery.h +++ b/src/Interpreters/InterpreterTransactionControlQuery.h @@ -20,7 +20,6 @@ public: bool ignoreLimits() const override { return true; } bool supportsTransactions() const override { return true; } -private: BlockIO executeBegin(ContextMutablePtr session_context); BlockIO executeCommit(ContextMutablePtr session_context); static BlockIO executeRollback(ContextMutablePtr session_context); diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 24649128cee..ae622e5e1f0 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -42,18 +42,19 @@ #include #include #include -#include #include +#include +#include #include #include #include -#include #include +#include #include -#include -#include -#include #include +#include +#include +#include #include #include @@ -68,6 +69,7 @@ #include #include +#include #include @@ -416,7 +418,9 @@ static std::tuple executeQueryImpl( chassert(txn->getState() != MergeTreeTransaction::COMMITTING); chassert(txn->getState() != MergeTreeTransaction::COMMITTED); if (txn->getState() == MergeTreeTransaction::ROLLED_BACK && !ast->as() && !ast->as()) - throw Exception(ErrorCodes::INVALID_TRANSACTION, "Cannot execute query because current transaction failed. 
Expecting ROLLBACK statement."); + throw Exception( + ErrorCodes::INVALID_TRANSACTION, + "Cannot execute query because current transaction failed. Expecting ROLLBACK statement"); } /// Interpret SETTINGS clauses as early as possible (before invoking the corresponding interpreter), @@ -498,6 +502,7 @@ static std::tuple executeQueryImpl( BlockIO res; String query_for_logging; + std::shared_ptr implicit_txn_control{}; try { @@ -626,6 +631,27 @@ static std::tuple executeQueryImpl( } else { + /// We need to start the (implicit) transaction before getting the interpreter as this will get links to the latest snapshots + if (!context->getCurrentTransaction() && settings.implicit_transaction && !ast->as()) + { + try + { + /// If there is no session (which is the default for the HTTP Handler), set up one just for this as it is necessary + /// to control the transaction lifetime + if (!context->hasSessionContext()) + context->makeSessionContext(); + + auto tc = std::make_shared(ast, context); + tc->executeBegin(context->getSessionContext()); + implicit_txn_control = std::move(tc); + } + catch (Exception & e) + { + e.addMessage("while starting a transaction with 'implicit_transaction'"); + throw; + } + } + interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal)); if (context->getCurrentTransaction() && !interpreter->supportsTransactions() && @@ -813,15 +839,16 @@ static std::tuple executeQueryImpl( }; /// Also make possible for caller to log successful query finish and exception during execution. - auto finish_callback = [elem, context, ast, - log_queries, - log_queries_min_type = settings.log_queries_min_type, - log_queries_min_query_duration_ms = settings.log_queries_min_query_duration_ms.totalMilliseconds(), - log_processors_profiles = settings.log_processors_profiles, - status_info_to_query_log, - pulling_pipeline = pipeline.pulling() - ] - (QueryPipeline & query_pipeline) mutable + auto finish_callback = [elem, + context, + ast, + log_queries, + log_queries_min_type = settings.log_queries_min_type, + log_queries_min_query_duration_ms = settings.log_queries_min_query_duration_ms.totalMilliseconds(), + log_processors_profiles = settings.log_processors_profiles, + status_info_to_query_log, + implicit_txn_control, + pulling_pipeline = pipeline.pulling()](QueryPipeline & query_pipeline) mutable { QueryStatus * process_list_elem = context->getProcessListElement(); @@ -942,15 +969,30 @@ static std::tuple executeQueryImpl( opentelemetry_span_log->add(span); } + + if (implicit_txn_control) + { + implicit_txn_control->executeCommit(context->getSessionContext()); + implicit_txn_control.reset(); + } }; - auto exception_callback = [elem, context, ast, - log_queries, - log_queries_min_type = settings.log_queries_min_type, - log_queries_min_query_duration_ms = settings.log_queries_min_query_duration_ms.totalMilliseconds(), - quota(quota), status_info_to_query_log] () mutable + auto exception_callback = [elem, + context, + ast, + log_queries, + log_queries_min_type = settings.log_queries_min_type, + log_queries_min_query_duration_ms = settings.log_queries_min_query_duration_ms.totalMilliseconds(), + quota(quota), + status_info_to_query_log, + implicit_txn_control]() mutable { - if (auto txn = context->getCurrentTransaction()) + if (implicit_txn_control) + { + implicit_txn_control->executeRollback(context->getSessionContext()); + implicit_txn_control.reset(); + } + else if (auto txn = context->getCurrentTransaction()) txn->onException(); if (quota) @@ -1000,7 +1042,6 @@ 
static std::tuple executeQueryImpl( { ProfileEvents::increment(ProfileEvents::FailedInsertQuery); } - }; res.finish_callback = std::move(finish_callback); @@ -1009,8 +1050,15 @@ static std::tuple executeQueryImpl( } catch (...) { - if (auto txn = context->getCurrentTransaction()) + if (implicit_txn_control) + { + implicit_txn_control->executeRollback(context->getSessionContext()); + implicit_txn_control.reset(); + } + else if (auto txn = context->getCurrentTransaction()) + { txn->onException(); + } if (!internal) { diff --git a/tests/queries/0_stateless/02345_implicit_transaction.reference b/tests/queries/0_stateless/02345_implicit_transaction.reference new file mode 100644 index 00000000000..e4dd35600f7 --- /dev/null +++ b/tests/queries/0_stateless/02345_implicit_transaction.reference @@ -0,0 +1,14 @@ +no_transaction_landing 10000 +no_transaction_target 0 +after_transaction_landing 0 +after_transaction_target 0 +after_implicit_txn_in_query_settings_landing 0 +after_implicit_txn_in_query_settings_target 0 +after_implicit_txn_in_session_landing 0 +after_implicit_txn_in_session_target 0 +inside_txn_and_implicit 1 +inside_txn_and_implicit 1 +in_transaction 10000 +out_transaction 0 +{"'implicit_True'":"implicit_True","all":"2","is_empty":0} +{"'implicit_False'":"implicit_False","all":"2","is_empty":1} diff --git a/tests/queries/0_stateless/02345_implicit_transaction.sql b/tests/queries/0_stateless/02345_implicit_transaction.sql new file mode 100644 index 00000000000..677affeec39 --- /dev/null +++ b/tests/queries/0_stateless/02345_implicit_transaction.sql @@ -0,0 +1,92 @@ +CREATE TABLE landing (n Int64) engine=MergeTree order by n; +CREATE TABLE target (n Int64) engine=MergeTree order by n; +CREATE MATERIALIZED VIEW landing_to_target TO target AS + SELECT n + throwIf(n == 3333) + FROM landing; + +INSERT INTO landing SELECT * FROM numbers(10000); -- { serverError 395 } +SELECT 'no_transaction_landing', count() FROM landing; +SELECT 'no_transaction_target', count() FROM target; + +TRUNCATE TABLE landing; +TRUNCATE TABLE target; + + +BEGIN TRANSACTION; +INSERT INTO landing SELECT * FROM numbers(10000); -- { serverError 395 } +ROLLBACK; +SELECT 'after_transaction_landing', count() FROM landing; +SELECT 'after_transaction_target', count() FROM target; + +-- Same but using implicit_transaction +INSERT INTO landing SETTINGS implicit_transaction=True SELECT * FROM numbers(10000); -- { serverError 395 } +SELECT 'after_implicit_txn_in_query_settings_landing', count() FROM landing; +SELECT 'after_implicit_txn_in_query_settings_target', count() FROM target; + +-- Same but using implicit_transaction in a session +SET implicit_transaction=True; +INSERT INTO landing SELECT * FROM numbers(10000); -- { serverError 395 } +SET implicit_transaction=False; +SELECT 'after_implicit_txn_in_session_landing', count() FROM landing; +SELECT 'after_implicit_txn_in_session_target', count() FROM target; + +-- Reading from incompatible sources with implicit_transaction works the same way as with normal transactions: +-- Currently reading from system tables inside a transaction is Not implemented: +SELECT name, value, changed FROM system.settings where name = 'implicit_transaction' SETTINGS implicit_transaction=True; -- { serverError 48 } + + +-- Verify that you don't have to manually close transactions with implicit_transaction +SET implicit_transaction=True; +SELECT throwIf(number == 0) FROM numbers(100); -- { serverError 395 } +SELECT throwIf(number == 0) FROM numbers(100); -- { serverError 395 } +SELECT throwIf(number == 
0) FROM numbers(100); -- { serverError 395 } +SELECT throwIf(number == 0) FROM numbers(100); -- { serverError 395 } +SET implicit_transaction=False; + +-- implicit_transaction is ignored when inside a transaction (no recursive transaction error) +BEGIN TRANSACTION; +SELECT 'inside_txn_and_implicit', 1 SETTINGS implicit_transaction=True; +SELECT throwIf(number == 0) FROM numbers(100) SETTINGS implicit_transaction=True; -- { serverError 395 } +ROLLBACK; + +SELECT 'inside_txn_and_implicit', 1 SETTINGS implicit_transaction=True; + +-- You can work with transactions even if `implicit_transaction=True` is set +SET implicit_transaction=True; +BEGIN TRANSACTION; +INSERT INTO target SELECT * FROM numbers(10000); +SELECT 'in_transaction', count() FROM target; +ROLLBACK; +SELECT 'out_transaction', count() FROM target; +SET implicit_transaction=False; + + +-- Verify that the transaction_id column is populated correctly +SELECT 'Looking_at_transaction_id_True' FORMAT Null SETTINGS implicit_transaction=1; +-- Verify that the transaction_id column is NOT populated without transaction +SELECT 'Looking_at_transaction_id_False' FORMAT Null SETTINGS implicit_transaction=0; +SYSTEM FLUSH LOGS; + +SELECT + 'implicit_True', + count() as all, + transaction_id = (0,0,'00000000-0000-0000-0000-000000000000') as is_empty +FROM system.query_log +WHERE + current_database = currentDatabase() AND + event_date >= yesterday() AND + query LIKE '-- Verify that the transaction_id column is populated correctly%' +GROUP BY transaction_id +FORMAT JSONEachRow; + +SELECT + 'implicit_False', + count() as all, + transaction_id = (0,0,'00000000-0000-0000-0000-000000000000') as is_empty +FROM system.query_log +WHERE + current_database = currentDatabase() AND + event_date >= yesterday() AND + query LIKE '-- Verify that the transaction_id column is NOT populated without transaction%' +GROUP BY transaction_id +FORMAT JSONEachRow; From 072f64c80088750c996b8056b0d7c459cdaa5669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 Jun 2022 16:54:47 +0200 Subject: [PATCH 31/60] Improvements based on review --- src/Interpreters/executeQuery.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index ae622e5e1f0..4b328f0466e 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -626,8 +626,10 @@ static std::tuple executeQueryImpl( if (!table_id.empty()) context->setInsertionTable(table_id); - if (context->getCurrentTransaction() && context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + if (context->getCurrentTransaction() && settings.throw_on_unsupported_query_inside_transaction) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Async inserts inside transactions are not supported"); + if (settings.implicit_transaction && settings.throw_on_unsupported_query_inside_transaction) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Async inserts with 'implicit_transaction' are not supported"); } else { @@ -636,6 +638,9 @@ static std::tuple executeQueryImpl( { try { + if (context->isGlobalContext()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot create transactions"); + /// If there is no session (which is the default for the HTTP Handler), set up one just for this as it is necessary /// to control the transaction lifetime if (!context->hasSessionContext()) @@ -972,8 +977,18 @@ static std::tuple executeQueryImpl( if (implicit_txn_control) { - 
implicit_txn_control->executeCommit(context->getSessionContext()); - implicit_txn_control.reset(); + try + { + implicit_txn_control->executeCommit(context->getSessionContext()); + implicit_txn_control.reset(); + } + catch (const Exception &) + { + /// An exception might happen when trying to commit the transaction. For example we might get an immediate exception + /// because ZK is down and wait_changes_become_visible_after_commit_mode == WAIT_UNKNOWN + implicit_txn_control.reset(); + throw; + } } }; From bb7c6279645f76d3bedd72514dfc2d520990fa92 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:16:57 +0200 Subject: [PATCH 32/60] Cosmetics: Pass patterns around as std::string_view instead of StringRef - The patterns are not used in hashing, there should not be a performance impact when we use stuff from the standard library instead. - added forgotten .reserve() in FunctionsMultiStringPosition.h --- src/Common/Volnitsky.h | 16 ++++++++-------- src/Functions/FunctionsMultiStringFuzzySearch.h | 3 +-- src/Functions/FunctionsMultiStringPosition.h | 3 ++- src/Functions/FunctionsMultiStringSearch.h | 3 +-- src/Functions/MultiMatchAllIndicesImpl.h | 4 ++-- src/Functions/MultiMatchAnyImpl.h | 6 +++--- src/Functions/MultiSearchAllPositionsImpl.h | 2 +- src/Functions/MultiSearchFirstIndexImpl.h | 2 +- src/Functions/MultiSearchFirstPositionImpl.h | 2 +- src/Functions/MultiSearchImpl.h | 2 +- src/Functions/PositionImpl.h | 8 ++++---- src/Functions/Regexps.h | 6 +++--- src/Functions/hyperscanRegexpChecker.cpp | 8 ++++---- src/Functions/hyperscanRegexpChecker.h | 2 +- 14 files changed, 33 insertions(+), 34 deletions(-) diff --git a/src/Common/Volnitsky.h b/src/Common/Volnitsky.h index 7eca0c0fe53..a6aef293ac1 100644 --- a/src/Common/Volnitsky.h +++ b/src/Common/Volnitsky.h @@ -476,7 +476,7 @@ class MultiVolnitskyBase { private: /// needles and their offsets - const std::vector & needles; + const std::vector & needles; /// fallback searchers @@ -502,7 +502,7 @@ private: static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8; public: - explicit MultiVolnitskyBase(const std::vector & needles_) : needles{needles_}, step{0}, last{0} + explicit MultiVolnitskyBase(const std::vector & needles_) : needles{needles_}, step{0}, last{0} { fallback_searchers.reserve(needles.size()); hash = std::unique_ptr(new OffsetId[VolnitskyTraits::hash_size]); /// No zero initialization, it will be done later. 
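The Volnitsky.h hunk above changes the stored needle type, and the hunks that follow adjust the call sites accordingly: StringRef exposes data/size as plain members, while std::string_view exposes them as data()/size() accessors, which is why each use gains a pair of parentheses. Below is a minimal standalone sketch of the new calling convention only; it is illustrative code, not part of the patch, the function name anyNeedleFound is made up, and plain std::string_view::find stands in for the real Volnitsky searcher.

    #include <string_view>
    #include <vector>
    #include <cstdio>

    // Needles travel as std::string_view, so lengths are read via the size() accessor
    // (with StringRef this used to be the .size member).
    static bool anyNeedleFound(std::string_view haystack, const std::vector<std::string_view> & needles)
    {
        for (const auto & needle : needles)
            if (needle.size() <= haystack.size() && haystack.find(needle) != std::string_view::npos)
                return true;
        return false;
    }

    int main()
    {
        const std::vector<std::string_view> needles{"volnitsky", "hyperscan"};
        std::printf("%d\n", anyNeedleFound("multi-string search with hyperscan", needles));
        return 0;
    }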
@@ -535,8 +535,8 @@ public: for (; last < size; ++last) { - const char * cur_needle_data = needles[last].data; - const size_t cur_needle_size = needles[last].size; + const char * cur_needle_data = needles[last].data(); + const size_t cur_needle_size = needles[last].size(); /// save the indices of fallback searchers if (VolnitskyTraits::isFallbackNeedle(cur_needle_size)) @@ -593,7 +593,7 @@ public: { const auto res = pos - (hash[cell_num].off - 1); const size_t ind = hash[cell_num].id; - if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) + if (res + needles[ind].size() <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) return true; } } @@ -625,7 +625,7 @@ public: { const auto res = pos - (hash[cell_num].off - 1); const size_t ind = hash[cell_num].id; - if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) + if (res + needles[ind].size() <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) answer = std::min(answer, ind); } } @@ -663,7 +663,7 @@ public: { const auto res = pos - (hash[cell_num].off - 1); const size_t ind = hash[cell_num].id; - if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) + if (res + needles[ind].size() <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) answer = std::min(answer, res - haystack); } } @@ -699,7 +699,7 @@ public: const auto * res = pos - (hash[cell_num].off - 1); const size_t ind = hash[cell_num].id; if (answer[ind] == 0 - && res + needles[ind].size <= haystack_end + && res + needles[ind].size() <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) answer[ind] = count_chars(haystack, res); } diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index e082cfbec2f..3c2c0e523ff 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -13,7 +13,6 @@ #include #include #include -#include #include @@ -115,7 +114,7 @@ public: + ", should be at most " + std::to_string(LimitArgs), ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - std::vector refs; + std::vector refs; refs.reserve(src_arr.size()); for (const auto & el : src_arr) diff --git a/src/Functions/FunctionsMultiStringPosition.h b/src/Functions/FunctionsMultiStringPosition.h index 357606f4042..855b5448b87 100644 --- a/src/Functions/FunctionsMultiStringPosition.h +++ b/src/Functions/FunctionsMultiStringPosition.h @@ -98,7 +98,8 @@ public: + ", should be at most 255", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - std::vector refs; + std::vector refs; + refs.reserve(src_arr.size()); for (const auto & el : src_arr) refs.emplace_back(el.get()); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 4bc8af3f214..6ab85e632e4 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -13,7 +13,6 @@ #include #include #include -#include namespace DB @@ -107,7 +106,7 @@ public: + ", should be at most " + std::to_string(LimitArgs), ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - std::vector refs; + std::vector refs; refs.reserve(src_arr.size()); for (const auto & el : src_arr) diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index 80a71548deb..a9479a9d7f5 100644 --- 
a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -44,7 +44,7 @@ struct MultiMatchAllIndicesImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res, PaddedPODArray & offsets) { @@ -54,7 +54,7 @@ struct MultiMatchAllIndicesImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res, PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance) diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index fbbefe7be1d..f8d790c98cc 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -46,7 +46,7 @@ struct MultiMatchAnyImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res, PaddedPODArray & offsets) { @@ -56,7 +56,7 @@ struct MultiMatchAnyImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance) @@ -120,7 +120,7 @@ struct MultiMatchAnyImpl memset(accum.data(), 0, accum.size()); for (size_t j = 0; j < needles.size(); ++j) { - MatchImpl::vectorConstant(haystack_data, haystack_offsets, needles[j].toString(), nullptr, accum); + MatchImpl::vectorConstant(haystack_data, haystack_offsets, std::string(needles[j].data(), needles[j].size()), nullptr, accum); for (size_t i = 0; i < res.size(); ++i) { if constexpr (FindAny) diff --git a/src/Functions/MultiSearchAllPositionsImpl.h b/src/Functions/MultiSearchAllPositionsImpl.h index f54fe41f20c..4356d6110f1 100644 --- a/src/Functions/MultiSearchAllPositionsImpl.h +++ b/src/Functions/MultiSearchAllPositionsImpl.h @@ -15,7 +15,7 @@ struct MultiSearchAllPositionsImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res) { auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index 26709119f6e..e452ac03902 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -22,7 +22,7 @@ struct MultiSearchFirstIndexImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets) { diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 1db8dcbde83..1aa89b54a1e 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -22,7 +22,7 @@ struct MultiSearchFirstPositionImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] 
PaddedPODArray & offsets) { diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index 7cb0cefe580..91c3d71fdb9 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -22,7 +22,7 @@ struct MultiSearchImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets) { diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h index 82e58cdc643..5380fcc36d9 100644 --- a/src/Functions/PositionImpl.h +++ b/src/Functions/PositionImpl.h @@ -38,7 +38,7 @@ struct PositionCaseSensitiveASCII return SearcherInSmallHaystack(needle_data, needle_size); } - static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) + static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) { return MultiSearcherInBigHaystack(needles); } @@ -74,7 +74,7 @@ struct PositionCaseInsensitiveASCII return SearcherInSmallHaystack(needle_data, needle_size); } - static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) + static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) { return MultiSearcherInBigHaystack(needles); } @@ -106,7 +106,7 @@ struct PositionCaseSensitiveUTF8 return SearcherInSmallHaystack(needle_data, needle_size); } - static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) + static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) { return MultiSearcherInBigHaystack(needles); } @@ -154,7 +154,7 @@ struct PositionCaseInsensitiveUTF8 return SearcherInSmallHaystack(needle_data, needle_size); } - static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) + static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector & needles) { return MultiSearcherInBigHaystack(needles); } diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index ac37875f91e..a85c89774a2 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -278,15 +278,15 @@ inline Regexps constructRegexps(const std::vector & str_patterns, std::o /// template has its own copy of local static variables which must not be the same /// for different hyperscan compilations. template -inline Regexps * get(const std::vector & patterns, std::optional edit_distance) +inline Regexps * get(const std::vector & patterns, std::optional edit_distance) { /// C++11 has thread-safe function-local static on most modern compilers. static Pool known_regexps; /// Different variables for different pattern parameters. std::vector str_patterns; str_patterns.reserve(patterns.size()); - for (const StringRef & ref : patterns) - str_patterns.push_back(ref.toString()); + for (const auto & pattern : patterns) + str_patterns.push_back(std::string(pattern.data(), pattern.size())); /// Get the lock for finding database. 
std::unique_lock lock(known_regexps.mutex); diff --git a/src/Functions/hyperscanRegexpChecker.cpp b/src/Functions/hyperscanRegexpChecker.cpp index b3c46e34daa..325817fa47e 100644 --- a/src/Functions/hyperscanRegexpChecker.cpp +++ b/src/Functions/hyperscanRegexpChecker.cpp @@ -9,16 +9,16 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -void checkRegexp(const std::vector & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) +void checkRegexp(const std::vector & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { if (max_hyperscan_regexp_length > 0 || max_hyperscan_regexp_total_length > 0) { size_t total_regexp_length = 0; - for (const auto & pattern : refs) + for (const auto & regexp : regexps) { - if (max_hyperscan_regexp_length > 0 && pattern.size > max_hyperscan_regexp_length) + if (max_hyperscan_regexp_length > 0 && regexp.size() > max_hyperscan_regexp_length) throw Exception("Regexp length too large", ErrorCodes::BAD_ARGUMENTS); - total_regexp_length += pattern.size; + total_regexp_length += regexp.size(); } if (max_hyperscan_regexp_total_length > 0 && total_regexp_length > max_hyperscan_regexp_total_length) diff --git a/src/Functions/hyperscanRegexpChecker.h b/src/Functions/hyperscanRegexpChecker.h index 1eea53722c1..ee8cc7d3954 100644 --- a/src/Functions/hyperscanRegexpChecker.h +++ b/src/Functions/hyperscanRegexpChecker.h @@ -5,6 +5,6 @@ namespace DB { -void checkRegexp(const std::vector & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length); +void checkRegexp(const std::vector & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length); } From 2ebfd01c2ee712ebe597daa345932b971587b9ca Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:20:03 +0200 Subject: [PATCH 33/60] Cosmetics: Pull out settings variable --- src/Functions/FunctionsMultiStringFuzzySearch.h | 10 +++++----- src/Functions/FunctionsMultiStringSearch.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 3c2c0e523ff..6c016c9d6cb 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -36,12 +36,12 @@ public: static constexpr auto name = Impl::name; static FunctionPtr create(ContextPtr context) { - if (Impl::is_using_hyperscan && !context->getSettingsRef().allow_hyperscan) - throw Exception( - "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + const auto & settings = context->getSettingsRef(); - return std::make_shared( - context->getSettingsRef().max_hyperscan_regexp_length, context->getSettingsRef().max_hyperscan_regexp_total_length); + if (Impl::is_using_hyperscan && !settings.allow_hyperscan) + throw Exception("Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + + return std::make_shared(settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); } FunctionsMultiStringFuzzySearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 6ab85e632e4..0286e119608 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -49,12 +49,12 @@ public: 
static constexpr auto name = Impl::name; static FunctionPtr create(ContextPtr context) { - if (Impl::is_using_hyperscan && !context->getSettingsRef().allow_hyperscan) - throw Exception( - "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + const auto & settings = context->getSettingsRef(); - return std::make_shared( - context->getSettingsRef().max_hyperscan_regexp_length, context->getSettingsRef().max_hyperscan_regexp_total_length); + if (Impl::is_using_hyperscan && !settings.allow_hyperscan) + throw Exception("Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + + return std::make_shared(settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); } FunctionsMultiStringSearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) From 072e0855a82fc3f07ac77e7c89bc82eee6ea3d18 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:21:32 +0200 Subject: [PATCH 34/60] Cosmetics: Make member variables const --- src/Functions/FunctionsMultiStringFuzzySearch.h | 4 ++-- src/Functions/FunctionsMultiStringSearch.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 6c016c9d6cb..fa09d54a75c 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -143,8 +143,8 @@ public: } private: - size_t max_hyperscan_regexp_length; - size_t max_hyperscan_regexp_total_length; + const size_t max_hyperscan_regexp_length; + const size_t max_hyperscan_regexp_total_length; }; } diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 0286e119608..a5e27727545 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -134,8 +134,8 @@ public: } private: - size_t max_hyperscan_regexp_length; - size_t max_hyperscan_regexp_total_length; + const size_t max_hyperscan_regexp_length; + const size_t max_hyperscan_regexp_total_length; }; } From e5c74a14f7b5d21da3be38727166c5279a2032a6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:24:17 +0200 Subject: [PATCH 35/60] Cosmetics: More consistent naming - rename utility function and file to "checkHyperscanRegexp" --- src/Functions/FunctionsMultiStringFuzzySearch.h | 4 ++-- src/Functions/FunctionsMultiStringSearch.h | 4 ++-- ...rscanRegexpChecker.cpp => checkHyperscanRegexp.cpp} | 4 ++-- src/Functions/checkHyperscanRegexp.h | 10 ++++++++++ src/Functions/hyperscanRegexpChecker.h | 10 ---------- 5 files changed, 16 insertions(+), 16 deletions(-) rename src/Functions/{hyperscanRegexpChecker.cpp => checkHyperscanRegexp.cpp} (79%) create mode 100644 src/Functions/checkHyperscanRegexp.h delete mode 100644 src/Functions/hyperscanRegexpChecker.h diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index fa09d54a75c..cc45a407099 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include @@ -121,7 +121,7 @@ public: refs.emplace_back(el.get()); if (Impl::is_using_hyperscan) - checkRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + checkHyperscanRegexp(refs, 
max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index a5e27727545..76c56fa0030 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include @@ -113,7 +113,7 @@ public: refs.emplace_back(el.get()); if (Impl::is_using_hyperscan) - checkRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + checkHyperscanRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); diff --git a/src/Functions/hyperscanRegexpChecker.cpp b/src/Functions/checkHyperscanRegexp.cpp similarity index 79% rename from src/Functions/hyperscanRegexpChecker.cpp rename to src/Functions/checkHyperscanRegexp.cpp index 325817fa47e..ba9705208d5 100644 --- a/src/Functions/hyperscanRegexpChecker.cpp +++ b/src/Functions/checkHyperscanRegexp.cpp @@ -1,4 +1,4 @@ -#include +#include #include @@ -9,7 +9,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -void checkRegexp(const std::vector & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) +void checkHyperscanRegexp(const std::vector & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { if (max_hyperscan_regexp_length > 0 || max_hyperscan_regexp_total_length > 0) { diff --git a/src/Functions/checkHyperscanRegexp.h b/src/Functions/checkHyperscanRegexp.h new file mode 100644 index 00000000000..2aac44115fc --- /dev/null +++ b/src/Functions/checkHyperscanRegexp.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + +void checkHyperscanRegexp(const std::vector & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length); + +} diff --git a/src/Functions/hyperscanRegexpChecker.h b/src/Functions/hyperscanRegexpChecker.h deleted file mode 100644 index ee8cc7d3954..00000000000 --- a/src/Functions/hyperscanRegexpChecker.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -void checkRegexp(const std::vector & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length); - -} From 1273756911cd4d83253ca11a281d73aa2c83bb91 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:33:34 +0200 Subject: [PATCH 36/60] Cosmetics: fmt-based exceptions --- .../FunctionsMultiStringFuzzySearch.h | 34 +++++++------------ src/Functions/FunctionsMultiStringSearch.h | 24 ++++++------- 2 files changed, 22 insertions(+), 36 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index cc45a407099..1d6b8cfeffd 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -39,15 +39,14 @@ public: const auto & settings = context->getSettingsRef(); if (Impl::is_using_hyperscan && !settings.allow_hyperscan) - throw Exception("Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); return std::make_shared(settings.max_hyperscan_regexp_length, 
settings.max_hyperscan_regexp_total_length); } FunctionsMultiStringFuzzySearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) : max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) - { - } + {} String getName() const override { return name; } @@ -59,17 +58,15 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception( - "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); if (!isUnsignedInteger(arguments[1])) - throw Exception( - "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[1]->getName(), getName()); const DataTypeArray * array_type = checkAndGetDataType(arguments[2].get()); if (!array_type || !checkAndGetDataType(array_type->getNestedType().get())) - throw Exception( - "Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[2]->getName(), getName()); + return Impl::getReturnType(); } @@ -92,27 +89,20 @@ public: else if ((col_const_num = checkAndGetColumnConst(num_ptr.get()))) edit_distance = col_const_num->getValue(); else - throw Exception( - "Illegal column " + arguments[1].column->getName() - + ". The number is not const or does not fit in UInt32", - ErrorCodes::ILLEGAL_COLUMN); - + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The number is not const or does not fit in UInt32", arguments[1].column->getName()); const ColumnPtr & arr_ptr = arguments[2].column; const ColumnConst * col_const_arr = checkAndGetColumnConst(arr_ptr.get()); if (!col_const_arr) - throw Exception( - "Illegal column " + arguments[2].column->getName() + ". The array is not const", - ErrorCodes::ILLEGAL_COLUMN); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. 
The array is not const", arguments[2].column->getName()); Array src_arr = col_const_arr->getValue(); if (src_arr.size() > LimitArgs) - throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(src_arr.size()) - + ", should be at most " + std::to_string(LimitArgs), - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at most {}", + getName(), std::to_string(src_arr.size()), std::to_string(LimitArgs)); std::vector refs; refs.reserve(src_arr.size()); @@ -134,7 +124,7 @@ public: Impl::vectorConstant( col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance); else - throw Exception("Illegal column " + arguments[0].column->getName(), ErrorCodes::ILLEGAL_COLUMN); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arguments[0].column->getName()); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 76c56fa0030..53eda533cb6 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -52,15 +52,14 @@ public: const auto & settings = context->getSettingsRef(); if (Impl::is_using_hyperscan && !settings.allow_hyperscan) - throw Exception("Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); return std::make_shared(settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); } FunctionsMultiStringSearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) : max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) - { - } + {} String getName() const override { return name; } @@ -72,13 +71,12 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception( - "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); const DataTypeArray * array_type = checkAndGetDataType(arguments[1].get()); if (!array_type || !checkAndGetDataType(array_type->getNestedType().get())) - throw Exception( - "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[1]->getName(), getName()); + return Impl::getReturnType(); } @@ -94,17 +92,15 @@ public: const ColumnConst * col_const_arr = checkAndGetColumnConst(arr_ptr.get()); if (!col_const_arr) - throw Exception( - "Illegal column " + arguments[1].column->getName() + ". The array is not const", - ErrorCodes::ILLEGAL_COLUMN); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. 
The array is not const", arguments[1].column->getName()); Array src_arr = col_const_arr->getValue(); if (src_arr.size() > LimitArgs) throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(src_arr.size()) - + ", should be at most " + std::to_string(LimitArgs), - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at most {}", + getName(), std::to_string(src_arr.size()), std::to_string(LimitArgs)); std::vector refs; refs.reserve(src_arr.size()); @@ -125,7 +121,7 @@ public: if (col_haystack_vector) Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res); else - throw Exception("Illegal column " + arguments[0].column->getName(), ErrorCodes::ILLEGAL_COLUMN); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arguments[0].column->getName()); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); From 4bc59c18e38bb7a42df2ff9be1c79cf9d1867d57 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:34:40 +0200 Subject: [PATCH 37/60] Cosmetics: Move some code around + docs + whitespaces + minor stuff --- .../FunctionsMultiStringFuzzySearch.h | 9 ++++--- src/Functions/FunctionsMultiStringSearch.h | 11 +++++---- src/Functions/MultiMatchAllIndicesImpl.h | 24 ++++++++----------- src/Functions/MultiMatchAnyImpl.h | 18 +++++++------- src/Functions/Regexps.h | 8 +++---- 5 files changed, 34 insertions(+), 36 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 1d6b8cfeffd..b0f2de96622 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -72,13 +72,12 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - using ResultType = typename Impl::ResultType; - const ColumnPtr & column_haystack = arguments[0].column; + const ColumnPtr & num_ptr = arguments[1].column; + const ColumnPtr & arr_ptr = arguments[2].column; const ColumnString * col_haystack_vector = checkAndGetColumn(&*column_haystack); - const ColumnPtr & num_ptr = arguments[1].column; const ColumnConst * col_const_num = nullptr; UInt32 edit_distance = 0; @@ -91,7 +90,6 @@ public: else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The number is not const or does not fit in UInt32", arguments[1].column->getName()); - const ColumnPtr & arr_ptr = arguments[2].column; const ColumnConst * col_const_arr = checkAndGetColumnConst(arr_ptr.get()); if (!col_const_arr) @@ -113,13 +111,14 @@ public: if (Impl::is_using_hyperscan) checkHyperscanRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + using ResultType = typename Impl::ResultType; auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); auto & vec_res = col_res->getData(); auto & offsets_res = col_offsets->getData(); - /// The blame for resizing output is for the callee. 
+ // the implementations are responsible for resizing the output column if (col_haystack_vector) Impl::vectorConstant( col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 53eda533cb6..e3e14458764 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -18,6 +18,10 @@ namespace DB { /** + * multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) + * multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) + * multiMatchAllIndices(haystack, [pattern_1, pattern_2, ..., pattern_n]) + * * multiSearchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1 * multiSearchAnyUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) * multiSearchAnyCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n]) @@ -82,13 +86,11 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - using ResultType = typename Impl::ResultType; - const ColumnPtr & column_haystack = arguments[0].column; + const ColumnPtr & arr_ptr = arguments[1].column; const ColumnString * col_haystack_vector = checkAndGetColumn(&*column_haystack); - const ColumnPtr & arr_ptr = arguments[1].column; const ColumnConst * col_const_arr = checkAndGetColumnConst(arr_ptr.get()); if (!col_const_arr) @@ -111,13 +113,14 @@ public: if (Impl::is_using_hyperscan) checkHyperscanRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + using ResultType = typename Impl::ResultType; auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); auto & vec_res = col_res->getData(); auto & offsets_res = col_offsets->getData(); - /// The blame for resizing output is for the callee. + // the implementations are responsible for resizing the output column if (col_haystack_vector) Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res); else diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index a9479a9d7f5..c77fd5fbbcc 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -26,10 +26,11 @@ namespace ErrorCodes } -template +template struct MultiMatchAllIndicesImpl { - using ResultType = Type; + using ResultType = ResultType_; + static constexpr bool is_using_hyperscan = true; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. 
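The next hunk of MultiMatchAllIndicesImpl.h marks the vectorConstant() parameters [[maybe_unused]] and drops the (void) casts that previously sat in the branch taken when vectorscan is not compiled in. The standalone sketch below shows the two equivalent ways of keeping -Wunused-parameter quiet under conditional compilation; the USE_FEATURE macro and function names are illustrative, not taken from the patch.

    #include <cstdio>

    // Old style: leave the parameter unannotated and cast it to void in the fallback branch.
    void scanOldStyle(int edit_distance)
    {
    #if defined(USE_FEATURE)
        std::printf("scanning with edit distance %d\n", edit_distance);
    #else
        (void)edit_distance;   // silences -Wunused-parameter when the feature is compiled out
    #endif
    }

    // New style: annotate the parameter itself; the fallback branch needs no boilerplate.
    void scanNewStyle([[maybe_unused]] int edit_distance)
    {
    #if defined(USE_FEATURE)
        std::printf("scanning with edit distance %d\n", edit_distance);
    #endif
    }

    int main()
    {
        scanOldStyle(1);
        scanNewStyle(2);
        return 0;
    }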
@@ -45,18 +46,18 @@ struct MultiMatchAllIndicesImpl const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const std::vector & needles, - PaddedPODArray & res, + PaddedPODArray & res, PaddedPODArray & offsets) { vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); } static void vectorConstant( - const ColumnString::Chars & haystack_data, - const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, - PaddedPODArray & res, - PaddedPODArray & offsets, + [[maybe_unused]] const ColumnString::Chars & haystack_data, + [[maybe_unused]] const ColumnString::Offsets & haystack_offsets, + [[maybe_unused]] const std::vector & needles, + [[maybe_unused]] PaddedPODArray & res, + [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance) { offsets.resize(haystack_offsets.size()); @@ -76,7 +77,7 @@ struct MultiMatchAllIndicesImpl unsigned int /* flags */, void * context) -> int { - static_cast*>(context)->push_back(id); + static_cast*>(context)->push_back(id); return 0; }; const size_t haystack_offsets_size = haystack_offsets.size(); @@ -102,11 +103,6 @@ struct MultiMatchAllIndicesImpl offset = haystack_offsets[i]; } #else - (void)haystack_data; - (void)haystack_offsets; - (void)needles; - (void)res; - (void)offsets; throw Exception( "multi-search all indices is not implemented when vectorscan is off", ErrorCodes::NOT_IMPLEMENTED); diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index f8d790c98cc..b0a4d7d55e4 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -27,11 +27,13 @@ namespace ErrorCodes } -template +template struct MultiMatchAnyImpl { - static_assert(static_cast(FindAny) + static_cast(FindAnyIndex) == 1); - using ResultType = Type; + static_assert(FindAny ^ FindAnyIndex); + + using ResultType = ResultType_; + static constexpr bool is_using_hyperscan = true; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. @@ -47,7 +49,7 @@ struct MultiMatchAnyImpl const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const std::vector & needles, - PaddedPODArray & res, + PaddedPODArray & res, PaddedPODArray & offsets) { vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); @@ -57,7 +59,7 @@ struct MultiMatchAnyImpl const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const std::vector & needles, - PaddedPODArray & res, + PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance) { @@ -81,9 +83,9 @@ struct MultiMatchAnyImpl void * context) -> int { if constexpr (FindAnyIndex) - *reinterpret_cast(context) = id; + *reinterpret_cast(context) = id; else if constexpr (FindAny) - *reinterpret_cast(context) = 1; + *reinterpret_cast(context) = 1; /// Once we hit the callback, there is no need to search for others. 
return 1; }; @@ -110,7 +112,7 @@ struct MultiMatchAnyImpl offset = haystack_offsets[i]; } #else - /// Fallback if do not use vectorscan + // fallback if vectorscan is not compiled if constexpr (MultiSearchDistance) throw Exception( "Edit distance multi-search is not implemented when vectorscan is off", diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index a85c89774a2..87ad6577e15 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -167,9 +167,8 @@ struct Pool }; template -inline Regexps constructRegexps(const std::vector & str_patterns, std::optional edit_distance) +inline Regexps constructRegexps(const std::vector & str_patterns, [[maybe_unused]] std::optional edit_distance) { - (void)edit_distance; /// Common pointers std::vector patterns; std::vector flags; @@ -270,7 +269,7 @@ inline Regexps constructRegexps(const std::vector & str_patterns, std::o if (err != HS_SUCCESS) throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY); - return Regexps{db, scratch}; + return {db, scratch}; } /// If CompileForEditDistance is False, edit_distance must be nullopt @@ -280,8 +279,7 @@ inline Regexps constructRegexps(const std::vector & str_patterns, std::o template inline Regexps * get(const std::vector & patterns, std::optional edit_distance) { - /// C++11 has thread-safe function-local static on most modern compilers. - static Pool known_regexps; /// Different variables for different pattern parameters. + static Pool known_regexps; /// Different variables for different pattern parameters, thread-safe in C++11 std::vector str_patterns; str_patterns.reserve(patterns.size()); From 580d89477f5b5d6f84c0ce259b24b418dcf66503 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:42:42 +0200 Subject: [PATCH 38/60] Minimally faster performance --- src/Functions/Regexps.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index 87ad6577e15..3e12f7fa651 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -145,7 +145,7 @@ public: Regexps * operator()() { - std::unique_lock lock(mutex); + std::lock_guard lock(mutex); if (regexp) return &*regexp; regexp = constructor(); @@ -284,7 +284,7 @@ inline Regexps * get(const std::vector & patterns, std::option std::vector str_patterns; str_patterns.reserve(patterns.size()); for (const auto & pattern : patterns) - str_patterns.push_back(std::string(pattern.data(), pattern.size())); + str_patterns.emplace_back(std::string(pattern.data(), pattern.size())); /// Get the lock for finding database. std::unique_lock lock(known_regexps.mutex); From 89bfdd50bfa7b00304ed8805c08d4958fb231a89 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 15:44:42 +0200 Subject: [PATCH 39/60] Remove unnecessary check - getReturnTypeImpl() ensures that the haystack column has type "String" and we can simply assert that. 
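Because getReturnTypeImpl() has already rejected non-String haystack columns, the cast in executeImpl() can only fail on a logic error, so the diff below downgrades the runtime throw to an assert on the result of checkAndGetColumn. The sketch that follows illustrates that pattern in isolation; the IColumn/ColumnString stubs and dynamic_cast are simplified stand-ins for the real ClickHouse types and checkAndGetColumn, used here only for illustration.

    #include <cassert>

    struct IColumn { virtual ~IColumn() = default; };
    struct ColumnString : IColumn { };

    // The argument types were validated earlier (analogous to getReturnTypeImpl),
    // so a failed cast here would indicate a programming error, not bad user input.
    void executeOnValidatedColumn(const IColumn & column)
    {
        const auto * col_string = dynamic_cast<const ColumnString *>(&column);
        assert(col_string && "haystack column type was checked during argument validation");
        (void)col_string;   // real code would read the string data here
    }

    int main()
    {
        ColumnString haystack;
        executeOnValidatedColumn(haystack);
        return 0;
    }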
--- src/Functions/FunctionsMultiStringFuzzySearch.h | 10 +++------- src/Functions/FunctionsMultiStringSearch.h | 9 +++------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index b0f2de96622..822c513b0d7 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -49,7 +49,6 @@ public: {} String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 3; } bool useDefaultImplementationForConstants() const override { return true; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } @@ -77,6 +76,7 @@ public: const ColumnPtr & arr_ptr = arguments[2].column; const ColumnString * col_haystack_vector = checkAndGetColumn(&*column_haystack); + assert(col_haystack_vector); // getReturnTypeImpl() checks the data type const ColumnConst * col_const_num = nullptr; UInt32 edit_distance = 0; @@ -117,13 +117,9 @@ public: auto & vec_res = col_res->getData(); auto & offsets_res = col_offsets->getData(); - // the implementations are responsible for resizing the output column - if (col_haystack_vector) - Impl::vectorConstant( - col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance); - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arguments[0].column->getName()); + + Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index e3e14458764..0b2e68099d7 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -66,7 +66,6 @@ public: {} String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 2; } bool useDefaultImplementationForConstants() const override { return true; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } @@ -90,6 +89,7 @@ public: const ColumnPtr & arr_ptr = arguments[1].column; const ColumnString * col_haystack_vector = checkAndGetColumn(&*column_haystack); + assert(col_haystack_vector); // getReturnTypeImpl() checks the data type const ColumnConst * col_const_arr = checkAndGetColumnConst(arr_ptr.get()); @@ -119,12 +119,9 @@ public: auto & vec_res = col_res->getData(); auto & offsets_res = col_offsets->getData(); - // the implementations are responsible for resizing the output column - if (col_haystack_vector) - Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res); - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arguments[0].column->getName()); + + Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); From 7913edc172abaeedb0d3267e732f0272ad95db85 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 16:12:38 +0200 Subject: [PATCH 40/60] Move check for hyperscan regexp constraints into implementations - This 
is preparation for non-const regexp arguments, where this check will run for each row. --- src/Functions/FunctionsMultiStringFuzzySearch.h | 8 +++----- src/Functions/FunctionsMultiStringSearch.h | 8 +++----- src/Functions/MultiMatchAllIndicesImpl.h | 16 ++++++++++++---- src/Functions/MultiMatchAnyImpl.h | 14 +++++++++++--- src/Functions/MultiSearchFirstIndexImpl.h | 4 +++- src/Functions/MultiSearchFirstPositionImpl.h | 4 +++- src/Functions/MultiSearchImpl.h | 4 +++- 7 files changed, 38 insertions(+), 20 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 822c513b0d7..514b0b0189a 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -108,9 +107,6 @@ public: for (const auto & el : src_arr) refs.emplace_back(el.get()); - if (Impl::is_using_hyperscan) - checkHyperscanRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); - using ResultType = typename Impl::ResultType; auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); @@ -119,7 +115,9 @@ public: auto & offsets_res = col_offsets->getData(); // the implementations are responsible for resizing the output column - Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance); + Impl::vectorConstant( + col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance, + max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 0b2e68099d7..8af5689c4a6 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -110,9 +109,6 @@ public: for (const auto & el : src_arr) refs.emplace_back(el.get()); - if (Impl::is_using_hyperscan) - checkHyperscanRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); - using ResultType = typename Impl::ResultType; auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); @@ -121,7 +117,9 @@ public: auto & offsets_res = col_offsets->getData(); // the implementations are responsible for resizing the output column - Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res); + Impl::vectorConstant( + col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, + max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index c77fd5fbbcc..cb8d29723a6 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "Regexps.h" #include "config_functions.h" @@ -47,9 +48,11 @@ struct MultiMatchAllIndicesImpl const ColumnString::Offsets & haystack_offsets, const std::vector & needles, PaddedPODArray & res, - PaddedPODArray & offsets) + 
PaddedPODArray & offsets, + size_t max_hyperscan_regexp_length, + size_t max_hyperscan_regexp_total_length) { - vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); + vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); } static void vectorConstant( @@ -58,10 +61,15 @@ struct MultiMatchAllIndicesImpl [[maybe_unused]] const std::vector & needles, [[maybe_unused]] PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, - [[maybe_unused]] std::optional edit_distance) + [[maybe_unused]] std::optional edit_distance, + [[maybe_unused]] size_t max_hyperscan_regexp_length, + [[maybe_unused]] size_t max_hyperscan_regexp_total_length) { - offsets.resize(haystack_offsets.size()); #if USE_VECTORSCAN + checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + + offsets.resize(haystack_offsets.size()); + const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); hs_scratch_t * scratch = nullptr; hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch); diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index b0a4d7d55e4..349078b9d4a 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "Regexps.h" #include "config_functions.h" @@ -50,9 +51,11 @@ struct MultiMatchAnyImpl const ColumnString::Offsets & haystack_offsets, const std::vector & needles, PaddedPODArray & res, - PaddedPODArray & offsets) + PaddedPODArray & offsets, + size_t max_hyperscan_regexp_length, + size_t max_hyperscan_regexp_total_length) { - vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); + vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); } static void vectorConstant( @@ -61,10 +64,15 @@ struct MultiMatchAnyImpl const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, - [[maybe_unused]] std::optional edit_distance) + [[maybe_unused]] std::optional edit_distance, + size_t max_hyperscan_regexp_length, + size_t max_hyperscan_regexp_total_length) { (void)FindAny; (void)FindAnyIndex; + + checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + res.resize(haystack_offsets.size()); #if USE_VECTORSCAN const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index e452ac03902..b04ce157583 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -24,7 +24,9 @@ struct MultiSearchFirstIndexImpl const ColumnString::Offsets & haystack_offsets, const std::vector & needles, PaddedPODArray & res, - [[maybe_unused]] PaddedPODArray & offsets) + [[maybe_unused]] PaddedPODArray & offsets, + size_t /*max_hyperscan_regexp_length*/, + size_t /*max_hyperscan_regexp_total_length*/) { auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 1aa89b54a1e..5bfc13be10b 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -24,7 
+24,9 @@ struct MultiSearchFirstPositionImpl const ColumnString::Offsets & haystack_offsets, const std::vector & needles, PaddedPODArray & res, - [[maybe_unused]] PaddedPODArray & offsets) + [[maybe_unused]] PaddedPODArray & offsets, + size_t /*max_hyperscan_regexp_length*/, + size_t /*max_hyperscan_regexp_total_length*/) { auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 { diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index 91c3d71fdb9..ac198b16765 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -24,7 +24,9 @@ struct MultiSearchImpl const ColumnString::Offsets & haystack_offsets, const std::vector & needles, PaddedPODArray & res, - [[maybe_unused]] PaddedPODArray & offsets) + [[maybe_unused]] PaddedPODArray & offsets, + size_t /*max_hyperscan_regexp_length*/, + size_t /*max_hyperscan_regexp_total_length*/) { auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); From 3478db9fb6a81e4fcbd698cfca94d5508b709103 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 Jun 2022 16:42:39 +0200 Subject: [PATCH 41/60] Move check for regexp array size into implementations - This is not needed for non-const regexp array arguments (the cardinality of arrays is fixed per column) but it cleans up the code and runs the check only in functions which have restrictions on the number of patterns. - For functions using hyperscans, it was checked that the number of regexes is < 2^32. Removed the check because I don't think anyone will every specify 4 billion patterns. --- src/Functions/FunctionsMultiStringFuzzySearch.h | 10 +--------- src/Functions/FunctionsMultiStringSearch.h | 13 +------------ src/Functions/MultiSearchFirstIndexImpl.h | 11 +++++++++++ src/Functions/MultiSearchImpl.h | 11 +++++++++++ src/Functions/multiFuzzyMatchAllIndices.cpp | 4 +--- src/Functions/multiFuzzyMatchAny.cpp | 4 +--- src/Functions/multiFuzzyMatchAnyIndex.cpp | 4 +--- src/Functions/multiMatchAllIndices.cpp | 4 +--- src/Functions/multiMatchAny.cpp | 4 +--- src/Functions/multiMatchAnyIndex.cpp | 4 +--- src/Functions/multiSearchAnyCaseInsensitive.cpp | 3 +-- src/Functions/multiSearchFirstIndex.cpp | 3 +-- 12 files changed, 32 insertions(+), 43 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 514b0b0189a..b83b5c4192b 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -21,16 +21,13 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int FUNCTION_NOT_ALLOWED; } -template +template class FunctionsMultiStringFuzzySearch : public IFunction { - static_assert(LimitArgs > 0); - public: static constexpr auto name = Impl::name; static FunctionPtr create(ContextPtr context) @@ -96,11 +93,6 @@ public: Array src_arr = col_const_arr->getValue(); - if (src_arr.size() > LimitArgs) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be at most {}", - getName(), std::to_string(src_arr.size()), std::to_string(LimitArgs)); - std::vector refs; refs.reserve(src_arr.size()); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 8af5689c4a6..8b48f8f9eaa 100644 --- 
a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -36,18 +36,13 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int FUNCTION_NOT_ALLOWED; } -/// The argument limiting raises from Volnitsky searcher -- it is performance crucial to save only one byte for pattern number. -/// But some other searchers use this function, for example, multiMatchAny -- hyperscan does not have such restrictions -template ::max()> +template class FunctionsMultiStringSearch : public IFunction { - static_assert(LimitArgs > 0); - public: static constexpr auto name = Impl::name; static FunctionPtr create(ContextPtr context) @@ -97,12 +92,6 @@ public: Array src_arr = col_const_arr->getValue(); - if (src_arr.size() > LimitArgs) - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be at most {}", - getName(), std::to_string(src_arr.size()), std::to_string(LimitArgs)); - std::vector refs; refs.reserve(src_arr.size()); diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index b04ce157583..4f01c45bdf4 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -7,6 +7,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + template struct MultiSearchFirstIndexImpl { @@ -28,6 +33,12 @@ struct MultiSearchFirstIndexImpl size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) { + // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. + if (needles.size() > std::numeric_limits::max()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at most {}", + name, std::to_string(needles.size()), std::to_string(std::numeric_limits::max())); + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); res.resize(haystack_string_size); diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index ac198b16765..0c951f3c5fc 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -7,6 +7,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + template struct MultiSearchImpl { @@ -28,6 +33,12 @@ struct MultiSearchImpl size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) { + // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. 
+ if (needles.size() > std::numeric_limits::max()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at most {}", + name, std::to_string(needles.size()), std::to_string(std::numeric_limits::max())); + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); res.resize(haystack_string_size); diff --git a/src/Functions/multiFuzzyMatchAllIndices.cpp b/src/Functions/multiFuzzyMatchAllIndices.cpp index d0121ee3981..2c4ac05ddbe 100644 --- a/src/Functions/multiFuzzyMatchAllIndices.cpp +++ b/src/Functions/multiFuzzyMatchAllIndices.cpp @@ -13,9 +13,7 @@ struct NameMultiFuzzyMatchAllIndices static constexpr auto name = "multiFuzzyMatchAllIndices"; }; -using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch< - MultiMatchAllIndicesImpl, - std::numeric_limits::max()>; +using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch>; } diff --git a/src/Functions/multiFuzzyMatchAny.cpp b/src/Functions/multiFuzzyMatchAny.cpp index 640e93a23b0..fbd84ead31b 100644 --- a/src/Functions/multiFuzzyMatchAny.cpp +++ b/src/Functions/multiFuzzyMatchAny.cpp @@ -13,9 +13,7 @@ struct NameMultiFuzzyMatchAny static constexpr auto name = "multiFuzzyMatchAny"; }; -using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch< - MultiMatchAnyImpl, - std::numeric_limits::max()>; +using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch>; } diff --git a/src/Functions/multiFuzzyMatchAnyIndex.cpp b/src/Functions/multiFuzzyMatchAnyIndex.cpp index f8bad1bc461..255f710e2b3 100644 --- a/src/Functions/multiFuzzyMatchAnyIndex.cpp +++ b/src/Functions/multiFuzzyMatchAnyIndex.cpp @@ -13,9 +13,7 @@ struct NameMultiFuzzyMatchAnyIndex static constexpr auto name = "multiFuzzyMatchAnyIndex"; }; -using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch< - MultiMatchAnyImpl, - std::numeric_limits::max()>; +using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch>; } diff --git a/src/Functions/multiMatchAllIndices.cpp b/src/Functions/multiMatchAllIndices.cpp index 940c9e7e3bf..1c6577e5aa3 100644 --- a/src/Functions/multiMatchAllIndices.cpp +++ b/src/Functions/multiMatchAllIndices.cpp @@ -13,9 +13,7 @@ struct NameMultiMatchAllIndices static constexpr auto name = "multiMatchAllIndices"; }; -using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch< - MultiMatchAllIndicesImpl, - std::numeric_limits::max()>; +using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch>; } diff --git a/src/Functions/multiMatchAny.cpp b/src/Functions/multiMatchAny.cpp index 47510e0ecc2..4920e9c230f 100644 --- a/src/Functions/multiMatchAny.cpp +++ b/src/Functions/multiMatchAny.cpp @@ -13,9 +13,7 @@ struct NameMultiMatchAny static constexpr auto name = "multiMatchAny"; }; -using FunctionMultiMatchAny = FunctionsMultiStringSearch< - MultiMatchAnyImpl, - std::numeric_limits::max()>; +using FunctionMultiMatchAny = FunctionsMultiStringSearch>; } diff --git a/src/Functions/multiMatchAnyIndex.cpp b/src/Functions/multiMatchAnyIndex.cpp index a56d41dc95b..bf68e8576a5 100644 --- a/src/Functions/multiMatchAnyIndex.cpp +++ b/src/Functions/multiMatchAnyIndex.cpp @@ -13,9 +13,7 @@ struct NameMultiMatchAnyIndex static constexpr auto name = "multiMatchAnyIndex"; }; -using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch< - MultiMatchAnyImpl, - std::numeric_limits::max()>; +using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch>; } 
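
The one-byte limit enforced by the new checks in MultiSearchImpl and MultiSearchFirstIndexImpl comes from how a Volnitsky-style multi-searcher stores the pattern number. A rough, self-contained illustration (firstIndexOf is a hypothetical helper, not the real searcher, which hashes needle bigrams instead of calling find() per needle):

    #include <cstdint>
    #include <limits>
    #include <stdexcept>
    #include <string_view>
    #include <vector>

    /// Returns the 1-based index of the needle with the leftmost match, or 0 if none matches.
    uint64_t firstIndexOf(std::string_view haystack, const std::vector<std::string_view> & needles)
    {
        if (needles.size() > std::numeric_limits<uint8_t>::max())
            throw std::invalid_argument("the needle index must fit into one byte, so at most 255 needles are supported");

        size_t best_pos = std::string_view::npos;
        uint8_t best_id = 0;  /// one byte is enough once the check above has passed
        for (size_t i = 0; i < needles.size(); ++i)
        {
            size_t pos = haystack.find(needles[i]);
            if (pos != std::string_view::npos && pos < best_pos)
            {
                best_pos = pos;
                best_id = static_cast<uint8_t>(i + 1);
            }
        }
        return best_id;
    }

The hyperscan-based functions had a much looser 2^32 limit, which this patch simply drops as impractical to ever hit.
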
diff --git a/src/Functions/multiSearchAnyCaseInsensitive.cpp b/src/Functions/multiSearchAnyCaseInsensitive.cpp index 9bc950c0d3d..af463805ea5 100644 --- a/src/Functions/multiSearchAnyCaseInsensitive.cpp +++ b/src/Functions/multiSearchAnyCaseInsensitive.cpp @@ -13,8 +13,7 @@ struct NameMultiSearchAnyCaseInsensitive { static constexpr auto name = "multiSearchAnyCaseInsensitive"; }; -using FunctionMultiSearchCaseInsensitive - = FunctionsMultiStringSearch>; +using FunctionMultiSearchCaseInsensitive = FunctionsMultiStringSearch>; } diff --git a/src/Functions/multiSearchFirstIndex.cpp b/src/Functions/multiSearchFirstIndex.cpp index a96ebed029c..a5fbec2dc36 100644 --- a/src/Functions/multiSearchFirstIndex.cpp +++ b/src/Functions/multiSearchFirstIndex.cpp @@ -14,8 +14,7 @@ struct NameMultiSearchFirstIndex static constexpr auto name = "multiSearchFirstIndex"; }; -using FunctionMultiSearchFirstIndex - = FunctionsMultiStringSearch>; +using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch>; } From 2f15d45f27b8d9d8d32b557decc2ab9030032308 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 26 Jun 2022 16:25:43 +0000 Subject: [PATCH 42/60] Move check for regexp array size into implementations - This is not needed for non-const regexp array arguments (the cardinality of arrays is fixed per column) but it cleans up the code and runs the check only in functions which have restrictions on the number of patterns. - For functions using hyperscans, it was checked that the number of regexes is < 2^32. Removed the check because I don't think anyone will every specify 4 billion patterns. --- src/Functions/MultiSearchFirstPositionImpl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 5bfc13be10b..2260950c5d4 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -7,6 +7,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + template struct MultiSearchFirstPositionImpl { @@ -28,6 +33,12 @@ struct MultiSearchFirstPositionImpl size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) { + // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. + if (needles.size() > std::numeric_limits::max()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at most {}", + name, std::to_string(needles.size()), std::to_string(std::numeric_limits::max())); + auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 { return 1 + Impl::countChars(reinterpret_cast(start), reinterpret_cast(end)); From c9ce0efa662b4af905450aa2a0db17cd7609ed89 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 25 Jun 2022 17:28:15 +0200 Subject: [PATCH 43/60] Instantiate MultiMatchAnyImpl template using enums - With this, invalid combinations of the FindAny/FindAnyIndex bools are no longer possible and we can remove the corresponding check - Also makes the instantiations more readable. 
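
A side-by-side illustration of the refactoring described above (illustrative names, not the exact ClickHouse declarations): two mutually exclusive bool parameters need a static_assert to rule out the invalid combinations, while a single enum parameter makes them unrepresentable.

    #include <cstdint>

    // Before: two bools, and an assertion is needed to reject the two invalid combinations.
    template <typename ResultType, bool FindAny, bool FindAnyIndex>
    struct MatcherWithBools
    {
        static_assert(FindAny ^ FindAnyIndex, "exactly one mode must be selected");
    };

    // After: one enum parameter, invalid combinations cannot be expressed at all.
    struct MatchTraits
    {
        enum class Find { Any, AnyIndex };
    };

    template <typename ResultType, MatchTraits::Find Find>
    struct MatcherWithEnum
    {
        static constexpr bool find_any = (Find == MatchTraits::Find::Any);
        static constexpr bool find_any_index = (Find == MatchTraits::Find::AnyIndex);
    };

    // The instantiations read much like the new multiMatchAny/multiMatchAnyIndex aliases:
    using MatchAny      = MatcherWithEnum<uint8_t,  MatchTraits::Find::Any>;
    using MatchAnyIndex = MatcherWithEnum<uint64_t, MatchTraits::Find::AnyIndex>;

This is why the static_assert disappears in the hunks below.
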
--- src/Functions/MultiMatchAllIndicesImpl.h | 5 ++--- src/Functions/MultiMatchAnyImpl.h | 23 ++++++++++++++------- src/Functions/Regexps.h | 14 ++++++------- src/Functions/multiFuzzyMatchAllIndices.cpp | 2 +- src/Functions/multiFuzzyMatchAny.cpp | 2 +- src/Functions/multiFuzzyMatchAnyIndex.cpp | 2 +- src/Functions/multiMatchAllIndices.cpp | 2 +- src/Functions/multiMatchAny.cpp | 2 +- src/Functions/multiMatchAnyIndex.cpp | 2 +- 9 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index cb8d29723a6..3234782d1c1 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -27,7 +27,7 @@ namespace ErrorCodes } -template +template struct MultiMatchAllIndicesImpl { using ResultType = ResultType_; @@ -69,8 +69,7 @@ struct MultiMatchAllIndicesImpl checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); offsets.resize(haystack_offsets.size()); - - const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); + const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); hs_scratch_t * scratch = nullptr; hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch); diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 349078b9d4a..12b7f76d173 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -27,14 +27,24 @@ namespace ErrorCodes extern const int TOO_MANY_BYTES; } +// For more readable instantiations of MultiMatchAnyImpl<> +struct MultiMatchTraits +{ +enum class Find +{ + Any, + AnyIndex +}; +}; -template +template struct MultiMatchAnyImpl { - static_assert(FindAny ^ FindAnyIndex); - using ResultType = ResultType_; + static constexpr bool FindAny = (Find == MultiMatchTraits::Find::Any); + static constexpr bool FindAnyIndex = (Find == MultiMatchTraits::Find::AnyIndex); + static constexpr bool is_using_hyperscan = true; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. 
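
Since the hunks in this patch only show fragments of the vectorscan calls, here is a compact standalone sketch of the API they wrap: compile a pattern set once, allocate scratch space, then scan a block of text and receive pattern ids through a callback. It assumes the vectorscan headers are reachable as <hs.h>; error handling is abbreviated, and it is not ClickHouse code.

    #include <hs.h>

    #include <cstdio>
    #include <vector>

    int main()
    {
        const char * expressions[] = {"clickhouse", "hyper.can"};
        const unsigned int flags[] = {HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
                                      HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8};
        const unsigned int ids[] = {1, 2};

        hs_database_t * db = nullptr;
        hs_compile_error_t * compile_error = nullptr;
        if (hs_compile_multi(expressions, flags, ids, 2, HS_MODE_BLOCK, nullptr, &db, &compile_error) != HS_SUCCESS)
        {
            std::fprintf(stderr, "pattern compilation failed: %s\n", compile_error->message);
            hs_free_compile_error(compile_error);
            return 1;
        }

        hs_scratch_t * scratch = nullptr;
        if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS)
        {
            hs_free_database(db);
            return 1;
        }

        // The callback collects the ids of matched patterns, much like multiMatchAllIndices;
        // HS_FLAG_SINGLEMATCH guarantees each pattern is reported at most once.
        std::vector<unsigned int> matched_ids;
        auto on_match = [](unsigned int id, unsigned long long /*from*/, unsigned long long /*to*/,
                           unsigned int /*flags*/, void * context) -> int
        {
            static_cast<std::vector<unsigned int> *>(context)->push_back(id);
            return 0; // 0 = keep scanning; a non-zero return stops early, as multiMatchAny does
        };

        const char haystack[] = "clickhouse links against hyperscan";
        if (hs_scan(db, haystack, static_cast<unsigned int>(sizeof(haystack) - 1), 0, scratch, on_match, &matched_ids) != HS_SUCCESS)
            std::fprintf(stderr, "scan failed\n");

        for (unsigned int id : matched_ids)
            std::printf("matched pattern id %u\n", id);

        hs_free_scratch(scratch);
        hs_free_database(db);
        return 0;
    }

ClickHouse additionally clones the compiled database's scratch per query (hs_clone_scratch) instead of allocating fresh scratch, which is visible in the hunks around this sketch.
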
@@ -68,14 +78,11 @@ struct MultiMatchAnyImpl size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { - (void)FindAny; - (void)FindAnyIndex; - checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); res.resize(haystack_offsets.size()); #if USE_VECTORSCAN - const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); + const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); hs_scratch_t * scratch = nullptr; hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch); @@ -121,7 +128,7 @@ struct MultiMatchAnyImpl } #else // fallback if vectorscan is not compiled - if constexpr (MultiSearchDistance) + if constexpr (WithEditDistance) throw Exception( "Edit distance multi-search is not implemented when vectorscan is off", ErrorCodes::NOT_IMPLEMENTED); diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index 3e12f7fa651..b932b14a6a9 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -166,7 +166,7 @@ struct Pool std::map, std::optional>, RegexpsConstructor> storage; }; -template +template inline Regexps constructRegexps(const std::vector & str_patterns, [[maybe_unused]] std::optional edit_distance) { /// Common pointers @@ -180,7 +180,7 @@ inline Regexps constructRegexps(const std::vector & str_patterns, [[mayb patterns.reserve(str_patterns.size()); flags.reserve(str_patterns.size()); - if constexpr (CompileForEditDistance) + if constexpr (WithEditDistance) { ext_exprs.reserve(str_patterns.size()); ext_exprs_ptrs.reserve(str_patterns.size()); @@ -198,7 +198,7 @@ inline Regexps constructRegexps(const std::vector & str_patterns, [[mayb * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag */ flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8); - if constexpr (CompileForEditDistance) + if constexpr (WithEditDistance) { /// Hyperscan currently does not support UTF8 matching with edit distance. flags.back() &= ~HS_FLAG_UTF8; @@ -223,7 +223,7 @@ inline Regexps constructRegexps(const std::vector & str_patterns, [[mayb } hs_error_t err; - if constexpr (!CompileForEditDistance) + if constexpr (!WithEditDistance) err = hs_compile_multi( patterns.data(), flags.data(), @@ -272,11 +272,11 @@ inline Regexps constructRegexps(const std::vector & str_patterns, [[mayb return {db, scratch}; } -/// If CompileForEditDistance is False, edit_distance must be nullopt +/// If WithEditDistance is False, edit_distance must be nullopt /// Also, we use templates here because each instantiation of function /// template has its own copy of local static variables which must not be the same /// for different hyperscan compilations. 
-template +template inline Regexps * get(const std::vector & patterns, std::optional edit_distance) { static Pool known_regexps; /// Different variables for different pattern parameters, thread-safe in C++11 @@ -299,7 +299,7 @@ inline Regexps * get(const std::vector & patterns, std::option .first; it->second.setConstructor([&str_patterns = it->first.first, edit_distance]() { - return constructRegexps(str_patterns, edit_distance); + return constructRegexps(str_patterns, edit_distance); }); } diff --git a/src/Functions/multiFuzzyMatchAllIndices.cpp b/src/Functions/multiFuzzyMatchAllIndices.cpp index 2c4ac05ddbe..93ffb936dc1 100644 --- a/src/Functions/multiFuzzyMatchAllIndices.cpp +++ b/src/Functions/multiFuzzyMatchAllIndices.cpp @@ -13,7 +13,7 @@ struct NameMultiFuzzyMatchAllIndices static constexpr auto name = "multiFuzzyMatchAllIndices"; }; -using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch>; +using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch>; } diff --git a/src/Functions/multiFuzzyMatchAny.cpp b/src/Functions/multiFuzzyMatchAny.cpp index fbd84ead31b..a627030d7af 100644 --- a/src/Functions/multiFuzzyMatchAny.cpp +++ b/src/Functions/multiFuzzyMatchAny.cpp @@ -13,7 +13,7 @@ struct NameMultiFuzzyMatchAny static constexpr auto name = "multiFuzzyMatchAny"; }; -using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch>; +using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch>; } diff --git a/src/Functions/multiFuzzyMatchAnyIndex.cpp b/src/Functions/multiFuzzyMatchAnyIndex.cpp index 255f710e2b3..4b24a06e171 100644 --- a/src/Functions/multiFuzzyMatchAnyIndex.cpp +++ b/src/Functions/multiFuzzyMatchAnyIndex.cpp @@ -13,7 +13,7 @@ struct NameMultiFuzzyMatchAnyIndex static constexpr auto name = "multiFuzzyMatchAnyIndex"; }; -using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch>; +using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch>; } diff --git a/src/Functions/multiMatchAllIndices.cpp b/src/Functions/multiMatchAllIndices.cpp index 1c6577e5aa3..47bd57029e2 100644 --- a/src/Functions/multiMatchAllIndices.cpp +++ b/src/Functions/multiMatchAllIndices.cpp @@ -13,7 +13,7 @@ struct NameMultiMatchAllIndices static constexpr auto name = "multiMatchAllIndices"; }; -using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch>; +using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch>; } diff --git a/src/Functions/multiMatchAny.cpp b/src/Functions/multiMatchAny.cpp index 4920e9c230f..324e435de26 100644 --- a/src/Functions/multiMatchAny.cpp +++ b/src/Functions/multiMatchAny.cpp @@ -13,7 +13,7 @@ struct NameMultiMatchAny static constexpr auto name = "multiMatchAny"; }; -using FunctionMultiMatchAny = FunctionsMultiStringSearch>; +using FunctionMultiMatchAny = FunctionsMultiStringSearch>; } diff --git a/src/Functions/multiMatchAnyIndex.cpp b/src/Functions/multiMatchAnyIndex.cpp index bf68e8576a5..6a11fa4eb35 100644 --- a/src/Functions/multiMatchAnyIndex.cpp +++ b/src/Functions/multiMatchAnyIndex.cpp @@ -13,7 +13,7 @@ struct NameMultiMatchAnyIndex static constexpr auto name = "multiMatchAnyIndex"; }; -using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch>; +using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch>; } From c2cea38b97e2282277d02e27c6ba249ed50179b5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 25 Jun 2022 17:41:24 +0200 Subject: [PATCH 44/60] Move local variable into if statement --- src/Functions/FunctionsMultiStringFuzzySearch.h | 13 ++++++------- 1 file 
changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index b83b5c4192b..c161ffc50cd 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -74,15 +74,14 @@ public: const ColumnString * col_haystack_vector = checkAndGetColumn(&*column_haystack); assert(col_haystack_vector); // getReturnTypeImpl() checks the data type - const ColumnConst * col_const_num = nullptr; UInt32 edit_distance = 0; - if ((col_const_num = checkAndGetColumnConst(num_ptr.get()))) - edit_distance = col_const_num->getValue(); - else if ((col_const_num = checkAndGetColumnConst(num_ptr.get()))) - edit_distance = col_const_num->getValue(); - else if ((col_const_num = checkAndGetColumnConst(num_ptr.get()))) - edit_distance = col_const_num->getValue(); + if (const auto * col_const_uint8 = checkAndGetColumnConst(num_ptr.get())) + edit_distance = col_const_uint8->getValue(); + else if (const auto * col_const_uint16 = checkAndGetColumnConst(num_ptr.get())) + edit_distance = col_const_uint16->getValue(); + else if (const auto * col_const_uint32 = checkAndGetColumnConst(num_ptr.get())) + edit_distance = col_const_uint32->getValue(); else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The number is not const or does not fit in UInt32", arguments[1].column->getName()); From e2b11899a1c6371c70f375a3ac119bd5b73a43d8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 25 Jun 2022 17:53:11 +0200 Subject: [PATCH 45/60] Move check if cfg allows hyperscan into implementations - This is not needed for non-const regexp array arguments but cleans up the code and runs the check only in functions which actually use hyperscan. 
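
The resulting shape is easier to see outside the diff: the function object captures the relevant settings once when it is created, and both the allow_hyperscan guard and the per-pattern limits are enforced inside the implementation, next to the code that actually uses hyperscan. A reduced sketch with hypothetical names (Settings and MultiMatchFunction are stand-ins, and the length checks only approximate what checkHyperscanRegexp does, assuming 0 means "no limit"):

    #include <cstddef>
    #include <stdexcept>
    #include <string_view>
    #include <vector>

    struct Settings
    {
        bool allow_hyperscan = true;
        size_t max_hyperscan_regexp_length = 0;        // 0 is treated as "no limit" in this sketch
        size_t max_hyperscan_regexp_total_length = 0;
    };

    class MultiMatchFunction
    {
    public:
        explicit MultiMatchFunction(const Settings & settings)
            : allow_hyperscan(settings.allow_hyperscan)
            , max_regexp_length(settings.max_hyperscan_regexp_length)
            , max_regexp_total_length(settings.max_hyperscan_regexp_total_length)
        {}

        void execute(const std::vector<std::string_view> & needles) const
        {
            // The guard lives in the implementation, so search functions that never
            // use hyperscan (multiSearchAny and friends) simply ignore the setting.
            if (!allow_hyperscan)
                throw std::runtime_error("Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");

            size_t total_length = 0;
            for (auto needle : needles)
            {
                if (max_regexp_length && needle.size() > max_regexp_length)
                    throw std::runtime_error("Regexp length too large");
                total_length += needle.size();
            }
            if (max_regexp_total_length && total_length > max_regexp_total_length)
                throw std::runtime_error("Total regexp lengths too large");

            // ... compile the patterns with vectorscan and scan the haystacks ...
        }

    private:
        const bool allow_hyperscan;
        const size_t max_regexp_length;
        const size_t max_regexp_total_length;
    };

This is also what prepares the ground for non-const needle arguments later in the series: everything the per-row code needs is already a plain parameter of vectorConstant().
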
--- src/Functions/FunctionsMultiStringFuzzySearch.h | 16 ++++++++-------- src/Functions/FunctionsMultiStringSearch.h | 16 ++++++++-------- src/Functions/MultiMatchAllIndicesImpl.h | 10 +++++++--- src/Functions/MultiMatchAnyImpl.h | 11 ++++++++--- src/Functions/MultiSearchFirstIndexImpl.h | 2 +- src/Functions/MultiSearchFirstPositionImpl.h | 2 +- src/Functions/MultiSearchImpl.h | 2 +- 7 files changed, 34 insertions(+), 25 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index c161ffc50cd..33721dd19ef 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -30,18 +30,17 @@ class FunctionsMultiStringFuzzySearch : public IFunction { public: static constexpr auto name = Impl::name; + static FunctionPtr create(ContextPtr context) { const auto & settings = context->getSettingsRef(); - - if (Impl::is_using_hyperscan && !settings.allow_hyperscan) - throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); - - return std::make_shared(settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); + return std::make_shared(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); } - FunctionsMultiStringFuzzySearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) - : max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) + FunctionsMultiStringFuzzySearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) + : allow_hyperscan(allow_hyperscan_) + , max_hyperscan_regexp_length(max_hyperscan_regexp_length_) + , max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) {} String getName() const override { return name; } @@ -108,7 +107,7 @@ public: Impl::vectorConstant( col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance, - max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); @@ -117,6 +116,7 @@ public: } private: + const bool allow_hyperscan; const size_t max_hyperscan_regexp_length; const size_t max_hyperscan_regexp_total_length; }; diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 8b48f8f9eaa..b4ff354af41 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -45,18 +45,17 @@ class FunctionsMultiStringSearch : public IFunction { public: static constexpr auto name = Impl::name; + static FunctionPtr create(ContextPtr context) { const auto & settings = context->getSettingsRef(); - - if (Impl::is_using_hyperscan && !settings.allow_hyperscan) - throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); - - return std::make_shared(settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); + return std::make_shared(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); } - FunctionsMultiStringSearch(size_t max_hyperscan_regexp_length_, size_t 
max_hyperscan_regexp_total_length_) - : max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) + FunctionsMultiStringSearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) + : allow_hyperscan(allow_hyperscan_) + , max_hyperscan_regexp_length(max_hyperscan_regexp_length_) + , max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) {} String getName() const override { return name; } @@ -108,7 +107,7 @@ public: Impl::vectorConstant( col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, - max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); if constexpr (Impl::is_column_array) return ColumnArray::create(std::move(col_res), std::move(col_offsets)); @@ -117,6 +116,7 @@ public: } private: + const bool allow_hyperscan; const size_t max_hyperscan_regexp_length; const size_t max_hyperscan_regexp_total_length; }; diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index 3234782d1c1..97149b48df1 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -20,8 +20,9 @@ namespace DB namespace ErrorCodes { - extern const int HYPERSCAN_CANNOT_SCAN_TEXT; extern const int CANNOT_ALLOCATE_MEMORY; + extern const int FUNCTION_NOT_ALLOWED; + extern const int HYPERSCAN_CANNOT_SCAN_TEXT; extern const int NOT_IMPLEMENTED; extern const int TOO_MANY_BYTES; } @@ -32,7 +33,6 @@ struct MultiMatchAllIndicesImpl { using ResultType = ResultType_; - static constexpr bool is_using_hyperscan = true; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. 
static constexpr bool is_column_array = true; @@ -49,10 +49,11 @@ struct MultiMatchAllIndicesImpl const std::vector & needles, PaddedPODArray & res, PaddedPODArray & offsets, + bool allow_hyperscan, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { - vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); } static void vectorConstant( @@ -62,9 +63,12 @@ struct MultiMatchAllIndicesImpl [[maybe_unused]] PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance, + bool allow_hyperscan, [[maybe_unused]] size_t max_hyperscan_regexp_length, [[maybe_unused]] size_t max_hyperscan_regexp_total_length) { + if (!allow_hyperscan) + throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); #if USE_VECTORSCAN checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 12b7f76d173..956458e2953 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -21,8 +21,9 @@ namespace DB namespace ErrorCodes { - extern const int HYPERSCAN_CANNOT_SCAN_TEXT; extern const int CANNOT_ALLOCATE_MEMORY; + extern const int FUNCTION_NOT_ALLOWED; + extern const int HYPERSCAN_CANNOT_SCAN_TEXT; extern const int NOT_IMPLEMENTED; extern const int TOO_MANY_BYTES; } @@ -45,7 +46,6 @@ struct MultiMatchAnyImpl static constexpr bool FindAny = (Find == MultiMatchTraits::Find::Any); static constexpr bool FindAnyIndex = (Find == MultiMatchTraits::Find::AnyIndex); - static constexpr bool is_using_hyperscan = true; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. 
static constexpr bool is_column_array = false; @@ -62,10 +62,11 @@ struct MultiMatchAnyImpl const std::vector & needles, PaddedPODArray & res, PaddedPODArray & offsets, + bool allow_hyperscan, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { - vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); } static void vectorConstant( @@ -75,9 +76,13 @@ struct MultiMatchAnyImpl PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance, + bool allow_hyperscan, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { + if (!allow_hyperscan) + throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); + checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); res.resize(haystack_offsets.size()); diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index 4f01c45bdf4..3a853625b37 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -16,7 +16,6 @@ template struct MultiSearchFirstIndexImpl { using ResultType = UInt64; - static constexpr bool is_using_hyperscan = false; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. static constexpr bool is_column_array = false; @@ -30,6 +29,7 @@ struct MultiSearchFirstIndexImpl const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, + bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) { diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 2260950c5d4..87eb56c3b11 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -16,7 +16,6 @@ template struct MultiSearchFirstPositionImpl { using ResultType = UInt64; - static constexpr bool is_using_hyperscan = false; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. static constexpr bool is_column_array = false; @@ -30,6 +29,7 @@ struct MultiSearchFirstPositionImpl const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, + bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) { diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index 0c951f3c5fc..63ad41f1f7b 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -16,7 +16,6 @@ template struct MultiSearchImpl { using ResultType = UInt8; - static constexpr bool is_using_hyperscan = false; /// Variable for understanding, if we used offsets for the output, most /// likely to determine whether the function returns ColumnVector of ColumnArray. 
static constexpr bool is_column_array = false; @@ -30,6 +29,7 @@ struct MultiSearchImpl const std::vector & needles, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, + bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) { From b8f67185bf43e25005f697cdd0402479155bbeb3 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 25 Jun 2022 18:05:49 +0200 Subject: [PATCH 46/60] Cosmetics: Whitespaces --- src/Functions/FunctionsMultiStringFuzzySearch.h | 4 ---- src/Functions/FunctionsMultiStringSearch.h | 3 --- 2 files changed, 7 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 33721dd19ef..b42352d990e 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -74,7 +74,6 @@ public: assert(col_haystack_vector); // getReturnTypeImpl() checks the data type UInt32 edit_distance = 0; - if (const auto * col_const_uint8 = checkAndGetColumnConst(num_ptr.get())) edit_distance = col_const_uint8->getValue(); else if (const auto * col_const_uint16 = checkAndGetColumnConst(num_ptr.get())) @@ -85,15 +84,12 @@ public: throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The number is not const or does not fit in UInt32", arguments[1].column->getName()); const ColumnConst * col_const_arr = checkAndGetColumnConst(arr_ptr.get()); - if (!col_const_arr) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The array is not const", arguments[2].column->getName()); Array src_arr = col_const_arr->getValue(); - std::vector refs; refs.reserve(src_arr.size()); - for (const auto & el : src_arr) refs.emplace_back(el.get()); diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index b4ff354af41..1fe2e036a9a 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -85,15 +85,12 @@ public: assert(col_haystack_vector); // getReturnTypeImpl() checks the data type const ColumnConst * col_const_arr = checkAndGetColumnConst(arr_ptr.get()); - if (!col_const_arr) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. 
The array is not const", arguments[1].column->getName()); Array src_arr = col_const_arr->getValue(); - std::vector refs; refs.reserve(src_arr.size()); - for (const auto & el : src_arr) refs.emplace_back(el.get()); From cb5d1a4a856e3362a276f801de4c456e1908429f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 26 Jun 2022 15:45:52 +0000 Subject: [PATCH 47/60] Fix style check --- src/Functions/FunctionsMultiStringFuzzySearch.h | 1 - src/Functions/FunctionsMultiStringSearch.h | 1 - 2 files changed, 2 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index b42352d990e..b6428bb0f11 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -21,7 +21,6 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; - extern const int FUNCTION_NOT_ALLOWED; } diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 1fe2e036a9a..be51f412f06 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -36,7 +36,6 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; - extern const int FUNCTION_NOT_ALLOWED; } From 959cbaab02b5c4cb961a1621cfbd6aa7cc932c31 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 26 Jun 2022 16:12:17 +0000 Subject: [PATCH 48/60] Move loop over patterns into implementations - This is preparation for non-const regexp arguments, where this loop will run for each row. --- src/Functions/FunctionsMultiStringFuzzySearch.h | 9 ++------- src/Functions/FunctionsMultiStringSearch.h | 9 ++------- src/Functions/MultiMatchAllIndicesImpl.h | 12 +++++++++--- src/Functions/MultiMatchAnyImpl.h | 12 +++++++++--- src/Functions/MultiSearchFirstIndexImpl.h | 12 +++++++++--- src/Functions/MultiSearchFirstPositionImpl.h | 12 +++++++++--- src/Functions/MultiSearchImpl.h | 12 +++++++++--- 7 files changed, 49 insertions(+), 29 deletions(-) diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index b6428bb0f11..865a5d182c8 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -86,12 +86,6 @@ public: if (!col_const_arr) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. 
The array is not const", arguments[2].column->getName()); - Array src_arr = col_const_arr->getValue(); - std::vector refs; - refs.reserve(src_arr.size()); - for (const auto & el : src_arr) - refs.emplace_back(el.get()); - using ResultType = typename Impl::ResultType; auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); @@ -100,8 +94,9 @@ public: auto & offsets_res = col_offsets->getData(); // the implementations are responsible for resizing the output column + Array needles_arr = col_const_arr->getValue(); Impl::vectorConstant( - col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance, + col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needles_arr, vec_res, offsets_res, edit_distance, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); if constexpr (Impl::is_column_array) diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index be51f412f06..04235e0a97a 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -87,12 +87,6 @@ public: if (!col_const_arr) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The array is not const", arguments[1].column->getName()); - Array src_arr = col_const_arr->getValue(); - std::vector refs; - refs.reserve(src_arr.size()); - for (const auto & el : src_arr) - refs.emplace_back(el.get()); - using ResultType = typename Impl::ResultType; auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); @@ -101,8 +95,9 @@ public: auto & offsets_res = col_offsets->getData(); // the implementations are responsible for resizing the output column + Array needles_arr = col_const_arr->getValue(); Impl::vectorConstant( - col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, + col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needles_arr, vec_res, offsets_res, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); if constexpr (Impl::is_column_array) diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index 97149b48df1..9c60dbffe91 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -46,20 +47,20 @@ struct MultiMatchAllIndicesImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const Array & needles_arr, PaddedPODArray & res, PaddedPODArray & offsets, bool allow_hyperscan, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { - vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); } static void vectorConstant( [[maybe_unused]] const ColumnString::Chars & haystack_data, [[maybe_unused]] const ColumnString::Offsets & haystack_offsets, - [[maybe_unused]] const std::vector & needles, + [[maybe_unused]] const Array & needles_arr, [[maybe_unused]] PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] 
std::optional edit_distance, @@ -70,6 +71,11 @@ struct MultiMatchAllIndicesImpl if (!allow_hyperscan) throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); #if USE_VECTORSCAN + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); + checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); offsets.resize(haystack_offsets.size()); diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 956458e2953..0752e87e8af 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -59,20 +60,20 @@ struct MultiMatchAnyImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const Array & needles_arr, PaddedPODArray & res, PaddedPODArray & offsets, bool allow_hyperscan, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) { - vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); } static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const Array & needles_arr, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance, @@ -83,6 +84,11 @@ struct MultiMatchAnyImpl if (!allow_hyperscan) throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); + checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); res.resize(haystack_offsets.size()); diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index 3a853625b37..f69a3edbf8b 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -26,7 +27,7 @@ struct MultiSearchFirstIndexImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const Array & needles_arr, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, bool /*allow_hyperscan*/, @@ -34,10 +35,15 @@ struct MultiSearchFirstIndexImpl size_t /*max_hyperscan_regexp_total_length*/) { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. 
- if (needles.size() > std::numeric_limits::max()) + if (needles_arr.size() > std::numeric_limits::max()) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Number of arguments for function {} doesn't match: passed {}, should be at most {}", - name, std::to_string(needles.size()), std::to_string(std::numeric_limits::max())); + name, std::to_string(needles_arr.size()), std::to_string(std::numeric_limits::max())); + + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 87eb56c3b11..21d558a6d58 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -26,7 +27,7 @@ struct MultiSearchFirstPositionImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const Array & needles_arr, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, bool /*allow_hyperscan*/, @@ -34,10 +35,15 @@ struct MultiSearchFirstPositionImpl size_t /*max_hyperscan_regexp_total_length*/) { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. - if (needles.size() > std::numeric_limits::max()) + if (needles_arr.size() > std::numeric_limits::max()) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Number of arguments for function {} doesn't match: passed {}, should be at most {}", - name, std::to_string(needles.size()), std::to_string(std::numeric_limits::max())); + name, std::to_string(needles_arr.size()), std::to_string(std::numeric_limits::max())); + + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 { diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index 63ad41f1f7b..1124184f58c 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -26,7 +27,7 @@ struct MultiSearchImpl static void vectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, + const Array & needles_arr, PaddedPODArray & res, [[maybe_unused]] PaddedPODArray & offsets, bool /*allow_hyperscan*/, @@ -34,10 +35,15 @@ struct MultiSearchImpl size_t /*max_hyperscan_regexp_total_length*/) { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. 
- if (needles.size() > std::numeric_limits::max()) + if (needles_arr.size() > std::numeric_limits::max()) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Number of arguments for function {} doesn't match: passed {}, should be at most {}", - name, std::to_string(needles.size()), std::to_string(std::numeric_limits::max())); + name, std::to_string(needles_arr.size()), std::to_string(std::numeric_limits::max())); + + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); From e515579cbe070583a8b650f8db272982842ebc71 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 27 Jun 2022 16:26:36 +0200 Subject: [PATCH 49/60] Update IObjectStorage.h --- src/Disks/ObjectStorages/IObjectStorage.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 64022ec046d..ad8d5d2530e 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -25,8 +25,7 @@ class WriteBufferFromFileBase; using ObjectAttributes = std::map; -/// Path to a file and its size. -/// Path can be either relative or absolute - according to the context of use. +/// Path to a file (always absolute) and its size. struct PathWithSize { std::string path; From c9faab49928e24b985d193bcb01f73dff5f0d1fb Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 Jun 2022 16:43:26 +0200 Subject: [PATCH 50/60] Fix --- .../AzureBlobStorage/AzureObjectStorage.cpp | 2 +- .../AzureBlobStorage/AzureObjectStorage.h | 2 +- .../ObjectStorages/DiskObjectStorageMetadata.cpp | 2 +- .../ObjectStorages/DiskObjectStorageMetadata.h | 15 ++------------- ...skObjectStorageRemoteMetadataRestoreHelper.cpp | 5 +++-- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 2 +- src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h | 2 +- src/Disks/ObjectStorages/IObjectStorage.h | 3 ++- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 2 +- 10 files changed, 14 insertions(+), 23 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 32fd285dcdb..c25d1d7470c 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -113,7 +113,7 @@ std::unique_ptr AzureObjectStorage::writeObject( /// NO return std::make_unique(std::move(buffer), std::move(finalize_callback), path); } -void AzureObjectStorage::listPrefix(const std::string & path, PathsWithSize & children) const +void AzureObjectStorage::listPrefix(const std::string & path, RelativePathsWithSize & children) const { auto client_ptr = client.get(); diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 559be0ad257..ab7d2b28508 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -73,7 +73,7 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void listPrefix(const std::string & path, PathsWithSize & children) const override; + void 
listPrefix(const std::string & path, RelativePathsWithSize & children) const override; /// Remove file. Throws exception if file doesn't exists or it's a directory. void removeObject(const std::string & path) override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index 4564e84316d..a2c0c8abd36 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -51,7 +51,7 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) remote_fs_object_path = remote_fs_object_path.substr(remote_fs_root_path.size()); } assertChar('\n', buf); - storage_objects[i].relative_path = remote_fs_object_path; + storage_objects[i].path = remote_fs_object_path; storage_objects[i].bytes_size = remote_fs_object_size; } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h index adebbcb952d..0f2a2a5507d 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h @@ -12,17 +12,6 @@ namespace DB struct DiskObjectStorageMetadata { private: - struct RelativePathWithSize - { - String relative_path; - size_t bytes_size; - - RelativePathWithSize() = default; - - RelativePathWithSize(const String & relative_path_, size_t bytes_size_) - : relative_path(relative_path_), bytes_size(bytes_size_) {} - }; - /// Metadata file version. static constexpr uint32_t VERSION_ABSOLUTE_PATHS = 1; static constexpr uint32_t VERSION_RELATIVE_PATHS = 2; @@ -31,7 +20,7 @@ private: const std::string & common_metadata_path; /// Relative paths of blobs. - std::vector storage_objects; + RelativePathsWithSize storage_objects; /// URI const std::string & remote_fs_root_path; @@ -71,7 +60,7 @@ public: return remote_fs_root_path; } - std::vector getBlobsRelativePaths() const + RelativePathsWithSize getBlobsRelativePaths() const { return storage_objects; } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp index a8140e8954e..c9dbb5de078 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -379,7 +380,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFiles(IObjectStorage * return true; }; - PathsWithSize children; + RelativePathsWithSize children; source_object_storage->listPrefix(restore_information.source_path, children); restore_files(children); @@ -523,7 +524,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject return true; }; - PathsWithSize children; + RelativePathsWithSize children; source_object_storage->listPrefix(restore_information.source_path + "operations/", children); restore_file_operations(children); diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 82c700e1a63..bedd1a83df1 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -81,7 +81,7 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL } -void HDFSObjectStorage::listPrefix(const std::string & path, PathsWithSize & children) const +void HDFSObjectStorage::listPrefix(const std::string & path, RelativePathsWithSize 
& children) const { const size_t begin_of_path = path.find('/', path.find("//") + 2); int32_t num_entries; diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 28f553906ea..69878568548 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -75,7 +75,7 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void listPrefix(const std::string & path, PathsWithSize & children) const override; + void listPrefix(const std::string & path, RelativePathsWithSize & children) const override; /// Remove file. Throws exception if file doesn't exists or it's a directory. void removeObject(const std::string & path) override; diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 64022ec046d..705c2d223b4 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -42,6 +42,7 @@ struct PathWithSize /// List of paths with their sizes using PathsWithSize = std::vector; +using RelativePathsWithSize = PathsWithSize; struct ObjectMetadata { @@ -66,7 +67,7 @@ public: virtual bool exists(const std::string & path) const = 0; /// List on prefix, return children (relative paths) with their sizes. - virtual void listPrefix(const std::string & path, PathsWithSize & children) const = 0; + virtual void listPrefix(const std::string & path, RelativePathsWithSize & children) const = 0; /// Get object metadata if supported. It should be possible to receive /// at least size of object diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 58bd29d2d73..9236cde6e93 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -195,7 +195,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN return std::make_unique(std::move(s3_buffer), std::move(finalize_callback), path); } -void S3ObjectStorage::listPrefix(const std::string & path, PathsWithSize & children) const +void S3ObjectStorage::listPrefix(const std::string & path, RelativePathsWithSize & children) const { auto settings_ptr = s3_settings.get(); auto client_ptr = client.get(); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 5c53ea1f894..5d4300bffd3 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -80,7 +80,7 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void listPrefix(const std::string & path, PathsWithSize & children) const override; + void listPrefix(const std::string & path, RelativePathsWithSize & children) const override; /// Remove file. Throws exception if file doesn't exist or it's a directory. void removeObject(const std::string & path) override; From 83b87938e8b47f162fe02cc7a0fd98c954089639 Mon Sep 17 00:00:00 2001 From: Yuko Takagi <70714860+yukotakagi@users.noreply.github.com> Date: Mon, 27 Jun 2022 13:23:26 -0600 Subject: [PATCH 51/60] Add 22.7 release webinar. 
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index e07a701d7c7..153a0d5ce11 100644
--- a/README.md
+++ b/README.md
@@ -16,3 +16,4 @@ ClickHouse® is an open-source column-oriented database management system that a
 
 ## Upcoming events
 * [Paris Meetup](https://www.meetup.com/clickhouse-france-user-group/events/286304312/) Please join us for an evening of talks (in English), food and discussion. Featuring talks of ClickHouse in production and at least one on the deep internals of ClickHouse itself.
+* [v22.7 Release Webinar](https://clickhouse.com/company/events/v22-7-release-webinar/) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.

From c24c041b1214345ab341726754d0f90bddb1095d Mon Sep 17 00:00:00 2001
From: Dmitry Novik
Date: Mon, 27 Jun 2022 20:54:52 +0000
Subject: [PATCH 52/60] Handle full queue exception in tests

---
 tests/clickhouse-test | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/clickhouse-test b/tests/clickhouse-test
index 75159053f26..22c6816eec2 100755
--- a/tests/clickhouse-test
+++ b/tests/clickhouse-test
@@ -5,6 +5,7 @@
 # pylint: disable=too-many-lines
 
 import enum
+from queue import Full
 import shutil
 import sys
 import os
@@ -1581,15 +1582,18 @@ def do_run_tests(jobs, test_suite: TestSuite, parallel):
         for _ in range(jobs):
             parallel_tests_array.append((None, batch_size, test_suite))
 
-        with closing(multiprocessing.Pool(processes=jobs)) as pool:
-            pool.map_async(run_tests_array, parallel_tests_array)
+        try:
+            with closing(multiprocessing.Pool(processes=jobs)) as pool:
+                pool.map_async(run_tests_array, parallel_tests_array)
 
-            for suit in test_suite.parallel_tests:
-                queue.put(suit, timeout=args.timeout * 1.1)
+                for suit in test_suite.parallel_tests:
+                    queue.put(suit, timeout=args.timeout * 1.1)
 
-            for _ in range(jobs):
-                queue.put(None, timeout=args.timeout * 1.1)
+                for _ in range(jobs):
+                    queue.put(None, timeout=args.timeout * 1.1)
 
+                queue.close()
+    except Full:
         queue.close()
 
     pool.join()

From e41d612b1d1b80f64b4bd945bd7b3829f7ef0b9f Mon Sep 17 00:00:00 2001
From: Igor Nikonov
Date: Mon, 27 Jun 2022 20:57:18 +0000
Subject: [PATCH 53/60] Cleanup: local clang-tidy warnings found during review

---
 src/Core/PostgreSQL/Connection.h         | 2 +-
 src/Core/PostgreSQL/PoolWithFailover.h   | 4 ++--
 src/Dictionaries/RedisDictionarySource.h | 5 -----
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/Core/PostgreSQL/Connection.h b/src/Core/PostgreSQL/Connection.h
index 8c5609dc66b..97ce3c152d5 100644
--- a/src/Core/PostgreSQL/Connection.h
+++ b/src/Core/PostgreSQL/Connection.h
@@ -32,7 +32,7 @@ struct ConnectionInfo
 class Connection : private boost::noncopyable
 {
 public:
-    Connection(const ConnectionInfo & connection_info_, bool replication_ = false, size_t num_tries = 3);
+    explicit Connection(const ConnectionInfo & connection_info_, bool replication_ = false, size_t num_tries = 3);
 
     void execWithRetry(const std::function & exec);
 
diff --git a/src/Core/PostgreSQL/PoolWithFailover.h b/src/Core/PostgreSQL/PoolWithFailover.h
index 600e12fb53a..4e3a17b5e9c 100644
--- a/src/Core/PostgreSQL/PoolWithFailover.h
+++ b/src/Core/PostgreSQL/PoolWithFailover.h
@@ -25,13 +25,13 @@ public:
     static constexpr inline auto POSTGRESQL_POOL_WAIT_TIMEOUT = 5000;
     static constexpr inline auto POSTGRESQL_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES = 5;
 
-    PoolWithFailover(
+    explicit PoolWithFailover(
         const DB::ExternalDataSourcesConfigurationByPriority & configurations_by_priority,
         size_t pool_size = POSTGRESQL_POOL_DEFAULT_SIZE,
         size_t pool_wait_timeout = POSTGRESQL_POOL_WAIT_TIMEOUT,
         size_t max_tries_ = POSTGRESQL_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES);
 
-    PoolWithFailover(
+    explicit PoolWithFailover(
         const DB::StoragePostgreSQLConfiguration & configuration,
         size_t pool_size = POSTGRESQL_POOL_DEFAULT_SIZE,
         size_t pool_wait_timeout = POSTGRESQL_POOL_WAIT_TIMEOUT,
diff --git a/src/Dictionaries/RedisDictionarySource.h b/src/Dictionaries/RedisDictionarySource.h
index bf745a7bb41..26f5ab2a613 100644
--- a/src/Dictionaries/RedisDictionarySource.h
+++ b/src/Dictionaries/RedisDictionarySource.h
@@ -8,11 +8,6 @@
 
 namespace Poco
 {
-    namespace Util
-    {
-        class AbstractConfiguration;
-    }
-
     namespace Redis
     {
         class Client;

From 94f932add10f8028bced53cf8b4bed9dffc70b9a Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev"
Date: Tue, 28 Jun 2022 10:27:25 +0200
Subject: [PATCH 54/60] A tiny improvement in logging

---
 tests/ci/build_report_check.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py
index 1ba91a38a60..dbf5adfe174 100644
--- a/tests/ci/build_report_check.py
+++ b/tests/ci/build_report_check.py
@@ -151,6 +151,8 @@ def main():
         needs_data = json.load(file_handler)
         required_builds = len(needs_data)
 
+    logging.info("The next builds are required: %s", ", ".join(needs_data))
+
     gh = Github(get_best_robot_token())
     pr_info = PRInfo()
     rerun_helper = RerunHelper(gh, pr_info, build_check_name)
@@ -159,7 +161,6 @@ def main():
         sys.exit(0)
 
     builds_for_check = CI_CONFIG["builds_report_config"][build_check_name]
-    logging.info("My reports list %s", builds_for_check)
     required_builds = required_builds or len(builds_for_check)
 
     # Collect reports from json artifacts

From 0c51bd9c23906865b4e67a27a097767c9d60ad69 Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov
Date: Tue, 28 Jun 2022 12:36:13 +0200
Subject: [PATCH 55/60] fix flaky test

---
 .../configs/store_cleanup.xml                  |  2 +-
 .../test_broken_detached_part_clean_up/test.py | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/integration/test_broken_detached_part_clean_up/configs/store_cleanup.xml b/tests/integration/test_broken_detached_part_clean_up/configs/store_cleanup.xml
index 3b0260dd07a..5fbe87cce00 100644
--- a/tests/integration/test_broken_detached_part_clean_up/configs/store_cleanup.xml
+++ b/tests/integration/test_broken_detached_part_clean_up/configs/store_cleanup.xml
@@ -1,6 +1,6 @@
 
     0
-    15
+    60
     1
 
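
The clang-tidy cleanup in PATCH 53 marks constructors whose trailing parameters are all defaulted as explicit: such constructors are still callable with a single argument and therefore double as implicit conversions. The sketch below illustrates the effect with simplified stand-in types; the ConnectionInfo/Connection pair and the connect() consumer here are illustrative only, not the real ClickHouse classes.

#include <cstddef>
#include <string>

// Simplified stand-in for postgres::ConnectionInfo.
struct ConnectionInfo
{
    std::string connection_string;
};

class Connection
{
public:
    // All parameters after the first are defaulted, so without `explicit` this
    // constructor would also serve as an implicit ConnectionInfo -> Connection
    // conversion -- exactly the pattern the clang-tidy warning points at.
    explicit Connection(const ConnectionInfo & info_, bool replication_ = false, size_t num_tries_ = 3)
        : info(info_), replication(replication_), num_tries(num_tries_)
    {
    }

private:
    ConnectionInfo info;
    bool replication;
    size_t num_tries;
};

// Hypothetical consumer, used only to demonstrate the call sites.
void connect(const Connection &) {}

int main()
{
    ConnectionInfo info{"host=localhost port=5432"};

    // connect(info);          // compiles only if the constructor is NOT explicit:
    //                         // a temporary Connection would be created silently
    connect(Connection{info}); // with `explicit`, the conversion must be spelled out
    return 0;
}

Making single-argument-capable constructors explicit catches accidental conversions at compile time, which is why clang-tidy flags them during review.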