diff --git a/dbms/cmake/version.cmake b/dbms/cmake/version.cmake
index 332fa51ab26..a2decdff605 100644
--- a/dbms/cmake/version.cmake
+++ b/dbms/cmake/version.cmake
@@ -1,11 +1,11 @@
 # This strings autochanged from release_lib.sh:
-set(VERSION_REVISION 54425)
+set(VERSION_REVISION 54426)
 set(VERSION_MAJOR 19)
-set(VERSION_MINOR 14)
+set(VERSION_MINOR 15)
 set(VERSION_PATCH 1)
-set(VERSION_GITHASH adfc36917222bdb03eba069f0cad0f4f5b8f1c94)
-set(VERSION_DESCRIBE v19.14.1.1-prestable)
-set(VERSION_STRING 19.14.1.1)
+set(VERSION_GITHASH 6f1a8c37abe6ee4e7ee74c0b5cb9c05a87417b61)
+set(VERSION_DESCRIBE v19.15.1.1-prestable)
+set(VERSION_STRING 19.15.1.1)
 # end of autochange

 set(VERSION_EXTRA "" CACHE STRING "")
diff --git a/dbms/programs/client/Client.cpp b/dbms/programs/client/Client.cpp
index df5e8568d21..854e226cb94 100644
--- a/dbms/programs/client/Client.cpp
+++ b/dbms/programs/client/Client.cpp
@@ -431,8 +431,14 @@ private:
         /// Load command history if present.
         if (config().has("history_file"))
             history_file = config().getString("history_file");
-        else if (!home_path.empty())
-            history_file = home_path + "/.clickhouse-client-history";
+        else
+        {
+            auto history_file_from_env = getenv("CLICKHOUSE_HISTORY_FILE");
+            if (history_file_from_env)
+                history_file = history_file_from_env;
+            else if (!home_path.empty())
+                history_file = home_path + "/.clickhouse-client-history";
+        }

         if (!history_file.empty())
         {
diff --git a/dbms/programs/odbc-bridge/ODBCBridge.cpp b/dbms/programs/odbc-bridge/ODBCBridge.cpp
index 214d9f75328..c84452b691e 100644
--- a/dbms/programs/odbc-bridge/ODBCBridge.cpp
+++ b/dbms/programs/odbc-bridge/ODBCBridge.cpp
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include <Common/SensitiveDataMasker.h>

 namespace DB
 {
@@ -165,8 +166,7 @@ int ODBCBridge::main(const std::vector<std::string> & /*args*/)

     if (config().has("query_masking_rules"))
     {
-        context->setSensitiveDataMasker(std::make_unique<SensitiveDataMasker>(config(), "query_masking_rules"));
-        setLoggerSensitiveDataMasker(logger(), context->getSensitiveDataMasker());
+        SensitiveDataMasker::setInstance(std::make_unique<SensitiveDataMasker>(config(), "query_masking_rules"));
     }

     auto server = Poco::Net::HTTPServer(
diff --git a/dbms/programs/server/Server.cpp b/dbms/programs/server/Server.cpp
index 82f50f26569..bba96f18c35 100644
--- a/dbms/programs/server/Server.cpp
+++ b/dbms/programs/server/Server.cpp
@@ -55,6 +55,7 @@
 #include "TCPHandlerFactory.h"
 #include "Common/config_version.h"
 #include "MySQLHandlerFactory.h"
+#include <Common/SensitiveDataMasker.h>

 #if defined(__linux__)
@@ -279,8 +280,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
           */
         LOG_INFO(log, "Shutting down storages.");

-        // global_context is the owner of sensitive_data_masker, which will be destoyed after global_context->shutdown() call
-        setLoggerSensitiveDataMasker(logger(), nullptr);
         global_context->shutdown();
         LOG_DEBUG(log, "Shutted down storages.");

@@ -414,7 +413,7 @@ int Server::main(const std::vector<std::string> & /*args*/)

     if (config().has("query_masking_rules"))
     {
-        global_context->setSensitiveDataMasker(std::make_unique<SensitiveDataMasker>(config(), "query_masking_rules"));
+        SensitiveDataMasker::setInstance(std::make_unique<SensitiveDataMasker>(config(), "query_masking_rules"));
     }

     auto main_config_reloader = std::make_unique<ConfigReloader>(config_path,
@@ -426,10 +425,6 @@
         {
             setTextLog(global_context->getTextLog());
             buildLoggers(*config, logger());
-            if (auto masker = global_context->getSensitiveDataMasker())
-            {
-                setLoggerSensitiveDataMasker(logger(), masker);
-            }
             global_context->setClustersConfig(config);
             global_context->setMacros(std::make_unique<Macros>(*config, "macros"));
         },
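Editor's note on the pattern above: the server and the ODBC bridge now install the masker process-wide at startup instead of hanging it off the Context. Below is a minimal, self-contained sketch of that setInstance/getInstance flow. The `Masker` class is a hypothetical simplification (the real one is `SensitiveDataMasker`, whose declaration appears later in this diff); RE2 is used just as the real implementation does.

```cpp
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <iostream>
#include <re2/re2.h>

// Hypothetical, simplified stand-in for SensitiveDataMasker.
class Masker
{
public:
    void addRule(const std::string & regexp, const std::string & replacement)
    {
        rules.emplace_back(std::make_unique<RE2>(regexp), replacement);
    }

    size_t wipe(std::string & data) const
    {
        size_t matches = 0;
        for (const auto & [re, repl] : rules)
            matches += RE2::GlobalReplace(&data, *re, repl); // apply every rule in order
        return matches;
    }

    /// Called once at startup, before any other thread calls getInstance().
    static void setInstance(std::unique_ptr<Masker> masker_) { instance = std::move(masker_); }
    static Masker * getInstance() { return instance.get(); }

private:
    std::vector<std::pair<std::unique_ptr<RE2>, std::string>> rules;
    static std::unique_ptr<Masker> instance;
};

std::unique_ptr<Masker> Masker::instance;

int main()
{
    auto masker = std::make_unique<Masker>();
    masker->addRule("\\b\\d{3}-\\d{2}-\\d{4}\\b", "000-00-0000"); // hide SSN, as in config.xml below
    Masker::setInstance(std::move(masker));

    std::string query = "INSERT INTO t VALUES ('123-45-6789')";
    if (auto * m = Masker::getInstance()) // nullptr when no rules are configured
        m->wipe(query);
    std::cout << query << "\n"; // INSERT INTO t VALUES ('000-00-0000')
}
```

The point of the design: after single-threaded initialization, every reader does only a plain pointer read, with no locking on the logging path.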
diff --git a/dbms/programs/server/config.xml b/dbms/programs/server/config.xml
index d8fcd9b0c9e..34fe98b0e31 100644
--- a/dbms/programs/server/config.xml
+++ b/dbms/programs/server/config.xml
@@ -447,7 +447,7 @@
             <name>hide SSN</name>
-            <regexp>(^|\D)\d{3}-\d{2}-\d{4}($|\D)</regexp>
+            <regexp>\b\d{3}-\d{2}-\d{4}\b</regexp>
             <replace>000-00-0000</replace>
diff --git a/dbms/src/Common/HashTable/FixedHashTable.h b/dbms/src/Common/HashTable/FixedHashTable.h
index 0c3a2c1237a..0ad56f354b9 100644
--- a/dbms/src/Common/HashTable/FixedHashTable.h
+++ b/dbms/src/Common/HashTable/FixedHashTable.h
@@ -262,8 +262,9 @@ public:

     iterator end() { return iterator(this, buf + BUFFER_SIZE); }

-protected:
-    void ALWAYS_INLINE emplaceImpl(Key x, iterator & it, bool & inserted)
+public:
+    /// The last parameter is unused but exists for compatibility with HashTable interface.
+    void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t /* hash */ = 0)
     {
         it = iterator(this, &buf[x]);

@@ -278,22 +279,16 @@
         ++m_size;
     }

-
-public:
     std::pair<iterator, bool> ALWAYS_INLINE insert(const value_type & x)
     {
         std::pair<iterator, bool> res;
-        emplaceImpl(Cell::getKey(x), res.first, res.second);
+        emplace(Cell::getKey(x), res.first, res.second);
         if (res.second)
             res.first.ptr->setMapped(x);

         return res;
     }
-
-    void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted) { emplaceImpl(x, it, inserted); }
-    void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t) { emplaceImpl(x, it, inserted); }

     iterator ALWAYS_INLINE find(Key x)
     {
         return !buf[x].isZero(*this) ? iterator(this, &buf[x]) : end();
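The FixedHashTable change above gives the fixed table the same four-argument `emplace(key, it, inserted, hash)` shape as the generic HashTable, so callers can be written once against both. A toy sketch of the call shape this enables; `FixedByteTable` is a contrived stand-in, not the real ClickHouse container:

```cpp
#include <cstddef>
#include <cstdint>
#include <utility>
#include <iostream>

// Generic code can pass a precomputed hash to any table type;
// a fixed-size table simply ignores the trailing argument.
template <typename Table>
void countBytes(Table & counts, const uint8_t * data, size_t size)
{
    for (size_t i = 0; i < size; ++i)
    {
        typename Table::iterator it;
        bool inserted;
        counts.emplace(data[i], it, inserted, /* hash = */ data[i]);
        if (inserted)
            it->second = 0; // initialize the mapped value on first insertion
        ++it->second;
    }
}

// Toy "fixed" table over all 256 byte values, mimicking the unified interface.
struct FixedByteTable
{
    using iterator = std::pair<uint8_t, uint64_t> *;
    std::pair<uint8_t, uint64_t> buf[256] {};

    void emplace(uint8_t key, iterator & it, bool & inserted, size_t /* hash */ = 0)
    {
        it = &buf[key];
        inserted = (it->second == 0); // toy occupancy check, good enough for counting
        it->first = key;
    }
};

int main()
{
    FixedByteTable table;
    const uint8_t data[] = {1, 2, 2, 3};
    countBytes(table, data, sizeof(data));
    std::cout << table.buf[2].second << "\n"; // 2
}
```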
diff --git a/dbms/src/Common/SensitiveDataMasker.cpp b/dbms/src/Common/SensitiveDataMasker.cpp
index 41e14aabb3c..23014e8956e 100644
--- a/dbms/src/Common/SensitiveDataMasker.cpp
+++ b/dbms/src/Common/SensitiveDataMasker.cpp
@@ -24,6 +24,7 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int CANNOT_COMPILE_REGEXP;
+    extern const int LOGICAL_ERROR;
     extern const int NO_ELEMENTS_IN_CONFIG;
     extern const int INVALID_CONFIG_PARAMETER;
 }
@@ -38,7 +39,9 @@ private:
     const RE2 regexp;
     const re2::StringPiece replacement;

+#ifndef NDEBUG
     mutable std::atomic<std::uint64_t> matches_count = 0;
+#endif

 public:
     //* TODO: option with hyperscan? https://software.intel.com/en-us/articles/why-and-how-to-replace-pcre-with-hyperscan
@@ -61,15 +64,37 @@ public:
     uint64_t apply(std::string & data) const
     {
         auto m = RE2::GlobalReplace(&data, regexp, replacement);
+#ifndef NDEBUG
         matches_count += m;
+#endif
         return m;
     }

     const std::string & getName() const { return name; }
     const std::string & getReplacementString() const { return replacement_string; }
+#ifndef NDEBUG
     uint64_t getMatchesCount() const { return matches_count; }
+#endif
 };

+std::unique_ptr<SensitiveDataMasker> SensitiveDataMasker::sensitive_data_masker = nullptr;
+
+void SensitiveDataMasker::setInstance(std::unique_ptr<SensitiveDataMasker> sensitive_data_masker_)
+{
+    if (!sensitive_data_masker_)
+        throw Exception("Logical error: the 'sensitive_data_masker' is not set", ErrorCodes::LOGICAL_ERROR);
+
+    if (sensitive_data_masker_->rulesCount() > 0)
+    {
+        sensitive_data_masker = std::move(sensitive_data_masker_);
+    }
+}
+
+SensitiveDataMasker * SensitiveDataMasker::getInstance()
+{
+    return sensitive_data_masker.get();
+}

 SensitiveDataMasker::SensitiveDataMasker(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
 {
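The `#ifndef NDEBUG` guards above turn the per-rule match counter into a debug-only facility, removing an atomic increment from every masked line in release builds. A hedged sketch of the pattern in isolation:

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>

// Debug-only instrumentation: the atomic bookkeeping exists (and costs an
// atomic add per call) only when NDEBUG is not defined.
class Rule
{
#ifndef NDEBUG
    mutable std::atomic<std::uint64_t> matches_count{0};
#endif
public:
    std::uint64_t apply() const
    {
        std::uint64_t matches = 1; // pretend the regexp matched once
#ifndef NDEBUG
        matches_count += matches;
#endif
        return matches;
    }
#ifndef NDEBUG
    std::uint64_t getMatchesCount() const { return matches_count; }
#endif
};

int main()
{
    Rule r;
    r.apply();
#ifndef NDEBUG
    std::cout << r.getMatchesCount() << "\n"; // 1, in debug builds only
#endif
}
```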
diff --git a/dbms/src/Common/SensitiveDataMasker.h b/dbms/src/Common/SensitiveDataMasker.h
index b7a7b12ee93..942b97181c5 100644
--- a/dbms/src/Common/SensitiveDataMasker.h
+++ b/dbms/src/Common/SensitiveDataMasker.h
@@ -4,7 +4,6 @@
 #include
 #include

-
 namespace Poco
 {
     namespace Util
@@ -13,6 +12,32 @@ namespace Util
     }
 }

+/// SensitiveDataMasker allows removing sensitive data from queries using a set of regexp-based rules.
+
+/// It's used as a singleton via the getInstance method.
+
+/// Initially it's empty (nullptr) and after manual initialization
+/// (one-time, done by the setInstance call) it takes the proper value, which
+/// is stored in a unique_ptr.
+
+/// It looks like the singleton is the best option here, as
+/// the two users of that object (OwnSplitChannel & Interpreters/executeQuery)
+/// can't own/share that masker properly without synchronization & locks,
+/// and we can't afford setting global locks for each logged line.
+
+/// I've considered singleton alternatives, but it's unclear who should own the object,
+/// and they introduce unnecessary complexity in the implementation (passing references back and forth):
+///
+/// the context can't own it, as Context is destroyed before the logger,
+/// and the logger lives longer and logging can still happen after Context destruction.
+/// Resetting the masker in the logger at the moment of
+/// context destruction can't be done w/o synchronization / locks in a safe manner.
+///
+/// The logger is Poco-derived and I didn't want to break its interface;
+/// also, the logger can be dynamically reconfigured without a server restart,
+/// and it actually recreates OwnSplitChannel when reconfiguration happens,
+/// which makes it quite tricky. So it's a bad candidate for owning the masker too.
+
 namespace DB
 {
 class SensitiveDataMasker
@@ -20,6 +45,7 @@ class SensitiveDataMasker
 private:
     class MaskingRule;
     std::vector<std::unique_ptr<MaskingRule>> all_masking_rules;
+    static std::unique_ptr<SensitiveDataMasker> sensitive_data_masker;

 public:
     SensitiveDataMasker(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
@@ -28,6 +54,11 @@ public:
     /// Returns the number of matched rules.
     size_t wipeSensitiveData(std::string & data) const;

+    /// setInstance is not thread-safe and should be called once in single-thread mode.
+    /// https://github.com/yandex/ClickHouse/pull/6810#discussion_r321183367
+    static void setInstance(std::unique_ptr<SensitiveDataMasker> sensitive_data_masker_);
+    static SensitiveDataMasker * getInstance();
+
     /// Used in tests.
     void addMaskingRule(const std::string & name, const std::string & regexp_string, const std::string & replacement_string);
diff --git a/dbms/src/Core/Names.h b/dbms/src/Core/Names.h
index 61220779f7b..5489a233b6e 100644
--- a/dbms/src/Core/Names.h
+++ b/dbms/src/Core/Names.h
@@ -16,4 +16,7 @@ using NameOrderedSet = std::set<std::string>;
 using NameToNameMap = std::unordered_map<std::string, std::string>;
 using NameToNameSetMap = std::unordered_map<std::string, NameSet>;

+using NameWithAlias = std::pair<std::string, std::string>;
+using NamesWithAliases = std::vector<NameWithAlias>;
+
 }
diff --git a/dbms/src/Functions/array/arrayElement.cpp b/dbms/src/Functions/array/arrayElement.cpp
index 59f275692aa..d7132d0fe23 100644
--- a/dbms/src/Functions/array/arrayElement.cpp
+++ b/dbms/src/Functions/array/arrayElement.cpp
@@ -858,7 +858,7 @@ void FunctionArrayElement::perform(Block & block, const ColumnNumbers & arguments,
     builder.initSink(input_rows_count);

     if (index == 0u)
-        throw Exception("Array indices is 1-based", ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX);
+        throw Exception("Array indices are 1-based", ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX);

     if (!(executeNumberConst<UInt8>(block, arguments, result, index, builder)
         || executeNumberConst<UInt16>(block, arguments, result, index, builder)
diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp
index f60afe81276..7deb21d0dcc 100644
--- a/dbms/src/Interpreters/AnalyzedJoin.cpp
+++ b/dbms/src/Interpreters/AnalyzedJoin.cpp
@@ -93,14 +93,14 @@ NameSet AnalyzedJoin::getOriginalColumnsSet() const
     return out;
 }

-std::unordered_map<String, String> AnalyzedJoin::getOriginalColumnsMap(const NameSet & required_columns) const
+NamesWithAliases AnalyzedJoin::getNamesWithAliases(const NameSet & required_columns) const
 {
-    std::unordered_map<String, String> out;
+    NamesWithAliases out;
     for (const auto & column : required_columns)
     {
         auto it = original_names.find(column);
         if (it != original_names.end())
-            out.insert(*it);
+            out.emplace_back(it->second, it->first); /// {original_name, name}
     }
     return out;
 }
@@ -129,15 +129,15 @@ Names AnalyzedJoin::requiredJoinedNames() const
     return Names(required_columns_set.begin(), required_columns_set.end());
 }

-void AnalyzedJoin::appendRequiredColumns(const Block & sample, NameSet & required_columns) const
+NamesWithAliases AnalyzedJoin::getRequiredColumns(const Block & sample, const Names & action_required_columns) const
 {
-    for (auto & column : key_names_right)
+    NameSet required_columns(action_required_columns.begin(), action_required_columns.end());
+
+    for (auto & column : requiredJoinedNames())
         if (!sample.has(column))
             required_columns.insert(column);

-    for (auto & column : columns_added_by_join)
-        if (!sample.has(column.name))
-            required_columns.insert(column.name);
+    return getNamesWithAliases(required_columns);
 }

 void AnalyzedJoin::addJoinedColumn(const NameAndTypePair & joined_column)
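For orientation, `getNamesWithAliases()` above turns the analyzer's qualified-name to original-name mapping into `{original_name, alias}` pairs, so the subquery can select original names and the join can rename them back. A hedged sketch, with plain standard containers standing in for the real `NameSet`/`NamesWithAliases` types:

```cpp
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include <iostream>

using NamesWithAliases = std::vector<std::pair<std::string, std::string>>;

// original_names maps a qualified column name to the name it had in the
// joined table; required_columns is the subset the join actually needs.
NamesWithAliases getNamesWithAliases(const std::map<std::string, std::string> & original_names,
                                     const std::set<std::string> & required_columns)
{
    NamesWithAliases out;
    for (const auto & column : required_columns)
    {
        auto it = original_names.find(column);
        if (it != original_names.end())
            out.emplace_back(it->second, it->first); // {original_name, name}
    }
    return out;
}

int main()
{
    std::map<std::string, std::string> original_names {{"t2.id", "id"}, {"t2.v", "v"}};
    for (const auto & [original, alias] : getNamesWithAliases(original_names, {"t2.id", "t2.v"}))
        std::cout << original << " AS " << alias << "\n"; // id AS t2.id, then v AS t2.v
}
```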
diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h
index 2622f35a941..bea430de479 100644
--- a/dbms/src/Interpreters/AnalyzedJoin.h
+++ b/dbms/src/Interpreters/AnalyzedJoin.h
@@ -64,12 +64,12 @@ public:
     NameSet getQualifiedColumnsSet() const;
     NameSet getOriginalColumnsSet() const;
-    std::unordered_map<String, String> getOriginalColumnsMap(const NameSet & required_columns) const;
+    NamesWithAliases getNamesWithAliases(const NameSet & required_columns) const;
+    NamesWithAliases getRequiredColumns(const Block & sample, const Names & action_columns) const;

     void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix);
     size_t rightKeyInclusion(const String & name) const;

-    void appendRequiredColumns(const Block & sample, NameSet & required_columns) const;
     void addJoinedColumn(const NameAndTypePair & joined_column);
     void addJoinedColumnsAndCorrectNullability(Block & sample_block) const;
diff --git a/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp b/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp
index dc0d3ef27b1..989595b3647 100644
--- a/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp
+++ b/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp
@@ -30,7 +30,7 @@ Context removeUserRestrictionsFromSettings(const Context & context, const Settings & settings)
     /// Set as unchanged to avoid sending to remote server.
     new_settings.max_concurrent_queries_for_user.changed = false;
     new_settings.max_memory_usage_for_user.changed = false;
-    new_settings.max_memory_usage_for_all_queries = false;
+    new_settings.max_memory_usage_for_all_queries.changed = false;

     Context new_context(context);
     new_context.setSettings(new_settings);
diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp
index 53b652681e2..6b70d6c2d8b 100644
--- a/dbms/src/Interpreters/Context.cpp
+++ b/dbms/src/Interpreters/Context.cpp
@@ -143,8 +143,6 @@ struct ContextShared
     std::unique_ptr<DDLWorker> ddl_worker;    /// Process ddl commands from zk.
     /// Rules for selecting the compression settings, depending on the size of the part.
     mutable std::unique_ptr<CompressionCodecSelector> compression_codec_selector;
-    /// Allows to remove sensitive data from queries using set of regexp-based rules
-    std::unique_ptr<SensitiveDataMasker> sensitive_data_masker;
     std::optional<MergeTreeSettings> merge_tree_settings;    /// Settings of MergeTree* engines.
     size_t max_table_size_to_drop = 50000000000lu;           /// Protects MergeTree tables from accidental DROP (50GB by default)
     size_t max_partition_size_to_drop = 50000000000lu;       /// Protects MergeTree partitions from accidental DROP (50GB by default)
@@ -287,8 +285,6 @@ struct ContextShared

         /// Stop trace collector if any
         trace_collector.reset();
-
-        sensitive_data_masker.reset();
     }

     bool hasTraceCollector()
@@ -538,23 +534,6 @@ String Context::getUserFilesPath() const
     return shared->user_files_path;
 }

-void Context::setSensitiveDataMasker(std::unique_ptr<SensitiveDataMasker> sensitive_data_masker)
-{
-    if (!sensitive_data_masker)
-        throw Exception("Logical error: the 'sensitive_data_masker' is not set", ErrorCodes::LOGICAL_ERROR);
-
-    if (sensitive_data_masker->rulesCount() > 0)
-    {
-        auto lock = getLock();
-        shared->sensitive_data_masker = std::move(sensitive_data_masker);
-    }
-}
-
-SensitiveDataMasker * Context::getSensitiveDataMasker() const
-{
-    return shared->sensitive_data_masker.get();
-}
-
 void Context::setPath(const String & path)
 {
     auto lock = getLock();
@@ -1222,8 +1201,8 @@ void Context::setCurrentQueryId(const String & query_id)
         } words;
     } random;

-    random.words.a = thread_local_rng();
-    random.words.b = thread_local_rng();
+    random.words.a = thread_local_rng(); //-V656
+    random.words.b = thread_local_rng(); //-V656

     /// Use protected constructor.
struct qUUID : Poco::UUID diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index d8e4748ddad..f7ba0a7dbaa 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -12,7 +12,6 @@ #include #include #include "config_core.h" -#include #include #include #include @@ -178,9 +177,6 @@ public: String getFlagsPath() const; String getUserFilesPath() const; - void setSensitiveDataMasker(std::unique_ptr sensitive_data_masker); - SensitiveDataMasker * getSensitiveDataMasker() const; - void setPath(const String & path); void setTemporaryPath(const String & path); void setFlagsPath(const String & path); diff --git a/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp b/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp index 4b0a76cfb37..ec29fbf40c1 100644 --- a/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp +++ b/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp @@ -1,6 +1,5 @@ #include #include -#include /// for getNamesAndTypeListFromTableExpression #include #include @@ -15,6 +14,8 @@ namespace DB NameSet removeDuplicateColumns(NamesAndTypesList & columns); +struct ASTTableExpression; +NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context); DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTIdentifier & identifier, const String & current_database) { diff --git a/dbms/src/Interpreters/ExpressionActions.h b/dbms/src/Interpreters/ExpressionActions.h index 6997c3ef759..20acd1a95c8 100644 --- a/dbms/src/Interpreters/ExpressionActions.h +++ b/dbms/src/Interpreters/ExpressionActions.h @@ -20,9 +20,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -using NameWithAlias = std::pair; -using NamesWithAliases = std::vector; - class AnalyzedJoin; class IPreparedFunction; diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index d82169cf8e4..9777e3d508d 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -417,8 +418,7 @@ bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, b if (!ast_join) return false; - SubqueryForSet & subquery_for_set = getSubqueryForJoin(*ast_join); - syntax->analyzed_join->setHashJoin(subquery_for_set.join); + makeTableJoin(*ast_join); initChain(chain, sourceColumns()); ExpressionActionsChain::Step & step = chain.steps.back(); @@ -456,7 +456,15 @@ static JoinPtr tryGetStorageJoin(const ASTTablesInSelectQueryElement & join_elem return {}; } -SubqueryForSet & SelectQueryExpressionAnalyzer::getSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element) +static ExpressionActionsPtr createJoinedBlockActions(const Context & context, const AnalyzedJoin & analyzed_join) +{ + ASTPtr expression_list = analyzed_join.rightKeysList(); + auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, + analyzed_join.columnsFromJoinedTable(), analyzed_join.requiredJoinedNames()); + return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false); +} + +void SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element) { /// Two JOINs are not supported with the same subquery, but different USINGs. 
auto join_hash = join_element.getTreeHash(); @@ -469,64 +477,45 @@ SubqueryForSet & SelectQueryExpressionAnalyzer::getSubqueryForJoin(const ASTTabl subquery_for_set.join = tryGetStorageJoin(join_element, context); if (!subquery_for_set.join) - makeHashJoin(join_element, subquery_for_set); + { + /// Actions which need to be calculated on joined block. + ExpressionActionsPtr joined_block_actions = createJoinedBlockActions(context, analyzedJoin()); - return subquery_for_set; + if (!subquery_for_set.source) + makeSubqueryForJoin(join_element, joined_block_actions, subquery_for_set); + + /// Test actions on sample block (early error detection) + Block sample_block = subquery_for_set.renamedSampleBlock(); + joined_block_actions->execute(sample_block); + + /// TODO You do not need to set this up when JOIN is only needed on remote servers. + subquery_for_set.join = analyzedJoin().makeHashJoin(sample_block, settings.size_limits_for_join); + subquery_for_set.joined_block_actions = joined_block_actions; + } + + syntax->analyzed_join->setHashJoin(subquery_for_set.join); } -void SelectQueryExpressionAnalyzer::makeHashJoin(const ASTTablesInSelectQueryElement & join_element, - SubqueryForSet & subquery_for_set) const +void SelectQueryExpressionAnalyzer::makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, + const ExpressionActionsPtr & joined_block_actions, + SubqueryForSet & subquery_for_set) const { - /// Actions which need to be calculated on joined block. - ExpressionActionsPtr joined_block_actions = createJoinedBlockActions(); - /** For GLOBAL JOINs (in the case, for example, of the push method for executing GLOBAL subqueries), the following occurs * - in the addExternalStorage function, the JOIN (SELECT ...) subquery is replaced with JOIN _data1, * in the subquery_for_set object this subquery is exposed as source and the temporary table _data1 as the `table`. * - this function shows the expression JOIN _data1. */ - if (!subquery_for_set.source) - { - ASTPtr table; - auto & table_to_join = join_element.table_expression->as(); - if (table_to_join.subquery) - table = table_to_join.subquery; - else if (table_to_join.table_function) - table = table_to_join.table_function; - else if (table_to_join.database_and_table_name) - table = table_to_join.database_and_table_name; + NamesWithAliases required_columns_with_aliases = + analyzedJoin().getRequiredColumns(joined_block_actions->getSampleBlock(), joined_block_actions->getRequiredColumns()); - Names action_columns = joined_block_actions->getRequiredColumns(); - NameSet required_columns(action_columns.begin(), action_columns.end()); + Names original_columns; + for (auto & pr : required_columns_with_aliases) + original_columns.push_back(pr.first); - analyzedJoin().appendRequiredColumns(joined_block_actions->getSampleBlock(), required_columns); + auto interpreter = interpretSubquery(join_element.table_expression, context, subquery_depth, original_columns); - auto original_map = analyzedJoin().getOriginalColumnsMap(required_columns); - Names original_columns; - for (auto & pr : original_map) - original_columns.push_back(pr.second); - - auto interpreter = interpretSubquery(table, context, subquery_depth, original_columns); - - subquery_for_set.makeSource(interpreter, original_map); - } - - Block sample_block = subquery_for_set.renamedSampleBlock(); - joined_block_actions->execute(sample_block); - - /// TODO You do not need to set this up when JOIN is only needed on remote servers. 
- subquery_for_set.join = analyzedJoin().makeHashJoin(sample_block, settings.size_limits_for_join); - subquery_for_set.joined_block_actions = joined_block_actions; -} - -ExpressionActionsPtr SelectQueryExpressionAnalyzer::createJoinedBlockActions() const -{ - ASTPtr expression_list = analyzedJoin().rightKeysList(); - Names required_columns = analyzedJoin().requiredJoinedNames(); - - auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, analyzedJoin().columnsFromJoinedTable(), required_columns); - return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false); + subquery_for_set.makeSource(interpreter, std::move(required_columns_with_aliases)); } bool SelectQueryExpressionAnalyzer::appendPrewhere( diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.h b/dbms/src/Interpreters/ExpressionAnalyzer.h index aebbaf038cc..9356046aee3 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.h +++ b/dbms/src/Interpreters/ExpressionAnalyzer.h @@ -26,9 +26,6 @@ class ASTExpressionList; class ASTSelectQuery; struct ASTTablesInSelectQueryElement; -struct SyntaxAnalyzerResult; -using SyntaxAnalyzerResultPtr = std::shared_ptr; - /// ExpressionAnalyzer sources, intermediates and results. It splits data and logic, allows to test them separately. struct ExpressionAnalyzerData { @@ -222,9 +219,9 @@ private: */ void tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_or_table_name); - SubqueryForSet & getSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element); - ExpressionActionsPtr createJoinedBlockActions() const; - void makeHashJoin(const ASTTablesInSelectQueryElement & join_element, SubqueryForSet & subquery_for_set) const; + void makeTableJoin(const ASTTablesInSelectQueryElement & join_element); + void makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, const ExpressionActionsPtr & joined_block_actions, + SubqueryForSet & subquery_for_set) const; const ASTSelectQuery * getAggregatingQuery() const; }; diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index f9de263d7c2..a795fac596d 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -417,8 +418,6 @@ QueryPipeline InterpreterSelectQuery::executeWithProcessors() Block InterpreterSelectQuery::getSampleBlockImpl() { - FilterInfoPtr filter_info; - /// Need to create sets before analyzeExpressions(). Otherwise some sets for index won't be created. 
 query_analyzer->makeSetsForIndex(getSelectQuery().where());
 query_analyzer->makeSetsForIndex(getSelectQuery().prewhere());
@@ -430,8 +429,9 @@ Block InterpreterSelectQuery::getSampleBlockImpl()
         options.to_stage,
         context,
         storage,
-        true,
-        filter_info);
+        true,  // only_types
+        {}     // filter_info
+    );

     if (options.to_stage == QueryProcessingStage::Enum::FetchColumns)
     {
@@ -990,6 +990,21 @@ void InterpreterSelectQuery::executeImpl(TPipeline & pipeline, const BlockInputStreamPtr & prepared_input)
                 pipeline.streams.back() = std::make_shared<FilterBlockInputStream>(
                     pipeline.streams.back(), expressions.prewhere_info->prewhere_actions,
                     expressions.prewhere_info->prewhere_column_name, expressions.prewhere_info->remove_prewhere_column);
+
+                // To remove additional columns in dry run
+                // For example, sample column which can be removed in this stage
+                if (expressions.prewhere_info->remove_columns_actions)
+                {
+                    if constexpr (pipeline_with_processors)
+                    {
+                        pipeline.addSimpleTransform([&](const Block & header)
+                        {
+                            return std::make_shared<ExpressionTransform>(header, expressions.prewhere_info->remove_columns_actions);
+                        });
+                    }
+                    else
+                        pipeline.streams.back() = std::make_shared<ExpressionBlockInputStream>(pipeline.streams.back(), expressions.prewhere_info->remove_columns_actions);
+                }
             }
         }
         else
@@ -1561,12 +1576,22 @@ void InterpreterSelectQuery::executeFetchColumns(
             streams = {std::make_shared<NullBlockInputStream>(storage->getSampleBlockForColumns(required_columns))};

             if (query_info.prewhere_info)
+            {
                 streams.back() = std::make_shared<FilterBlockInputStream>(
                     streams.back(),
                     prewhere_info->prewhere_actions,
                     prewhere_info->prewhere_column_name,
                     prewhere_info->remove_prewhere_column);

+                // To remove additional columns
+                // In some cases, we did not read any marks so that the pipeline.streams is empty
+                // Thus, some columns in prewhere are not removed as expected
+                // This leads to mismatched header in distributed table
+                if (query_info.prewhere_info->remove_columns_actions)
+                {
+                    streams.back() = std::make_shared<ExpressionBlockInputStream>(streams.back(), query_info.prewhere_info->remove_columns_actions);
+                }
+            }
         }

     for (auto & stream : streams)
diff --git a/dbms/src/Interpreters/QueryNormalizer.cpp b/dbms/src/Interpreters/QueryNormalizer.cpp
index ffa94f3d700..c2991885cf3 100644
--- a/dbms/src/Interpreters/QueryNormalizer.cpp
+++ b/dbms/src/Interpreters/QueryNormalizer.cpp
@@ -3,7 +3,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/dbms/src/Interpreters/SubqueryForSet.cpp b/dbms/src/Interpreters/SubqueryForSet.cpp
index f6528bf110c..6e0cd540db4 100644
--- a/dbms/src/Interpreters/SubqueryForSet.cpp
+++ b/dbms/src/Interpreters/SubqueryForSet.cpp
@@ -1,5 +1,4 @@
 #include
-#include
 #include
 #include

@@ -7,26 +6,14 @@ namespace DB
 {

 void SubqueryForSet::makeSource(std::shared_ptr<InterpreterSelectWithUnionQuery> & interpreter,
-                                const std::unordered_map<String, String> & name_to_origin)
+                                NamesWithAliases && joined_block_aliases_)
 {
+    joined_block_aliases = std::move(joined_block_aliases_);
     source = std::make_shared<LazyBlockInputStream>(interpreter->getSampleBlock(),
                                                     [interpreter]() mutable { return interpreter->execute().in; });

-    for (const auto & names : name_to_origin)
-        joined_block_aliases.emplace_back(names.second, names.first);
-
     sample_block = source->getHeader();
-    for (const auto & name_with_alias : joined_block_aliases)
-    {
-        if (sample_block.has(name_with_alias.first))
-        {
-            auto pos = sample_block.getPositionByName(name_with_alias.first);
-            auto column = sample_block.getByPosition(pos);
-            sample_block.erase(pos);
-            column.name = name_with_alias.second;
-            sample_block.insert(std::move(column));
-        }
-    }
+    renameColumns(sample_block);
 }

 void SubqueryForSet::renameColumns(Block & block)
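`makeSource()` now just stores the alias pairs and defers to `renameColumns()`, which the header below declares. A toy sketch of that renaming step, with a vector of column names standing in for a real Block:

```cpp
#include <string>
#include <utility>
#include <vector>
#include <iostream>

using NamesWithAliases = std::vector<std::pair<std::string, std::string>>;

// Apply {original_name, alias} pairs to a toy "block header":
// every column whose name matches an original gets its alias.
void renameColumns(std::vector<std::string> & columns, const NamesWithAliases & aliases)
{
    for (const auto & [original, alias] : aliases)
        for (auto & name : columns)
            if (name == original)
                name = alias;
}

int main()
{
    std::vector<std::string> header {"id", "v"};
    NamesWithAliases aliases {{"id", "t2.id"}, {"v", "t2.v"}};
    renameColumns(header, aliases);
    for (const auto & name : header)
        std::cout << name << " "; // prints: t2.id t2.v
}
```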
diff --git a/dbms/src/Interpreters/SubqueryForSet.h b/dbms/src/Interpreters/SubqueryForSet.h
index 79d32d836c6..abba7a4ec2f 100644
--- a/dbms/src/Interpreters/SubqueryForSet.h
+++ b/dbms/src/Interpreters/SubqueryForSet.h
@@ -31,7 +31,7 @@ struct SubqueryForSet
     StoragePtr table;

     void makeSource(std::shared_ptr<InterpreterSelectWithUnionQuery> & interpreter,
-                    const std::unordered_map<String, String> & name_to_origin);
+                    NamesWithAliases && joined_block_aliases_);

     Block renamedSampleBlock() const { return sample_block; }
     void renameColumns(Block & block);
diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.cpp b/dbms/src/Interpreters/SyntaxAnalyzer.cpp
index 3419a5baba3..dd0c37c50b5 100644
--- a/dbms/src/Interpreters/SyntaxAnalyzer.cpp
+++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include  /// getSmallestColumn()
 #include
diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.h b/dbms/src/Interpreters/SyntaxAnalyzer.h
index a2187ee2ef0..44fdc61ded3 100644
--- a/dbms/src/Interpreters/SyntaxAnalyzer.h
+++ b/dbms/src/Interpreters/SyntaxAnalyzer.h
@@ -1,7 +1,7 @@
 #pragma once

+#include
 #include
-#include
 #include
 #include

@@ -11,6 +11,9 @@ namespace DB
 NameSet removeDuplicateColumns(NamesAndTypesList & columns);

 class ASTFunction;
+class AnalyzedJoin;
+class Context;
+struct SelectQueryOptions;

 struct SyntaxAnalyzerResult
 {
diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp
index 3793f2f79c9..c54a8c7b82e 100644
--- a/dbms/src/Interpreters/executeQuery.cpp
+++ b/dbms/src/Interpreters/executeQuery.cpp
@@ -31,6 +31,7 @@
 #include
 #include

+#include <Common/SensitiveDataMasker.h>
 #include
 #include

@@ -76,7 +77,7 @@ static String prepareQueryForLogging(const String & query, Context & context)

     // wiping sensitive data before cropping query by log_queries_cut_to_length,
     // otherwise something like credit card without last digit can go to log
-    if (auto masker = context.getSensitiveDataMasker())
+    if (auto masker = SensitiveDataMasker::getInstance())
     {
         auto matches = masker->wipeSensitiveData(res);
         if (matches > 0)
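The comment in `prepareQueryForLogging()` is worth spelling out: masking must run before the query is cropped to `log_queries_cut_to_length`, or a cropped secret (e.g. a card number missing its last digit) could slip past the regexps into the log. A hedged sketch of that ordering, with a fixed literal standing in for the regexp rules:

```cpp
#include <string>
#include <iostream>

// Toy version of the two-step preparation: wipe first, crop second.
std::string prepareForLogging(std::string query, size_t cut_to_length)
{
    // 1. Masking first (stand-in for SensitiveDataMasker::wipeSensitiveData).
    const std::string secret = "4111111111111111";
    for (size_t pos; (pos = query.find(secret)) != std::string::npos;)
        query.replace(pos, secret.size(), "[hidden]");

    // 2. Cropping second; doing this first could leave a truncated secret
    //    that no longer matches any masking rule.
    if (query.size() > cut_to_length)
        query = query.substr(0, cut_to_length) + "... (truncated)";
    return query;
}

int main()
{
    std::cout << prepareForLogging("SELECT '4111111111111111' AS card", 64) << "\n";
    // SELECT '[hidden]' AS card
}
```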
diff --git a/dbms/src/Interpreters/interpretSubquery.cpp b/dbms/src/Interpreters/interpretSubquery.cpp
index 76b570ecdb9..82545d4b3be 100644
--- a/dbms/src/Interpreters/interpretSubquery.cpp
+++ b/dbms/src/Interpreters/interpretSubquery.cpp
@@ -18,6 +18,19 @@ namespace DB
 std::shared_ptr<InterpreterSelectWithUnionQuery> interpretSubquery(
     const ASTPtr & table_expression, const Context & context, size_t subquery_depth, const Names & required_source_columns)
 {
+    if (auto * expr = table_expression->as<ASTTableExpression>())
+    {
+        ASTPtr table;
+        if (expr->subquery)
+            table = expr->subquery;
+        else if (expr->table_function)
+            table = expr->table_function;
+        else if (expr->database_and_table_name)
+            table = expr->database_and_table_name;
+
+        return interpretSubquery(table, context, subquery_depth, required_source_columns);
+    }
+
     /// Subquery or table name. The name of the table is similar to the subquery `SELECT * FROM t`.
     const auto * subquery = table_expression->as<ASTSubquery>();
     const auto * function = table_expression->as<ASTFunction>();
diff --git a/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.reference b/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.reference
index 10856627eac..86323ec45e8 100644
--- a/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.reference
+++ b/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.reference
@@ -7,4 +7,6 @@
 6
 7
 8
+9
+text_log non empty
 finish
diff --git a/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.sh b/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.sh
index cca4b363b10..0c5bd753f26 100755
--- a/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.sh
+++ b/dbms/tests/queries/0_stateless/00956_sensitive_data_masking.sh
@@ -11,6 +11,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 cur_name=$(basename "${BASH_SOURCE[0]}")
 tmp_file=${CLICKHOUSE_TMP}/$cur_name"_server.logs"
+tmp_file2=${CLICKHOUSE_TMP}/$cur_name"_server.2.logs"
 rm -f $tmp_file >/dev/null 2>&1
 echo 1
@@ -55,9 +56,10 @@ grep 'TOPSECRET' $tmp_file && echo 'fail 4b'

 echo 5
 # run in background
+rm -f $tmp_file2 >/dev/null 2>&1
 bash -c "$CLICKHOUSE_CLIENT \
   --query=\"select sleepEachRow(0.5) from numbers(4) where ignore('find_me_TOPSECRET=TOPSECRET')=0 and ignore('fwerkh_that_magic_string_make_me_unique') = 0 FORMAT Null\" \
-  --log_queries=1 --ignore-error --multiquery 2>&1 | grep TOPSECRET" &
+  --log_queries=1 --ignore-error --multiquery >$tmp_file2 2>&1" &

 sleep 0.1
@@ -67,12 +69,14 @@ rm -f $tmp_file >/dev/null 2>&1
 echo '5.1'
 # check that executing query doesn't expose secrets in processlist
 $CLICKHOUSE_CLIENT --query="SHOW PROCESSLIST" --log_queries=0 >$tmp_file 2>&1
-
-grep 'fwerkh_that_magic_string_make_me_unique' $tmp_file >/dev/null || echo 'fail 5a'
-( grep 'fwerkh_that_magic_string_make_me_unique' $tmp_file | grep 'find_me_\[hidden\]' $tmp_file >/dev/null ) || echo 'fail 5b'
-grep 'TOPSECRET' $tmp_file && echo 'fail 5c'
-
 wait

+grep 'TOPSECRET' $tmp_file2 && echo 'fail 5d'
+
+rm -f $tmp_file2 >/dev/null 2>&1
+
+grep 'fwerkh_that_magic_string_make_me_unique' $tmp_file >$tmp_file2 || echo 'fail 5a'
+grep 'find_me_\[hidden\]' $tmp_file2 >/dev/null || echo 'fail 5b'
+grep 'TOPSECRET' $tmp_file && echo 'fail 5c'

 # instead of disabling send_logs_level=trace (enabled globally for that test) - redir its output to /dev/null
@@ -107,4 +111,15 @@ drop table sensitive;" --log_queries=1 --ignore-error --multiquery >$tmp_file 2>&1
 grep 'find_me_\[hidden\]' $tmp_file >/dev/null || echo 'fail 8a'
 grep 'TOPSECRET' $tmp_file && echo 'fail 8b'

-echo 'finish'
+$CLICKHOUSE_CLIENT --query="SYSTEM FLUSH LOGS" --server_logs_file=/dev/null
+sleep 0.1;
+
+echo 9
+$CLICKHOUSE_CLIENT \
+  --server_logs_file=/dev/null \
+  --query="SELECT if( count() > 0, 'text_log non empty', 'text_log empty') FROM system.text_log WHERE event_time>now() - 60 and message like '%find_me%';
+    select * from system.text_log where event_time>now() - 60 and message like '%TOPSECRET=TOPSECRET%';" --ignore-error --multiquery
+
+echo 'finish'
+rm -f $tmp_file >/dev/null 2>&1
+rm -f $tmp_file2 >/dev/null 2>&1
diff --git a/dbms/tests/queries/0_stateless/00975_sample_prewhere.reference b/dbms/tests/queries/0_stateless/00975_sample_prewhere.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/dbms/tests/queries/0_stateless/00975_sample_prewhere.sql b/dbms/tests/queries/0_stateless/00975_sample_prewhere.sql
new file mode 100644 index
00000000000..a64eb5d0edf --- /dev/null +++ b/dbms/tests/queries/0_stateless/00975_sample_prewhere.sql @@ -0,0 +1,7 @@ +create table if not exists sample_prewhere (date Date, id Int32, time Int64) engine = MergeTree partition by date order by (id, time, intHash64(time)) sample by intHash64(time); + +insert into sample_prewhere values ('2019-01-01', 2, toDateTime('2019-07-20 00:00:01')); +insert into sample_prewhere values ('2019-01-01', 1, toDateTime('2019-07-20 00:00:02')); +insert into sample_prewhere values ('2019-01-02', 3, toDateTime('2019-07-20 00:00:03')); + +select id from remote('127.0.0.{1,3}', currentDatabase(), sample_prewhere) SAMPLE 1 where toDateTime(time) = '2019-07-20 00:00:00'; diff --git a/debian/changelog b/debian/changelog index f1db1b81185..563be7c48eb 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (19.13.1.1) unstable; urgency=low +clickhouse (19.15.1.1) unstable; urgency=low * Modified source code - -- clickhouse-release Tue, 23 Jul 2019 11:20:49 +0300 + -- clickhouse-release Fri, 06 Sep 2019 17:58:30 +0300 diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index 68cdf3f0204..9fde85b9fb0 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/" -ARG version=19.13.1.* +ARG version=19.15.1.* RUN apt-get update \ && apt-get install --yes --no-install-recommends \ diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index 934c1921a67..c65a0ddb550 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/" -ARG version=19.13.1.* +ARG version=19.15.1.* ARG gosu_ver=1.10 RUN apt-get update \ diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index 5c2bd25b48c..0106d877feb 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/" -ARG version=19.13.1.* +ARG version=19.15.1.* RUN apt-get update && \ apt-get install -y apt-transport-https dirmngr && \ diff --git a/docs/en/development/build.md b/docs/en/development/build.md index 854562e191d..02cea936c70 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -40,7 +40,7 @@ sudo apt-get install git cmake ninja-build Or cmake3 instead of cmake on older systems. -## Install GCC 8 +## Install GCC 9 There are several ways to do this. @@ -50,18 +50,18 @@ There are several ways to do this. 
sudo apt-get install software-properties-common sudo apt-add-repository ppa:ubuntu-toolchain-r/test sudo apt-get update -sudo apt-get install gcc-8 g++-8 +sudo apt-get install gcc-9 g++-9 ``` ### Install from Sources Look at [utils/ci/build-gcc-from-sources.sh](https://github.com/yandex/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) -## Use GCC 8 for Builds +## Use GCC 9 for Builds ```bash -export CC=gcc-8 -export CXX=g++-8 +export CC=gcc-9 +export CXX=g++-9 ``` ## Install Required Libraries from Packages diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 792e7bce020..ed36e79fbc0 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -11,7 +11,7 @@ The supported formats are: | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [Template](#template) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | | [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | @@ -121,7 +121,7 @@ During parsing, the first and second rows are completely ignored. This format is also available under the name `TSVWithNamesAndTypes`. -## Template {#template} +## Template {#format-template} This format allows to specify a custom format string with placeholders for values with specified escaping rule. diff --git a/docs/en/operations/configuration_files.md b/docs/en/operations/configuration_files.md index 1ee72ab188e..de4bb0a0f7b 100644 --- a/docs/en/operations/configuration_files.md +++ b/docs/en/operations/configuration_files.md @@ -1,10 +1,11 @@ # Configuration Files {#configuration_files} -The main server config file is `config.xml`. It resides in the `/etc/clickhouse-server/` directory. +ClickHouse supports multi-file configuration management. The main server configuration file is `/etc/clickhouse-server/config.xml`. Other files must be in the `/etc/clickhouse-server/config.d` directory. -Individual settings can be overridden in the `*.xml` and `*.conf` files in the `config.d` directory next to the config file. +!!! note "Note" + All the configuration files should be in XML format. Also, they should have the same root element, usually ``. -The `replace` or `remove` attributes can be specified for the elements of these config files. +Some settings specified in the main configuration file can be overridden in other configuration files. The `replace` or `remove` attributes can be specified for the elements of these configuration files. If neither is specified, it combines the contents of elements recursively, replacing values of duplicate children. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 9c711073177..6f6201feb24 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -74,7 +74,7 @@ If `force_primary_key=1`, ClickHouse checks to see if the query has a primary ke ## format_schema -This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) or [Template](https://clickhouse.yandex/docs/en/interfaces/formats/#template). The value depends on the format. 
+This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) or [Template](../../interfaces/formats.md#format-template). The value depends on the format.

 ## fsync_metadata
diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md
index a3646d3d0e9..7c694a1612c 100644
--- a/docs/en/operations/table_engines/mergetree.md
+++ b/docs/en/operations/table_engines/mergetree.md
@@ -313,6 +313,48 @@
 INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4
 INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4

+#### Functions Support
+
+Conditions in the `WHERE` clause contain calls of functions over the columns. If a column is part of some index, ClickHouse tries to use this index when performing the functions. ClickHouse supports a different subset of functions for each index type.
+
+The `set` index can be used with all functions. The function subsets for the other indexes are shown in the table below.
+
+Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter
+----------------------------|-------------|--------|------------|------------|---------------
+[equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔
+[notEquals (!=, <>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔
+[like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗
+[notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗
+[startsWith](../../query_language/functions/string_functions.md#function-startswith) | ✔ | ✔ | ✔ | ✔ | ✗
+[endsWith](../../query_language/functions/string_functions.md#function-endswith) | ✗ | ✗ | ✔ | ✔ | ✗
+[multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✔ | ✗
+[in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔
+[notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔
+[less (<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗
+[greater (>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗
+[lessOrEquals (<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗
+[greaterOrEquals (>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗
+[empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗
+[notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗
+hasToken | ✗ | ✗ | ✗ | ✔ | ✗
+
+Functions with a constant argument that is less than the ngram size can't be used by `ngrambf_v1` for query optimization.
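A hedged illustration of the Bloom-filter columns in the table above (and of the optimization rules spelled out just below): a toy filter, not the real ngrambf_v1/tokenbf_v1 implementation, showing that only a negative membership answer is exact:

```cpp
#include <bitset>
#include <functional>
#include <string>
#include <iostream>

// A Bloom filter answers "definitely absent" or "maybe present",
// never "definitely present". Two toy hash probes per value.
struct BloomFilter
{
    std::bitset<1024> bits;
    void add(const std::string & s)
    {
        bits.set(std::hash<std::string>{}(s) % bits.size());
        bits.set((std::hash<std::string>{}(s) * 31) % bits.size());
    }
    bool mayContain(const std::string & s) const
    {
        return bits.test(std::hash<std::string>{}(s) % bits.size())
            && bits.test((std::hash<std::string>{}(s) * 31) % bits.size());
    }
};

int main()
{
    BloomFilter granule; // filter built over one index granule
    granule.add("test");

    // `s = 'absent'` can skip the granule: a negative answer is exact.
    std::cout << (!granule.mayContain("absent") ? "skip granule\n" : "read granule\n");

    // `s != 'test'` cannot skip: a positive answer may be a false positive,
    // and even a true positive says nothing about the other rows in the granule.
    std::cout << (granule.mayContain("test") ? "read granule\n" : "skip granule\n");
}
```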
+
+Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1` and `bloom_filter` indexes can't be used for optimizing queries where the result of a function is expected to be false, for example:
+
+- Can be optimized:
+    - `s LIKE '%test%'`
+    - `NOT s NOT LIKE '%test%'`
+    - `s = 1`
+    - `NOT s != 1`
+    - `startsWith(s, 'test')`
+- Can't be optimized:
+    - `NOT s LIKE '%test%'`
+    - `s NOT LIKE '%test%'`
+    - `NOT s = 1`
+    - `s != 1`
+    - `NOT startsWith(s, 'test')`
+
 ## Concurrent Data Access

 For concurrent table access, we use multi-versioning. In other words, when a table is simultaneously read and updated, data is read from a set of parts that is current at the time of the query. There are no lengthy locks. Inserts do not get in the way of read operations.
diff --git a/docs/en/query_language/functions/array_functions.md b/docs/en/query_language/functions/array_functions.md
index 94d79a1898b..5065d428994 100644
--- a/docs/en/query_language/functions/array_functions.md
+++ b/docs/en/query_language/functions/array_functions.md
@@ -1,12 +1,12 @@
 # Functions for working with arrays

-## empty
+## empty {#function-empty}

 Returns 1 for an empty array, or 0 for a non-empty array.
 The result type is UInt8.
 The function also works for strings.

-## notEmpty
+## notEmpty {#function-notempty}

 Returns 0 for an empty array, or 1 for a non-empty array.
 The result type is UInt8.
@@ -73,7 +73,7 @@ Get the element with the index `n` from the array `arr`. `n` must be any integer type.
 Indexes in an array begin from one.
 Negative indexes are supported. In this case, it selects the corresponding element numbered from the end. For example, `arr[-1]` is the last item in the array.

-If the index falls outside of the bounds of an array, it returns some default value (0 for numbers, an empty string for strings, etc.).
+If the index falls outside of the bounds of an array, it returns some default value (0 for numbers, an empty string for strings, etc.), except for the case with a non-constant array and a constant index 0 (in this case there will be an error `Array indices are 1-based`).

 ## has(arr, elem)
diff --git a/docs/en/query_language/functions/comparison_functions.md b/docs/en/query_language/functions/comparison_functions.md
index 39987ef2893..337a213673d 100644
--- a/docs/en/query_language/functions/comparison_functions.md
+++ b/docs/en/query_language/functions/comparison_functions.md
@@ -17,17 +17,17 @@

 Strings are compared by bytes. A shorter string is smaller than all strings that start with it and that contain at least one more character.

 Note. Up until version 1.1.54134, signed and unsigned numbers were compared the same way as in C++. In other words, you could get an incorrect result in cases like SELECT 9223372036854775807 > -1. This behavior changed in version 1.1.54134 and is now mathematically correct.
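The note above about pre-1.1.54134 behavior is exactly the C++ pitfall; a short sketch that reproduces it alongside the mathematically correct alternative:

```cpp
#include <cstdint>
#include <iostream>

int main()
{
    std::uint64_t big = 9223372036854775807ULL; // 2^63 - 1
    std::int64_t minus_one = -1;

    // C++ usual arithmetic conversions turn -1 into 2^64 - 1, so the
    // comparison (made explicit with a cast here) gives the "wrong" answer.
    std::cout << (big > static_cast<std::uint64_t>(minus_one)) << "\n"; // 0

    // Mathematically correct comparison, as ClickHouse now implements it:
    // any negative signed value is smaller than any unsigned value.
    bool correct = (minus_one < 0) ? true : (big > static_cast<std::uint64_t>(minus_one));
    std::cout << correct << "\n"; // 1
}
```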
-## equals, a = b and a == b operator
+## equals, a = b and a == b operator {#function-equals}

-## notEquals, a ! operator= b and a `<>` b
+## notEquals, a != b and a `<>` b operator {#function-notequals}

-## less, `< operator`
+## less, `< operator` {#function-less}

-## greater, `> operator`
+## greater, `> operator` {#function-greater}

-## lessOrEquals, `<= operator`
+## lessOrEquals, `<= operator` {#function-lessorequals}

-## greaterOrEquals, `>= operator`
+## greaterOrEquals, `>= operator` {#function-greaterorequals}

 [Original article](https://clickhouse.yandex/docs/en/query_language/functions/comparison_functions/)
diff --git a/docs/en/query_language/functions/in_functions.md b/docs/en/query_language/functions/in_functions.md
index e7b355bb76c..5886dcc3bc7 100644
--- a/docs/en/query_language/functions/in_functions.md
+++ b/docs/en/query_language/functions/in_functions.md
@@ -1,6 +1,6 @@
 # Functions for implementing the IN operator

-## in, notIn, globalIn, globalNotIn
+## in, notIn, globalIn, globalNotIn {#in-functions}

 See the section [IN operators](../select.md#select-in-operators).
diff --git a/docs/en/query_language/functions/string_functions.md b/docs/en/query_language/functions/string_functions.md
index b2ef05d4c3e..1eca9c0e815 100644
--- a/docs/en/query_language/functions/string_functions.md
+++ b/docs/en/query_language/functions/string_functions.md
@@ -145,11 +145,11 @@
 Decode base64-encoded string 's' into original string. In case of failure raises an exception.

 ## tryBase64Decode(s)
 Similar to base64Decode, but in case of error an empty string would be returned.

-## endsWith(s, suffix)
+## endsWith(s, suffix) {#function-endswith}

 Returns 1 if the string ends with the specified suffix, otherwise it returns 0.

-## startsWith(s, prefix)
+## startsWith(s, prefix) {#function-startswith}

 Returns 1 if the string starts with the specified prefix, otherwise it returns 0.
diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index 86f038b6a1e..723b8edc154 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -33,7 +33,7 @@
 Returns the index `i` (starting from 1) of the leftmost found needle<sub>i</sub> in the string `haystack` and 0 otherwise.

 For a case-insensitive search or/and in UTF-8 format use functions `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`.

-## multiSearchAny(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
+## multiSearchAny(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) {#function-multisearchany}

 Returns 1, if at least one string needle<sub>i</sub> matches the string `haystack` and 0 otherwise.
@@ -86,7 +86,7 @@ Extracts a fragment of a string using a regular expression.

 Extracts all the fragments of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the 'extract' function (it takes the first subpattern, or the entire expression if there isn't a subpattern).

-## like(haystack, pattern), haystack LIKE pattern operator
+## like(haystack, pattern), haystack LIKE pattern operator {#function-like}

 Checks whether a string matches a simple regular expression.
 The regular expression can contain the metasymbols `%` and `_`.

 Use the backslash (`\`) for escaping metasymbols.
 See the note on escaping in the description of the 'match' function.

 For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function.
 For other regular expressions, the code is the same as for the 'match' function.

-## notLike(haystack, pattern), haystack NOT LIKE pattern operator
+## notLike(haystack, pattern), haystack NOT LIKE pattern operator {#function-notlike}

 The same thing as 'like', but negative.
diff --git a/docs/fa/interfaces/formats.md b/docs/fa/interfaces/formats.md
index a8c91c73b8a..d01dc1d1be0 100644
--- a/docs/fa/interfaces/formats.md
+++ b/docs/fa/interfaces/formats.md
@@ -12,6 +12,8 @@ Format | INSERT | SELECT
 [TabSeparatedRaw](formats.md#tabseparatedraw) | ✗ | ✔ |
 [TabSeparatedWithNames](formats.md#tabseparatedwithnames) | ✔ | ✔ |
 [TabSeparatedWithNamesAndTypes](formats.md#tabseparatedwithnamesandtypes) | ✔ | ✔ |
+[Template](#format-template) | ✔ | ✔ |
+[TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ |
 [CSV](formats.md#csv) | ✔ | ✔ |
 [CSVWithNames](formats.md#csvwithnames) | ✔ | ✔ |
 [Values](formats.md#data-format-values) | ✔ | ✔ |
@@ -115,6 +117,122 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate

 This format is also available under the name `TSVWithNamesAndTypes`.

+
+## Template {#format-template}
+
+This format allows specifying a custom format string with placeholders for values, with a specified escaping rule.
+
+It uses the settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further).
+
+The format string `format_schema_rows` specifies row format with the following syntax:
+
+ `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`,
+
+ where `delimiter_i` is a delimiter between values (the `$` symbol can be escaped as `$$`),
+ `column_i` is the name of the column whose values are to be selected or inserted (if empty, the column will be skipped),
+ `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported:
+
+ - `CSV`, `JSON`, `XML` (similarly to the formats of the same names)
+ - `Escaped` (similarly to `TSV`)
+ - `Quoted` (similarly to `Values`)
+ - `Raw` (without escaping, similarly to `TSVRaw`)
+ - `None` (no escaping rule, see further)
+
+ If an escaping rule is omitted, then `None` will be used. `XML` and `Raw` are suitable only for output.
+
+ So, for the following format string:
+
+ `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};`
+
+ the values of the `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON`, will be printed (for select) or will be expected (for insert) between the `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. For example:
+
+ `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;`
+
+ The `format_schema_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default).
+
+The format string `format_schema` has the same syntax as `format_schema_rows` and allows specifying a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names:
+
+ - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string.
+ - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS)
+ - `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1)
+ - `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1)
+ - `rows` is the total number of output rows
+ - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT.
+ - `time` is the request execution time in seconds
+ - `rows_read` is the number of rows that have been read
+ - `bytes_read` is the number of bytes (uncompressed) that have been read
+
+ The placeholders `data`, `totals`, `min` and `max` must not have an escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified.
+ If the `format_schema` setting is an empty string, `${data}` is used as the default value.
+ For insert queries, the format allows skipping some columns or fields if a prefix or suffix is specified (see example).
+
+ `Select` example:
+```sql
+SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5
+FORMAT Template
+SETTINGS format_schema = '<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+${data}
+  </table>
+  <table border="1"> <caption>Max</caption>
+${max}
+  </table>
+  <b>Processed ${rows_read:XML} rows in ${time:XML} sec</b>
+ </body>
+</html>',
+format_schema_rows = '<tr> <td>${SearchPhrase:XML}</td> <td>${с:XML}</td> </tr>',
+format_schema_rows_between_delimiter = '\n    '
+```
+```html
+<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+    <tr> <td></td> <td>8267016</td> </tr>
+    <tr> <td>bathroom interior design</td> <td>2166</td> </tr>
+    <tr> <td>yandex</td> <td>1655</td> </tr>
+    <tr> <td>spring 2014 fashion</td> <td>1549</td> </tr>
+    <tr> <td>freeform photos</td> <td>1480</td> </tr>
+  </table>
+  <table border="1"> <caption>Max</caption>
+    <tr> <td>8873898</td> </tr>
+  </table>
+  <b>Processed 3095973 rows in 0.1569913 sec</b>
+ </body>
+</html>
+```
+
+`Insert` example:
+```
+Some header
+Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1
+Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1
+Total rows: 2
+```
+```sql
+INSERT INTO UserActivity FORMAT Template SETTINGS
+format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n',
+format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}'
+```
+`PageViews`, `UserID`, `Duration` and `Sign` inside the placeholders are names of columns in the table. The values after `Useless field` in rows and after `\nTotal rows: ` in the suffix will be ignored.
+All delimiters in the input data must be strictly equal to the delimiters in the specified format strings.
+
+## TemplateIgnoreSpaces {#templateignorespaces}
+
+This format is suitable only for input.
+Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows specifying empty placeholders (`${}` or `${:None}`) to split a delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters.
+It's possible to read `JSON` using this format, if the values of columns have the same order in all rows. For example, the following request can be used for inserting data from the output example of the [JSON](#json) format:
+```sql
+INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS
+format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}',
+format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}',
+format_schema_rows_between_delimiter = ','
+```
+
 ## TSKV

 Similar to the TabSeparated format, but outputs values in name=value form. Names are escaped the same way as in the TabSeparated format, and the `=` symbol is also escaped.
diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md
index 8bfaf8186e0..15f7552f877 100644
--- a/docs/ru/interfaces/formats.md
+++ b/docs/ru/interfaces/formats.md
@@ -10,7 +10,7 @@ ClickHouse can accept (`INSERT`) and return (`SELECT`) data in various formats.
 | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ |
 | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ |
 | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ |
-| [Template](#template) | ✔ | ✔ |
+| [Template](#format-template) | ✔ | ✔ |
 | [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ |
 | [CSV](#csv) | ✔ | ✔ |
 | [CSVWithNames](#csvwithnames) | ✔ | ✔ |
@@ -120,7 +120,7 @@
 world

 This format is also available under the name `TSVWithNamesAndTypes`.

-## Template {#template}
+## Template {#format-template}

 This format allows specifying a custom format string into which values, serialized in the selected way, are substituted.
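To make the Template placeholder syntax documented above concrete, here is a hedged toy renderer for a `format_schema_rows`-style string. It is an illustration only (the escaping rules are reduced to `Quoted`), not the ClickHouse implementation:

```cpp
#include <iostream>
#include <map>
#include <string>

// Copy literal delimiters through; replace each ${name:rule} placeholder
// with the column value. "$$" escapes a literal dollar sign.
std::string renderRow(const std::string & schema, const std::map<std::string, std::string> & row)
{
    std::string out;
    for (size_t i = 0; i < schema.size();)
    {
        if (schema[i] == '$' && i + 1 < schema.size() && schema[i + 1] == '$')
        {
            out += '$';
            i += 2;
        }
        else if (schema[i] == '$' && i + 1 < schema.size() && schema[i + 1] == '{')
        {
            size_t end = schema.find('}', i);
            std::string placeholder = schema.substr(i + 2, end - i - 2); // "name:rule"
            size_t colon = placeholder.find(':');
            std::string name = placeholder.substr(0, colon);
            std::string rule = (colon == std::string::npos) ? "None" : placeholder.substr(colon + 1);
            std::string value = row.at(name);
            out += (rule == "Quoted") ? "'" + value + "'" : value; // other rules left unescaped here
            i = end + 1;
        }
        else
            out += schema[i++];
    }
    return out;
}

int main()
{
    std::map<std::string, std::string> row {{"SearchPhrase", "bathroom interior design"}, {"c", "2166"}};
    std::cout << renderRow("Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped};", row) << "\n";
    // Search phrase: 'bathroom interior design', count: 2166;
}
```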
diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md
index 831f5958c29..c3518eb7f74 100644
--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@@ -72,7 +72,7 @@ ClickHouse применяет настройку в тех случаях, ко
 
 ## format_schema
 
-Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) или [Template](../../interfaces/formats.md#template). Значение параметра зависит от формата.
+Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) или [Template](../../interfaces/formats.md#format-template). Значение параметра зависит от формата.
 
 ## fsync_metadata
 
diff --git a/docs/ru/query_language/functions/array_functions.md b/docs/ru/query_language/functions/array_functions.md
index 7945276434f..11d5c819b02 100644
--- a/docs/ru/query_language/functions/array_functions.md
+++ b/docs/ru/query_language/functions/array_functions.md
@@ -74,7 +74,7 @@ SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res
 
 Индексы в массиве начинаются с единицы. Поддерживаются отрицательные индексы. В этом случае, будет выбран соответствующий по номеру элемент с конца. Например, arr\[-1\] - последний элемент массива.
 
-Если индекс выходит за границы массива, то возвращается некоторое значение по умолчанию (0 для чисел, пустая строка для строк и т. п.).
+Если индекс выходит за границы массива, то возвращается некоторое значение по умолчанию (0 для чисел, пустая строка для строк и т. п.), кроме случая с неконстантным массивом и константным индексом 0 (в этом случае будет ошибка `Array indices are 1-based`).
 
 ## has(arr, elem)
 
diff --git a/docs/ru/query_language/functions/comparison_functions.md b/docs/ru/query_language/functions/comparison_functions.md
index 392c6c2573c..b69e272133b 100644
--- a/docs/ru/query_language/functions/comparison_functions.md
+++ b/docs/ru/query_language/functions/comparison_functions.md
@@ -18,16 +18,16 @@
 
 Замечание. До версии 1.1.54134 сравнение знаковых и беззнаковых целых чисел производилось также, как в C++. То есть, вы могли получить неверный результат в таких случаях: SELECT 9223372036854775807 > -1. С версии 1.1.54134 поведение изменилось и стало математически корректным.
 
-## equals, оператор a = b и a == b
+## equals, оператор a = b и a == b {#function-equals}
 
-## notEquals, оператор a != b и a `<>` b
+## notEquals, оператор a != b и a `<>` b {#function-notequals}
 
-## less, оператор `<`
+## less, оператор `<` {#function-less}
 
-## greater, оператор `>`
+## greater, оператор `>` {#function-greater}
 
-## lessOrEquals, оператор `<=`
+## lessOrEquals, оператор `<=` {#function-lessorequals}
 
-## greaterOrEquals, оператор `>=`
+## greaterOrEquals, оператор `>=` {#function-greaterorequals}
 
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/comparison_functions/)
diff --git a/docs/ru/query_language/functions/in_functions.md b/docs/ru/query_language/functions/in_functions.md
index 7eb87e53a6e..8b4eccd0db6 100644
--- a/docs/ru/query_language/functions/in_functions.md
+++ b/docs/ru/query_language/functions/in_functions.md
@@ -1,6 +1,7 @@
 # Функции для реализации оператора IN.
 
-## in, notIn, globalIn, globalNotIn
+## in, notIn, globalIn, globalNotIn {#in-functions}
+
 Смотрите раздел [Операторы IN](../select.md#select-in-operators).
 
 ## tuple(x, y, ...), оператор (x, y, ...)
diff --git a/docs/ru/query_language/functions/string_functions.md b/docs/ru/query_language/functions/string_functions.md
index f427017ee59..cc6563dacd5 100644
--- a/docs/ru/query_language/functions/string_functions.md
+++ b/docs/ru/query_language/functions/string_functions.md
@@ -117,6 +117,14 @@ SELECT format('{} {}', 'Hello', 'World')
 ## tryBase64Decode(s)
 Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку.
 
+## endsWith(s, suffix) {#function-endswith}
+
+Возвращает 1, если строка завершается указанным суффиксом, и 0 в противном случае.
+
+## startsWith(s, prefix) {#function-startswith}
+
+Возвращает 1, если строка начинается указанным префиксом, и 0 в противном случае.
+
 ## CRC32(s)
 Возвращает чексумму CRC32 данной строки. Тип результата - UInt32.
diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md
index 3fc500bd203..0f86554b552 100644
--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@@ -29,7 +29,7 @@
 
 Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`.
 
-## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n])
+## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n]) {#function-multisearchany}
 
 Возвращает 1, если хотя бы одна подстрока needle_i нашлась в строке `haystack` и 0 иначе.
 
 Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
@@ -77,7 +77,7 @@
 
 ## extractAll(haystack, pattern)
 Извлечение всех фрагментов строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Возвращается массив строк, состоящий из всех соответствий регулярному выражению. В остальном, поведение аналогично функции extract (по прежнему, вынимается первый subpattern, или всё выражение, если subpattern-а нет).
 
-## like(haystack, pattern), оператор haystack LIKE pattern
+## like(haystack, pattern), оператор haystack LIKE pattern {#function-like}
 
 Проверка строки на соответствие простому регулярному выражению.
 Регулярное выражение может содержать метасимволы `%` и `_`.
 
@@ -90,7 +90,7 @@
 
 Для регулярных выражений вида `%needle%` действует более оптимальный код, который работает также быстро, как функция `position`.
 Для остальных регулярных выражений, код аналогичен функции match.
 
-## notLike(haystack, pattern), оператор haystack NOT LIKE pattern
+## notLike(haystack, pattern), оператор haystack NOT LIKE pattern {#function-notlike}
 
 То же, что like, но с отрицанием.
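Editor's note: the out-of-range behavior documented in the docs/ru/query_language/functions/array_functions.md hunk above is easy to check interactively; a minimal sketch, with the erroring case left commented out:

```sql
-- Out-of-range indices return the element type's default value:
SELECT arrayElement([1, 2, 3], 5) AS num, arrayElement(['a', 'b'], -10) AS str
-- num = 0, str = ''

-- The documented exception: a non-constant array with constant index 0.
-- materialize() turns the constant array into a regular (non-constant) column,
-- so the following query raises `Array indices are 1-based`:
-- SELECT arrayElement(materialize([1, 2, 3]), 0)
```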
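Editor's note: the `endsWith`/`startsWith` entries added to docs/ru/query_language/functions/string_functions.md above ship without a usage sample; a minimal sketch:

```sql
-- Both functions return UInt8: 1 on match, 0 otherwise.
SELECT
    startsWith('Hello, world!', 'He') AS starts_hit,   -- 1
    startsWith('Hello, world!', 'lo') AS starts_miss,  -- 0
    endsWith('Hello, world!', 'd!')   AS ends_hit,     -- 1
    endsWith('Hello, world!', 'He')   AS ends_miss     -- 0
```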
 ## ngramDistance(haystack, needle)
diff --git a/docs/zh/interfaces/formats.md b/docs/zh/interfaces/formats.md
index 65358115295..e4663c2d418 100644
--- a/docs/zh/interfaces/formats.md
+++ b/docs/zh/interfaces/formats.md
@@ -10,6 +10,8 @@ ClickHouse 可以接受多种数据格式,可以在 (`INSERT`) 以及 (`SELECT
 | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ |
 | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ |
 | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ |
+| [Template](#format-template) | ✔ | ✔ |
+| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ |
 | [CSV](#csv) | ✔ | ✔ |
 | [CSVWithNames](#csvwithnames) | ✔ | ✔ |
 | [Values](#data-format-values) | ✔ | ✔ |
@@ -115,6 +117,121 @@ world
 这种格式也可以使用名称 ` TSVWithNamesAndTypes` 来表示。
 
+## Template {#format-template}
+
+This format allows specifying a custom format string with placeholders for values, each with a specified escaping rule.
+
+It uses the settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping; see below).
+
+The format string `format_schema_rows` specifies the row format with the following syntax:
+
+ `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`,
+
+ where `delimiter_i` is a delimiter between values (the `$` symbol can be escaped as `$$`),
+ `column_i` is the name of a column whose values are to be selected or inserted (if empty, the column will be skipped),
+ `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported:
+
+ - `CSV`, `JSON`, `XML` (similar to the formats of the same names)
+ - `Escaped` (similar to `TSV`)
+ - `Quoted` (similar to `Values`)
+ - `Raw` (without escaping, similar to `TSVRaw`)
+ - `None` (no escaping rule; see below)
+
+ If an escaping rule is omitted, `None` is used. `XML` and `Raw` are suitable only for output.
+
+ So, for the following format string:
+
+ `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};`
+
+ the values of the `SearchPhrase`, `c` and `price` columns, escaped as `Quoted`, `Escaped` and `JSON` respectively, will be printed (for SELECT) or expected (for INSERT) between the delimiters `Search phrase: `, `, count: `, `, ad price: $` and `;`. For example:
+
+ `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;`
+
+ The `format_schema_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default).
+
+The format string `format_schema` has the same syntax as `format_schema_rows` and allows specifying a prefix, a suffix and a way to print additional information. It contains the following placeholders instead of column names:
+
+ - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string.
+ - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS)
+ - `min` is the row with minimum values in `format_schema_rows` format (when `extremes` is set to 1)
+ - `max` is the row with maximum values in `format_schema_rows` format (when `extremes` is set to 1)
+ - `rows` is the total number of output rows
+ - `rows_before_limit` is the minimal number of rows there would have been without LIMIT; output only if the query contains LIMIT. If the query contains GROUP BY, `rows_before_limit_at_least` is the exact number of rows there would have been without LIMIT.
+ - `time` is the request execution time in seconds
+ - `rows_read` is the number of rows that have been read
+ - `bytes_read` is the number of bytes (uncompressed) that have been read
+
+ The placeholders `data`, `totals`, `min` and `max` must not have an escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified.
+ If the `format_schema` setting is an empty string, `${data}` is used as the default value.
+ For insert queries, the format allows skipping some columns or fields if a prefix or suffix is specified (see the example below).
+
+ `Select` example:
+```sql
+SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5
+FORMAT Template
+SETTINGS format_schema = '<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+${data}
+  </table>
+  <table border="1"> <caption>Max</caption>
+${max}
+  </table>
+  <b>Processed ${rows_read:XML} rows in ${time:XML} sec</b>
+ </body>
+</html>',
+format_schema_rows = '<tr> <td>${SearchPhrase:XML}</td> <td>${c:XML}</td> </tr>',
+format_schema_rows_between_delimiter = '\n    '
+```
+```html
+<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+    <tr> <td></td> <td>8267016</td> </tr>
+    <tr> <td>bathroom interior design</td> <td>2166</td> </tr>
+    <tr> <td>yandex</td> <td>1655</td> </tr>
+    <tr> <td>spring 2014 fashion</td> <td>1549</td> </tr>
+    <tr> <td>freeform photos</td> <td>1480</td> </tr>
+  </table>
+  <table border="1"> <caption>Max</caption>
+    <tr> <td></td> <td>8873898</td> </tr>
+  </table>
+  <b>Processed 3095973 rows in 0.1569913 sec</b>
+ </body>
+</html>
+```
+
+`Insert` example:
+```
+Some header
+Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1
+Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1
+Total rows: 2
+```
+```sql
+INSERT INTO UserActivity FORMAT Template SETTINGS
+format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n',
+format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}'
+```
+`PageViews`, `UserID`, `Duration` and `Sign` inside the placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in the suffix will be ignored.
+All delimiters in the input data must strictly match the delimiters in the specified format strings.
+
+## TemplateIgnoreSpaces {#templateignorespaces}
+
+This format is suitable only for input.
+Similar to `Template`, but it skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. This format also allows specifying empty placeholders (`${}` or `${:None}`) to split a delimiter into separate parts in order to ignore spaces between them. Such placeholders are used only for skipping whitespace characters.
+It's possible to read `JSON` using this format if the values of columns have the same order in all rows. For example, the following request can be used for inserting data from the output example of the [JSON](#json) format:
+```sql
+INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS
+format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}',
+format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}',
+format_schema_rows_between_delimiter = ','
+```
+
 ## TSKV {#tskv}
 
 与 `TabSeparated` 格式类似,但它输出的是 `name=value` 的格式。名称会和 `TabSeparated` 格式一样被转义,`=` 字符也会被转义。
diff --git a/libs/libloggers/loggers/Loggers.cpp b/libs/libloggers/loggers/Loggers.cpp
index eab541dd2ee..cf966f620e2 100644
--- a/libs/libloggers/loggers/Loggers.cpp
+++ b/libs/libloggers/loggers/Loggers.cpp
@@ -167,14 +167,6 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log
     logger.root().get(level).setLevel(config.getString("logger.levels." + level, "trace"));
 }
 
-void Loggers::setLoggerSensitiveDataMasker(Poco::Logger & logger, DB::SensitiveDataMasker * sensitive_data_masker)
-{
-    if (auto split = dynamic_cast<DB::OwnSplitChannel *>(logger.getChannel()))
-    {
-        split->setMasker(sensitive_data_masker);
-    }
-}
-
 void Loggers::closeLogs(Poco::Logger & logger)
 {
     if (log_file)
diff --git a/libs/libloggers/loggers/Loggers.h b/libs/libloggers/loggers/Loggers.h
index 49135afadc6..525cab1e649 100644
--- a/libs/libloggers/loggers/Loggers.h
+++ b/libs/libloggers/loggers/Loggers.h
@@ -11,18 +11,10 @@ namespace Poco::Util
 class AbstractConfiguration;
 }
 
-namespace DB
-{
-    class SensitiveDataMasker;
-}
-
-
 class Loggers
 {
 public:
     void buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Logger & logger, const std::string & cmd_name = "");
 
-    void setLoggerSensitiveDataMasker(Poco::Logger & logger, DB::SensitiveDataMasker * sensitive_data_masker);
-
     /// Close log files. On next log write files will be reopened.
     void closeLogs(Poco::Logger & logger);
diff --git a/libs/libloggers/loggers/OwnSplitChannel.cpp b/libs/libloggers/loggers/OwnSplitChannel.cpp
index 69e540e1d21..467d7eb4fc2 100644
--- a/libs/libloggers/loggers/OwnSplitChannel.cpp
+++ b/libs/libloggers/loggers/OwnSplitChannel.cpp
@@ -9,7 +9,7 @@
 #include
 #include
 #include
-
+#include <Common/SensitiveDataMasker.h>
 
 namespace DB
 {
@@ -20,7 +20,7 @@ void OwnSplitChannel::log(const Poco::Message & msg)
     if (channels.empty() && (logs_queue == nullptr || msg.getPriority() > logs_queue->max_priority))
         return;
 
-    if (auto masker = sensitive_data_masker.load())
+    if (auto masker = SensitiveDataMasker::getInstance())
     {
         auto message_text = msg.getText();
         auto matches = masker->wipeSensitiveData(message_text);
@@ -31,6 +31,7 @@
         }
     }
 
+
     logSplit(msg);
 }
@@ -100,11 +101,6 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg)
 }
 
-void OwnSplitChannel::setMasker(DB::SensitiveDataMasker * _sensitive_data_masker)
-{
-    sensitive_data_masker.store(_sensitive_data_masker);
-}
-
 void OwnSplitChannel::addChannel(Poco::AutoPtr<Poco::Channel> channel)
 {
     channels.emplace_back(std::move(channel), dynamic_cast<ExtendedLogChannel *>(channel.get()));
diff --git a/libs/libloggers/loggers/OwnSplitChannel.h b/libs/libloggers/loggers/OwnSplitChannel.h
index 4b4da639b97..f475b46a72e 100644
--- a/libs/libloggers/loggers/OwnSplitChannel.h
+++ b/libs/libloggers/loggers/OwnSplitChannel.h
@@ -4,7 +4,6 @@
 #include
 #include
 #include "ExtendedLogChannel.h"
-#include
 #include
 
@@ -18,9 +17,6 @@ class OwnSplitChannel : public Poco::Channel
 public:
     /// Makes an extended message from msg and passes it to the client logs queue and child (if possible)
     void log(const Poco::Message & msg) override;
-
-    void setMasker(DB::SensitiveDataMasker * _sensitive_data_masker);
-
     /// Adds a child channel
     void addChannel(Poco::AutoPtr<Poco::Channel> channel);
 
@@ -33,9 +29,9 @@ private:
     /// Handler and its pointer casted to extended interface
     using ExtendedChannelPtrPair = std::pair<Poco::Channel *, ExtendedLogChannel *>;
     std::vector<ExtendedChannelPtrPair> channels;
 
-    std::atomic<SensitiveDataMasker *> sensitive_data_masker = nullptr; // global context owns that object, pointer should be reset before context destroying.
     std::mutex text_log_mutex;
+
     std::weak_ptr<TextLog> text_log;
 };
diff --git a/utils/github/__main__.py b/utils/github/__main__.py
index 836ad734fce..dd8ed692e1d 100644
--- a/utils/github/__main__.py
+++ b/utils/github/__main__.py
@@ -47,14 +47,14 @@ parser.add_argument('--login', type=str,
 
 args = parser.parse_args()
 
-github = query.Query(args.token, 50)
+github = query.Query(args.token, 30)
 repo = local.Local(args.repo, args.remote, github.get_default_branch())
 
 stables = repo.get_stables()[-args.number:] # [(branch name, base)]
 if not stables:
-    sys.exit('No stable branches found!')
+    sys.exit('No release branches found!')
 else:
-    print('Found stable branches:')
+    print('Found release branches:')
     for stable in stables:
         print(f'{CHECK_MARK} {stable[0]} forked from {stable[1]}')
@@ -171,3 +171,8 @@ print(f'{CHECK_MARK} - good')
 print(f'{CROSS_MARK} - bad')
 print(f'{LABEL_MARK} - backport is detected via label')
 print(f'{CLOCK_MARK} - backport is waiting to merge')
+
+# print API costs
+print('\nGitHub API total costs per query:')
+for name, value in github.api_costs.items():
+    print(f'{name} : {value}')
diff --git a/utils/github/query.py b/utils/github/query.py
index dae4fd062d5..5810e3bb40a 100644
--- a/utils/github/query.py
+++ b/utils/github/query.py
@@ -11,9 +11,9 @@ class Query:
         self._token = token
         self._max_page_size = max_page_size
         self._min_page_size = min_page_size
+        self.api_costs = {}
 
     _MEMBERS = '''
-    {{
         organization(login: "{organization}") {{
             team(slug: "{team}") {{
                 members(first: {max_page_size} {next}) {{
                 }}
             }}
         }}
     }}
-    }}
     '''
     def get_members(self, organization, team):
         '''Get all team members for organization
@@ -58,7 +57,6 @@
         return logins
 
     _LABELS = '''
-    {{
         repository(owner: "yandex" name: "ClickHouse") {{
             pullRequest(number: {number}) {{
                 labels(first: {max_page_size} {next}) {{
                 }}
             }}
         }}
     }}
-    }}
     '''
     def get_labels(self, pull_request):
         '''Fetchs all labels for given pull-request
@@ -102,7 +99,6 @@
         return labels
 
     _TIMELINE = '''
-    {{
         repository(owner: "yandex" name: "ClickHouse") {{
             pullRequest(number: {number}) {{
                 timeline(first: {max_page_size} {next}) {{
                 }}
             }}
         }}
     }}
-    }}
     '''
     def get_timeline(self, pull_request):
         '''Fetchs all cross-reference events from pull-request's timeline
@@ -169,7 +164,6 @@
         return events
 
     _PULL_REQUESTS = '''
-    {{
         repository(owner: "yandex" name: "ClickHouse") {{
             defaultBranchRef {{
                 name
@@ -248,7 +242,6 @@
             }}
         }}
     }}
-    }}
     '''
     def get_pull_requests(self, before_commit, login):
         '''Get all merged pull-requests from the HEAD of default branch to the last commit (excluding)
@@ -294,13 +287,11 @@
         return pull_requests
 
     _DEFAULT = '''
-    {
     repository(owner: "yandex", name: "ClickHouse") {
         defaultBranchRef {
             name
         }
     }
-    }
     '''
     def get_default_branch(self):
         '''Get short name of the default branch
@@ -334,11 +325,27 @@
             return session
 
         headers = {'Authorization': f'bearer {self._token}'}
+        query = f'''
+        {{
+            {query}
+            rateLimit {{
+                cost
+                remaining
+            }}
+        }}
+        '''
         request = requests_retry_session().post('https://api.github.com/graphql', json={'query': query}, headers=headers)
         if request.status_code == 200:
             result = request.json()
             if 'errors' in result:
                 raise Exception(f'Errors occured: {result["errors"]}')
+
+            import inspect
+            caller = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
+            if caller not in self.api_costs.keys():
+                self.api_costs[caller] = 0
+            self.api_costs[caller] += result['data']['rateLimit']['cost']
+
             return result['data']
         else:
             import json
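Editor's note: a hypothetical usage sketch (not part of this diff) of the per-caller cost accounting added to utils/github/query.py above; the token value is elided and the import assumes utils/github is on sys.path:

```python
# Every request sent through Query's HTTP helper is wrapped in a rateLimit
# block, and its GraphQL cost is accumulated in api_costs under the name of
# the calling method (recovered via inspect).
from query import Query

github = Query('<token>', 30)   # token elided; 30 matches the page size used in __main__.py
github.get_default_branch()     # issues one wrapped GraphQL query
print(github.api_costs)         # e.g. {'get_default_branch': 1}
```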